diff options
Diffstat (limited to 'fs')
221 files changed, 6945 insertions, 4630 deletions
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index a89f3cfe3c7d..c202930086ed 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -333,10 +333,6 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, goto err_names; init_rwsem(&v9ses->rename_sem); - rc = bdi_setup_and_register(&v9ses->bdi, "9p"); - if (rc) - goto err_names; - v9ses->uid = INVALID_UID; v9ses->dfltuid = V9FS_DEFUID; v9ses->dfltgid = V9FS_DEFGID; @@ -345,7 +341,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, if (IS_ERR(v9ses->clnt)) { rc = PTR_ERR(v9ses->clnt); p9_debug(P9_DEBUG_ERROR, "problem initializing 9p client\n"); - goto err_bdi; + goto err_names; } v9ses->flags = V9FS_ACCESS_USER; @@ -415,8 +411,6 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, err_clnt: p9_client_destroy(v9ses->clnt); -err_bdi: - bdi_destroy(&v9ses->bdi); err_names: kfree(v9ses->uname); kfree(v9ses->aname); @@ -445,8 +439,6 @@ void v9fs_session_close(struct v9fs_session_info *v9ses) kfree(v9ses->uname); kfree(v9ses->aname); - bdi_destroy(&v9ses->bdi); - spin_lock(&v9fs_sessionlist_lock); list_del(&v9ses->slist); spin_unlock(&v9fs_sessionlist_lock); diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h index 443d12e02043..76eaf49abd3a 100644 --- a/fs/9p/v9fs.h +++ b/fs/9p/v9fs.h @@ -114,7 +114,6 @@ struct v9fs_session_info { kuid_t uid; /* if ACCESS_SINGLE, the uid that has access */ struct p9_client *clnt; /* 9p client */ struct list_head slist; /* list of sessions registered with v9fs */ - struct backing_dev_info bdi; struct rw_semaphore rename_sem; }; diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index de3ed8629196..a0965fb587a5 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -72,10 +72,12 @@ static int v9fs_set_super(struct super_block *s, void *data) * */ -static void +static int v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses, int flags, void *data) { + int ret; + sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_blocksize_bits = fls(v9ses->maxdata - 1); sb->s_blocksize = 1 << sb->s_blocksize_bits; @@ -85,7 +87,11 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses, sb->s_xattr = v9fs_xattr_handlers; } else sb->s_op = &v9fs_super_ops; - sb->s_bdi = &v9ses->bdi; + + ret = super_setup_bdi(sb); + if (ret) + return ret; + if (v9ses->cache) sb->s_bdi->ra_pages = (VM_MAX_READAHEAD * 1024)/PAGE_SIZE; @@ -99,6 +105,7 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses, #endif save_mount_options(sb, data); + return 0; } /** @@ -138,7 +145,9 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags, retval = PTR_ERR(sb); goto clunk_fid; } - v9fs_fill_super(sb, v9ses, flags, data); + retval = v9fs_fill_super(sb, v9ses, flags, data); + if (retval) + goto release_sb; if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) sb->s_d_op = &v9fs_cached_dentry_operations; diff --git a/fs/afs/internal.h b/fs/afs/internal.h index a6901360fb81..393672997cc2 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -318,7 +318,6 @@ struct afs_volume { unsigned short rjservers; /* number of servers discarded due to -ENOMEDIUM */ struct afs_server *servers[8]; /* servers on which volume resides (ordered) */ struct rw_semaphore server_sem; /* lock for accessing current server */ - struct backing_dev_info bdi; }; /* diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 8f76b13d5549..d5990eb160bd 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -419,7 +419,7 @@ error_do_abort: call->state = AFS_CALL_COMPLETE; if (ret != -ECONNABORTED) { rxrpc_kernel_abort_call(afs_socket, rxcall, RX_USER_ABORT, - -ret, "KSD"); + ret, "KSD"); } else { abort_code = 0; offset = 0; @@ -478,12 +478,12 @@ static void afs_deliver_to_call(struct afs_call *call) case -ENOTCONN: abort_code = RX_CALL_DEAD; rxrpc_kernel_abort_call(afs_socket, call->rxcall, - abort_code, -ret, "KNC"); + abort_code, ret, "KNC"); goto save_error; case -ENOTSUPP: abort_code = RXGEN_OPCODE; rxrpc_kernel_abort_call(afs_socket, call->rxcall, - abort_code, -ret, "KIV"); + abort_code, ret, "KIV"); goto save_error; case -ENODATA: case -EBADMSG: @@ -493,7 +493,7 @@ static void afs_deliver_to_call(struct afs_call *call) if (call->state != AFS_CALL_AWAIT_REPLY) abort_code = RXGEN_SS_UNMARSHAL; rxrpc_kernel_abort_call(afs_socket, call->rxcall, - abort_code, EBADMSG, "KUM"); + abort_code, -EBADMSG, "KUM"); goto save_error; } } @@ -754,7 +754,7 @@ void afs_send_empty_reply(struct afs_call *call) case -ENOMEM: _debug("oom"); rxrpc_kernel_abort_call(afs_socket, call->rxcall, - RX_USER_ABORT, ENOMEM, "KOO"); + RX_USER_ABORT, -ENOMEM, "KOO"); default: _leave(" [error]"); return; @@ -792,7 +792,7 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len) if (n == -ENOMEM) { _debug("oom"); rxrpc_kernel_abort_call(afs_socket, call->rxcall, - RX_USER_ABORT, ENOMEM, "KOO"); + RX_USER_ABORT, -ENOMEM, "KOO"); } _leave(" [error]"); } diff --git a/fs/afs/super.c b/fs/afs/super.c index fbdb022b75a2..c79633e5cfd8 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -319,7 +319,10 @@ static int afs_fill_super(struct super_block *sb, sb->s_blocksize_bits = PAGE_SHIFT; sb->s_magic = AFS_FS_MAGIC; sb->s_op = &afs_super_ops; - sb->s_bdi = &as->volume->bdi; + ret = super_setup_bdi(sb); + if (ret) + return ret; + sb->s_bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE; strlcpy(sb->s_id, as->volume->vlocation->vldb.name, sizeof(sb->s_id)); /* allocate the root inode and dentry */ diff --git a/fs/afs/volume.c b/fs/afs/volume.c index 546f9d01710b..db73d6dad02b 100644 --- a/fs/afs/volume.c +++ b/fs/afs/volume.c @@ -106,11 +106,6 @@ struct afs_volume *afs_volume_lookup(struct afs_mount_params *params) volume->cell = params->cell; volume->vid = vlocation->vldb.vid[params->type]; - volume->bdi.ra_pages = VM_MAX_READAHEAD*1024/PAGE_SIZE; - ret = bdi_setup_and_register(&volume->bdi, "afs"); - if (ret) - goto error_bdi; - init_rwsem(&volume->server_sem); /* look up all the applicable server records */ @@ -156,8 +151,6 @@ error: return ERR_PTR(ret); error_discard: - bdi_destroy(&volume->bdi); -error_bdi: up_write(¶ms->cell->vl_sem); for (loop = volume->nservers - 1; loop >= 0; loop--) @@ -207,7 +200,6 @@ void afs_put_volume(struct afs_volume *volume) for (loop = volume->nservers - 1; loop >= 0; loop--) afs_put_server(volume->servers[loop]); - bdi_destroy(&volume->bdi); kfree(volume); _leave(" [destroyed]"); diff --git a/fs/autofs4/Kconfig b/fs/autofs4/Kconfig index 1204d6384d39..44727bf18297 100644 --- a/fs/autofs4/Kconfig +++ b/fs/autofs4/Kconfig @@ -7,7 +7,7 @@ config AUTOFS4_FS automounter (amd), which is a pure user space daemon. To use the automounter you need the user-space tools from - <ftp://ftp.kernel.org/pub/linux/daemons/autofs/v4/>; you also + <https://www.kernel.org/pub/linux/daemons/autofs/v4/>; you also want to answer Y to "NFS file system support", below. To compile this support as a module, choose M here: the module will be diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index c500e954debb..63e7c4760bfb 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -58,6 +58,7 @@ static struct dentry *befs_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type); static struct dentry *befs_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len, int fh_type); +static struct dentry *befs_get_parent(struct dentry *child); static const struct super_operations befs_sops = { .alloc_inode = befs_alloc_inode, /* allocate a new inode */ @@ -93,6 +94,7 @@ static const struct address_space_operations befs_symlink_aops = { static const struct export_operations befs_export_operations = { .fh_to_dentry = befs_fh_to_dentry, .fh_to_parent = befs_fh_to_parent, + .get_parent = befs_get_parent, }; /* @@ -667,6 +669,19 @@ static struct dentry *befs_fh_to_parent(struct super_block *sb, befs_nfs_get_inode); } +static struct dentry *befs_get_parent(struct dentry *child) +{ + struct inode *parent; + struct befs_inode_info *befs_ino = BEFS_I(d_inode(child)); + + parent = befs_iget(child->d_sb, + (unsigned long)befs_ino->i_parent.start); + if (IS_ERR(parent)) + return ERR_CAST(parent); + + return d_obtain_alias(parent); +} + enum { Opt_uid, Opt_gid, Opt_charset, Opt_debug, Opt_err, }; diff --git a/fs/block_dev.c b/fs/block_dev.c index 2eca00ec4370..2a305c1a2d88 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -18,6 +18,7 @@ #include <linux/module.h> #include <linux/blkpg.h> #include <linux/magic.h> +#include <linux/dax.h> #include <linux/buffer_head.h> #include <linux/swap.h> #include <linux/pagevec.h> @@ -103,12 +104,11 @@ void invalidate_bdev(struct block_device *bdev) { struct address_space *mapping = bdev->bd_inode->i_mapping; - if (mapping->nrpages == 0) - return; - - invalidate_bh_lrus(); - lru_add_drain_all(); /* make sure all lru add caches are flushed */ - invalidate_mapping_pages(mapping, 0, -1); + if (mapping->nrpages) { + invalidate_bh_lrus(); + lru_add_drain_all(); /* make sure all lru add caches are flushed */ + invalidate_mapping_pages(mapping, 0, -1); + } /* 99% of the time, we don't need to flush the cleancache on the bdev. * But, for the strange corners, lets be cautious */ @@ -717,50 +717,18 @@ int bdev_write_page(struct block_device *bdev, sector_t sector, } EXPORT_SYMBOL_GPL(bdev_write_page); -/** - * bdev_direct_access() - Get the address for directly-accessibly memory - * @bdev: The device containing the memory - * @dax: control and output parameters for ->direct_access - * - * If a block device is made up of directly addressable memory, this function - * will tell the caller the PFN and the address of the memory. The address - * may be directly dereferenced within the kernel without the need to call - * ioremap(), kmap() or similar. The PFN is suitable for inserting into - * page tables. - * - * Return: negative errno if an error occurs, otherwise the number of bytes - * accessible at this address. - */ -long bdev_direct_access(struct block_device *bdev, struct blk_dax_ctl *dax) +int bdev_dax_pgoff(struct block_device *bdev, sector_t sector, size_t size, + pgoff_t *pgoff) { - sector_t sector = dax->sector; - long avail, size = dax->size; - const struct block_device_operations *ops = bdev->bd_disk->fops; + phys_addr_t phys_off = (get_start_sect(bdev) + sector) * 512; - /* - * The device driver is allowed to sleep, in order to make the - * memory directly accessible. - */ - might_sleep(); - - if (size < 0) - return size; - if (!blk_queue_dax(bdev_get_queue(bdev)) || !ops->direct_access) - return -EOPNOTSUPP; - if ((sector + DIV_ROUND_UP(size, 512)) > - part_nr_sects_read(bdev->bd_part)) - return -ERANGE; - sector += get_start_sect(bdev); - if (sector % (PAGE_SIZE / 512)) + if (pgoff) + *pgoff = PHYS_PFN(phys_off); + if (phys_off % PAGE_SIZE || size % PAGE_SIZE) return -EINVAL; - avail = ops->direct_access(bdev, sector, &dax->addr, &dax->pfn, size); - if (!avail) - return -ERANGE; - if (avail > 0 && avail & ~PAGE_MASK) - return -ENXIO; - return min(avail, size); + return 0; } -EXPORT_SYMBOL_GPL(bdev_direct_access); +EXPORT_SYMBOL(bdev_dax_pgoff); /** * bdev_dax_supported() - Check if the device supports dax for filesystem @@ -774,62 +742,46 @@ EXPORT_SYMBOL_GPL(bdev_direct_access); */ int bdev_dax_supported(struct super_block *sb, int blocksize) { - struct blk_dax_ctl dax = { - .sector = 0, - .size = PAGE_SIZE, - }; - int err; + struct block_device *bdev = sb->s_bdev; + struct dax_device *dax_dev; + pgoff_t pgoff; + int err, id; + void *kaddr; + pfn_t pfn; + long len; if (blocksize != PAGE_SIZE) { vfs_msg(sb, KERN_ERR, "error: unsupported blocksize for dax"); return -EINVAL; } - err = bdev_direct_access(sb->s_bdev, &dax); - if (err < 0) { - switch (err) { - case -EOPNOTSUPP: - vfs_msg(sb, KERN_ERR, - "error: device does not support dax"); - break; - case -EINVAL: - vfs_msg(sb, KERN_ERR, - "error: unaligned partition for dax"); - break; - default: - vfs_msg(sb, KERN_ERR, - "error: dax access failed (%d)", err); - } + err = bdev_dax_pgoff(bdev, 0, PAGE_SIZE, &pgoff); + if (err) { + vfs_msg(sb, KERN_ERR, "error: unaligned partition for dax"); return err; } - return 0; -} -EXPORT_SYMBOL_GPL(bdev_dax_supported); - -/** - * bdev_dax_capable() - Return if the raw device is capable for dax - * @bdev: The device for raw block device access - */ -bool bdev_dax_capable(struct block_device *bdev) -{ - struct blk_dax_ctl dax = { - .size = PAGE_SIZE, - }; + dax_dev = dax_get_by_host(bdev->bd_disk->disk_name); + if (!dax_dev) { + vfs_msg(sb, KERN_ERR, "error: device does not support dax"); + return -EOPNOTSUPP; + } - if (!IS_ENABLED(CONFIG_FS_DAX)) - return false; + id = dax_read_lock(); + len = dax_direct_access(dax_dev, pgoff, 1, &kaddr, &pfn); + dax_read_unlock(id); - dax.sector = 0; - if (bdev_direct_access(bdev, &dax) < 0) - return false; + put_dax(dax_dev); - dax.sector = bdev->bd_part->nr_sects - (PAGE_SIZE / 512); - if (bdev_direct_access(bdev, &dax) < 0) - return false; + if (len < 1) { + vfs_msg(sb, KERN_ERR, + "error: dax access failed (%ld)", len); + return len < 0 ? len : -EIO; + } - return true; + return 0; } +EXPORT_SYMBOL_GPL(bdev_dax_supported); /* * pseudo-fs @@ -885,6 +837,8 @@ static void bdev_evict_inode(struct inode *inode) spin_lock(&bdev_lock); list_del_init(&bdev->bd_list); spin_unlock(&bdev_lock); + /* Detach inode from wb early as bdi_put() may free bdi->wb */ + inode_detach_wb(inode); if (bdev->bd_bdi != &noop_backing_dev_info) { bdi_put(bdev->bd_bdi); bdev->bd_bdi = &noop_backing_dev_info; @@ -1451,7 +1405,6 @@ int revalidate_disk(struct gendisk *disk) if (disk->fops->revalidate_disk) ret = disk->fops->revalidate_disk(disk); - blk_integrity_revalidate(disk); bdev = bdget_disk(disk, 0); if (!bdev) return ret; @@ -1556,8 +1509,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) bdev->bd_disk = disk; bdev->bd_queue = disk->queue; bdev->bd_contains = bdev; - if (bdev->bd_bdi == &noop_backing_dev_info) - bdev->bd_bdi = bdi_get(disk->queue->backing_dev_info); if (!partno) { ret = -ENXIO; @@ -1622,6 +1573,9 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) } bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9); } + + if (bdev->bd_bdi == &noop_backing_dev_info) + bdev->bd_bdi = bdi_get(disk->queue->backing_dev_info); } else { if (bdev->bd_contains == bdev) { ret = 0; @@ -1653,8 +1607,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) bdev->bd_disk = NULL; bdev->bd_part = NULL; bdev->bd_queue = NULL; - bdi_put(bdev->bd_bdi); - bdev->bd_bdi = &noop_backing_dev_info; if (bdev != bdev->bd_contains) __blkdev_put(bdev->bd_contains, mode, 1); bdev->bd_contains = NULL; @@ -1876,12 +1828,6 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) kill_bdev(bdev); bdev_write_inode(bdev); - /* - * Detaching bdev inode from its wb in __destroy_inode() - * is too late: the queue which embeds its bdi (along with - * root wb) can be gone as soon as we put_disk() below. - */ - inode_detach_wb(bdev->bd_inode); } if (bdev->bd_contains == bdev) { if (disk->fops->release) @@ -2074,7 +2020,6 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start, loff_t len) { struct block_device *bdev = I_BDEV(bdev_file_inode(file)); - struct request_queue *q = bdev_get_queue(bdev); struct address_space *mapping; loff_t end = start + len - 1; loff_t isize; @@ -2110,18 +2055,13 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start, case FALLOC_FL_ZERO_RANGE: case FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE: error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9, - GFP_KERNEL, false); + GFP_KERNEL, BLKDEV_ZERO_NOUNMAP); break; case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE: - /* Only punch if the device can do zeroing discard. */ - if (!blk_queue_discard(q) || !q->limits.discard_zeroes_data) - return -EOPNOTSUPP; - error = blkdev_issue_discard(bdev, start >> 9, len >> 9, - GFP_KERNEL, 0); + error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9, + GFP_KERNEL, BLKDEV_ZERO_NOFALLBACK); break; case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE | FALLOC_FL_NO_HIDE_STALE: - if (!blk_queue_discard(q)) - return -EOPNOTSUPP; error = blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL, 0); break; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index c4115901d906..3e21211e99c3 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -810,7 +810,6 @@ struct btrfs_fs_info { struct btrfs_super_block *super_for_commit; struct super_block *sb; struct inode *btree_inode; - struct backing_dev_info bdi; struct mutex tree_log_mutex; struct mutex transaction_kthread_mutex; struct mutex cleaner_mutex; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index eb1ee7b6f532..061c1d1f774f 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1808,21 +1808,6 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits) return ret; } -static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi) -{ - int err; - - err = bdi_setup_and_register(bdi, "btrfs"); - if (err) - return err; - - bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE; - bdi->congested_fn = btrfs_congested_fn; - bdi->congested_data = info; - bdi->capabilities |= BDI_CAP_CGROUP_WRITEBACK; - return 0; -} - /* * called by the kthread helper functions to finally call the bio end_io * functions. This is where read checksum verification actually happens @@ -2601,16 +2586,10 @@ int open_ctree(struct super_block *sb, goto fail; } - ret = setup_bdi(fs_info, &fs_info->bdi); - if (ret) { - err = ret; - goto fail_srcu; - } - ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL); if (ret) { err = ret; - goto fail_bdi; + goto fail_srcu; } fs_info->dirty_metadata_batch = PAGE_SIZE * (1 + ilog2(nr_cpu_ids)); @@ -2718,7 +2697,6 @@ int open_ctree(struct super_block *sb, sb->s_blocksize = 4096; sb->s_blocksize_bits = blksize_bits(4096); - sb->s_bdi = &fs_info->bdi; btrfs_init_btree_inode(fs_info); @@ -2915,9 +2893,12 @@ int open_ctree(struct super_block *sb, goto fail_sb_buffer; } - fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super); - fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages, - SZ_4M / PAGE_SIZE); + sb->s_bdi->congested_fn = btrfs_congested_fn; + sb->s_bdi->congested_data = fs_info; + sb->s_bdi->capabilities |= BDI_CAP_CGROUP_WRITEBACK; + sb->s_bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE; + sb->s_bdi->ra_pages *= btrfs_super_num_devices(disk_super); + sb->s_bdi->ra_pages = max(sb->s_bdi->ra_pages, SZ_4M / PAGE_SIZE); sb->s_blocksize = sectorsize; sb->s_blocksize_bits = blksize_bits(sectorsize); @@ -3285,8 +3266,6 @@ fail_delalloc_bytes: percpu_counter_destroy(&fs_info->delalloc_bytes); fail_dirty_metadata_bytes: percpu_counter_destroy(&fs_info->dirty_metadata_bytes); -fail_bdi: - bdi_destroy(&fs_info->bdi); fail_srcu: cleanup_srcu_struct(&fs_info->subvol_srcu); fail: @@ -4007,7 +3986,6 @@ void close_ctree(struct btrfs_fs_info *fs_info) percpu_counter_destroy(&fs_info->dirty_metadata_bytes); percpu_counter_destroy(&fs_info->delalloc_bytes); percpu_counter_destroy(&fs_info->bio_counter); - bdi_destroy(&fs_info->bdi); cleanup_srcu_struct(&fs_info->subvol_srcu); btrfs_free_stripe_hash_table(fs_info); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a18510be76c1..5e71f1ea3391 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7910,7 +7910,6 @@ struct btrfs_retry_complete { static void btrfs_retry_endio_nocsum(struct bio *bio) { struct btrfs_retry_complete *done = bio->bi_private; - struct inode *inode; struct bio_vec *bvec; int i; @@ -7918,12 +7917,12 @@ static void btrfs_retry_endio_nocsum(struct bio *bio) goto end; ASSERT(bio->bi_vcnt == 1); - inode = bio->bi_io_vec->bv_page->mapping->host; - ASSERT(bio->bi_io_vec->bv_len == btrfs_inode_sectorsize(inode)); + ASSERT(bio->bi_io_vec->bv_len == btrfs_inode_sectorsize(done->inode)); done->uptodate = 1; bio_for_each_segment_all(bvec, bio, i) - clean_io_failure(BTRFS_I(done->inode), done->start, bvec->bv_page, 0); + clean_io_failure(BTRFS_I(done->inode), done->start, + bvec->bv_page, 0); end: complete(&done->done); bio_put(bio); @@ -7973,8 +7972,10 @@ next_block_or_try_again: start += sectorsize; - if (nr_sectors--) { + nr_sectors--; + if (nr_sectors) { pgoff += sectorsize; + ASSERT(pgoff < PAGE_SIZE); goto next_block_or_try_again; } } @@ -7986,9 +7987,7 @@ static void btrfs_retry_endio(struct bio *bio) { struct btrfs_retry_complete *done = bio->bi_private; struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); - struct inode *inode; struct bio_vec *bvec; - u64 start; int uptodate; int ret; int i; @@ -7998,11 +7997,8 @@ static void btrfs_retry_endio(struct bio *bio) uptodate = 1; - start = done->start; - ASSERT(bio->bi_vcnt == 1); - inode = bio->bi_io_vec->bv_page->mapping->host; - ASSERT(bio->bi_io_vec->bv_len == btrfs_inode_sectorsize(inode)); + ASSERT(bio->bi_io_vec->bv_len == btrfs_inode_sectorsize(done->inode)); bio_for_each_segment_all(bvec, bio, i) { ret = __readpage_endio_check(done->inode, io_bio, i, @@ -8080,8 +8076,10 @@ next: ASSERT(nr_sectors); - if (--nr_sectors) { + nr_sectors--; + if (nr_sectors) { pgoff += sectorsize; + ASSERT(pgoff < PAGE_SIZE); goto next_block; } } diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index a59801dc2a34..afbea61d957e 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1042,9 +1042,12 @@ static void report_reserved_underflow(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *qgroup, u64 num_bytes) { - btrfs_warn(fs_info, +#ifdef CONFIG_BTRFS_DEBUG + WARN_ON(qgroup->reserved < num_bytes); + btrfs_debug(fs_info, "qgroup %llu reserved space underflow, have: %llu, to free: %llu", qgroup->qgroupid, qgroup->reserved, num_bytes); +#endif qgroup->reserved = 0; } /* @@ -1075,7 +1078,7 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, qgroup->excl += sign * num_bytes; qgroup->excl_cmpr += sign * num_bytes; if (sign > 0) { - if (WARN_ON(qgroup->reserved < num_bytes)) + if (qgroup->reserved < num_bytes) report_reserved_underflow(fs_info, qgroup, num_bytes); else qgroup->reserved -= num_bytes; @@ -1100,7 +1103,7 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, WARN_ON(sign < 0 && qgroup->excl < num_bytes); qgroup->excl += sign * num_bytes; if (sign > 0) { - if (WARN_ON(qgroup->reserved < num_bytes)) + if (qgroup->reserved < num_bytes) report_reserved_underflow(fs_info, qgroup, num_bytes); else @@ -2469,7 +2472,7 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, qg = unode_aux_to_qgroup(unode); - if (WARN_ON(qg->reserved < num_bytes)) + if (qg->reserved < num_bytes) report_reserved_underflow(fs_info, qg, num_bytes); else qg->reserved -= num_bytes; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index da687dc79cce..72a053c9a7f0 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -549,16 +549,19 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, case Opt_ssd: btrfs_set_and_info(info, SSD, "use ssd allocation scheme"); + btrfs_clear_opt(info->mount_opt, NOSSD); break; case Opt_ssd_spread: btrfs_set_and_info(info, SSD_SPREAD, "use spread ssd allocation scheme"); btrfs_set_opt(info->mount_opt, SSD); + btrfs_clear_opt(info->mount_opt, NOSSD); break; case Opt_nossd: btrfs_set_and_info(info, NOSSD, "not using ssd allocation scheme"); btrfs_clear_opt(info->mount_opt, SSD); + btrfs_clear_opt(info->mount_opt, SSD_SPREAD); break; case Opt_barrier: btrfs_clear_and_info(info, NOBARRIER, @@ -1133,6 +1136,13 @@ static int btrfs_fill_super(struct super_block *sb, #endif sb->s_flags |= MS_I_VERSION; sb->s_iflags |= SB_I_CGROUPWB; + + err = super_setup_bdi(sb); + if (err) { + btrfs_err(fs_info, "super_setup_bdi failed"); + return err; + } + err = open_ctree(sb, fs_devices, (char *)data); if (err) { btrfs_err(fs_info, "open_ctree failed"); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 73d56eef5e60..ab8a66d852f9 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6213,7 +6213,7 @@ int btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, for (dev_nr = 0; dev_nr < total_devs; dev_nr++) { dev = bbio->stripes[dev_nr].dev; if (!dev || !dev->bdev || - (bio_op(bio) == REQ_OP_WRITE && !dev->writeable)) { + (bio_op(first_bio) == REQ_OP_WRITE && !dev->writeable)) { bbio_error(bbio, first_bio, logical); continue; } diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 1a3e1b40799a..9ecb2fd348cb 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -578,7 +578,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) writeback_stat = atomic_long_inc_return(&fsc->writeback_count); if (writeback_stat > CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb)) - set_bdi_congested(&fsc->backing_dev_info, BLK_RW_ASYNC); + set_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC); set_page_writeback(page); err = ceph_osdc_writepages(osdc, ceph_vino(inode), @@ -700,7 +700,7 @@ static void writepages_finish(struct ceph_osd_request *req) if (atomic_long_dec_return(&fsc->writeback_count) < CONGESTION_OFF_THRESH( fsc->mount_options->congestion_kb)) - clear_bdi_congested(&fsc->backing_dev_info, + clear_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC); if (rc < 0) @@ -979,7 +979,7 @@ get_more_pages: if (atomic_long_inc_return(&fsc->writeback_count) > CONGESTION_ON_THRESH( fsc->mount_options->congestion_kb)) { - set_bdi_congested(&fsc->backing_dev_info, + set_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC); } diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index f2ae393e2c31..3ef11bc8d728 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -251,7 +251,7 @@ int ceph_fs_debugfs_init(struct ceph_fs_client *fsc) goto out; snprintf(name, sizeof(name), "../../bdi/%s", - dev_name(fsc->backing_dev_info.dev)); + dev_name(fsc->sb->s_bdi->dev)); fsc->debugfs_bdi = debugfs_create_symlink("bdi", fsc->client->debugfs_dir, diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index d449e1c03cbd..d3119fe3ab45 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -2071,11 +2071,6 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) if (inode_dirty_flags) __mark_inode_dirty(inode, inode_dirty_flags); - if (ia_valid & ATTR_MODE) { - err = posix_acl_chmod(inode, attr->ia_mode); - if (err) - goto out_put; - } if (mask) { req->r_inode = inode; @@ -2089,13 +2084,11 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) ceph_cap_string(dirtied), mask); ceph_mdsc_put_request(req); - if (mask & CEPH_SETATTR_SIZE) - __ceph_do_pending_vmtruncate(inode); - ceph_free_cap_flush(prealloc_cf); - return err; -out_put: - ceph_mdsc_put_request(req); ceph_free_cap_flush(prealloc_cf); + + if (err >= 0 && (mask & CEPH_SETATTR_SIZE)) + __ceph_do_pending_vmtruncate(inode); + return err; } @@ -2114,7 +2107,12 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) if (err != 0) return err; - return __ceph_setattr(inode, attr); + err = __ceph_setattr(inode, attr); + + if (err >= 0 && (attr->ia_valid & ATTR_MODE)) + err = posix_acl_chmod(inode, attr->ia_mode); + + return err; } /* diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 0ec8d0114e57..a8c81b2052ca 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -579,10 +579,6 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, atomic_long_set(&fsc->writeback_count, 0); - err = bdi_init(&fsc->backing_dev_info); - if (err < 0) - goto fail_client; - err = -ENOMEM; /* * The number of concurrent works can be high but they don't need @@ -590,7 +586,7 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, */ fsc->wb_wq = alloc_workqueue("ceph-writeback", 0, 1); if (fsc->wb_wq == NULL) - goto fail_bdi; + goto fail_client; fsc->pg_inv_wq = alloc_workqueue("ceph-pg-invalid", 0, 1); if (fsc->pg_inv_wq == NULL) goto fail_wb_wq; @@ -624,8 +620,6 @@ fail_pg_inv_wq: destroy_workqueue(fsc->pg_inv_wq); fail_wb_wq: destroy_workqueue(fsc->wb_wq); -fail_bdi: - bdi_destroy(&fsc->backing_dev_info); fail_client: ceph_destroy_client(fsc->client); fail: @@ -643,8 +637,6 @@ static void destroy_fs_client(struct ceph_fs_client *fsc) destroy_workqueue(fsc->pg_inv_wq); destroy_workqueue(fsc->trunc_wq); - bdi_destroy(&fsc->backing_dev_info); - mempool_destroy(fsc->wb_pagevec_pool); destroy_mount_options(fsc->mount_options); @@ -937,33 +929,32 @@ static int ceph_compare_super(struct super_block *sb, void *data) */ static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); -static int ceph_register_bdi(struct super_block *sb, - struct ceph_fs_client *fsc) +static int ceph_setup_bdi(struct super_block *sb, struct ceph_fs_client *fsc) { int err; + err = super_setup_bdi_name(sb, "ceph-%ld", + atomic_long_inc_return(&bdi_seq)); + if (err) + return err; + /* set ra_pages based on rasize mount option? */ if (fsc->mount_options->rasize >= PAGE_SIZE) - fsc->backing_dev_info.ra_pages = + sb->s_bdi->ra_pages = (fsc->mount_options->rasize + PAGE_SIZE - 1) >> PAGE_SHIFT; else - fsc->backing_dev_info.ra_pages = - VM_MAX_READAHEAD * 1024 / PAGE_SIZE; + sb->s_bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE; if (fsc->mount_options->rsize > fsc->mount_options->rasize && fsc->mount_options->rsize >= PAGE_SIZE) - fsc->backing_dev_info.io_pages = + sb->s_bdi->io_pages = (fsc->mount_options->rsize + PAGE_SIZE - 1) >> PAGE_SHIFT; else if (fsc->mount_options->rsize == 0) - fsc->backing_dev_info.io_pages = ULONG_MAX; + sb->s_bdi->io_pages = ULONG_MAX; - err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%ld", - atomic_long_inc_return(&bdi_seq)); - if (!err) - sb->s_bdi = &fsc->backing_dev_info; - return err; + return 0; } static struct dentry *ceph_mount(struct file_system_type *fs_type, @@ -1018,7 +1009,7 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type, dout("get_sb got existing client %p\n", fsc); } else { dout("get_sb using new client %p\n", fsc); - err = ceph_register_bdi(sb, fsc); + err = ceph_setup_bdi(sb, fsc); if (err < 0) { res = ERR_PTR(err); goto out_splat; diff --git a/fs/ceph/super.h b/fs/ceph/super.h index fe6b9cfc4013..176186b12457 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -92,8 +92,6 @@ struct ceph_fs_client { struct workqueue_struct *trunc_wq; atomic_long_t writeback_count; - struct backing_dev_info backing_dev_info; - #ifdef CONFIG_DEBUG_FS struct dentry *debugfs_dentry_lru, *debugfs_caps; struct dentry *debugfs_congestion_kb; diff --git a/fs/char_dev.c b/fs/char_dev.c index 44a240c4bb65..fb8507f521b2 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -471,6 +471,85 @@ int cdev_add(struct cdev *p, dev_t dev, unsigned count) return 0; } +/** + * cdev_set_parent() - set the parent kobject for a char device + * @p: the cdev structure + * @kobj: the kobject to take a reference to + * + * cdev_set_parent() sets a parent kobject which will be referenced + * appropriately so the parent is not freed before the cdev. This + * should be called before cdev_add. + */ +void cdev_set_parent(struct cdev *p, struct kobject *kobj) +{ + WARN_ON(!kobj->state_initialized); + p->kobj.parent = kobj; +} + +/** + * cdev_device_add() - add a char device and it's corresponding + * struct device, linkink + * @dev: the device structure + * @cdev: the cdev structure + * + * cdev_device_add() adds the char device represented by @cdev to the system, + * just as cdev_add does. It then adds @dev to the system using device_add + * The dev_t for the char device will be taken from the struct device which + * needs to be initialized first. This helper function correctly takes a + * reference to the parent device so the parent will not get released until + * all references to the cdev are released. + * + * This helper uses dev->devt for the device number. If it is not set + * it will not add the cdev and it will be equivalent to device_add. + * + * This function should be used whenever the struct cdev and the + * struct device are members of the same structure whose lifetime is + * managed by the struct device. + * + * NOTE: Callers must assume that userspace was able to open the cdev and + * can call cdev fops callbacks at any time, even if this function fails. + */ +int cdev_device_add(struct cdev *cdev, struct device *dev) +{ + int rc = 0; + + if (dev->devt) { + cdev_set_parent(cdev, &dev->kobj); + + rc = cdev_add(cdev, dev->devt, 1); + if (rc) + return rc; + } + + rc = device_add(dev); + if (rc) + cdev_del(cdev); + + return rc; +} + +/** + * cdev_device_del() - inverse of cdev_device_add + * @dev: the device structure + * @cdev: the cdev structure + * + * cdev_device_del() is a helper function to call cdev_del and device_del. + * It should be used whenever cdev_device_add is used. + * + * If dev->devt is not set it will not remove the cdev and will be equivalent + * to device_del. + * + * NOTE: This guarantees that associated sysfs callbacks are not running + * or runnable, however any cdevs already open will remain and their fops + * will still be callable even after this function returns. + */ +void cdev_device_del(struct cdev *cdev, struct device *dev) +{ + device_del(dev); + if (dev->devt) + cdev_del(cdev); +} + static void cdev_unmap(dev_t dev, unsigned count) { kobj_unmap(cdev_map, dev, count); @@ -482,6 +561,10 @@ static void cdev_unmap(dev_t dev, unsigned count) * * cdev_del() removes @p from the system, possibly freeing the structure * itself. + * + * NOTE: This guarantees that cdev device will no longer be able to be + * opened, however any cdevs already open will remain and their fops will + * still be callable even after cdev_del returns. */ void cdev_del(struct cdev *p) { @@ -570,5 +653,8 @@ EXPORT_SYMBOL(cdev_init); EXPORT_SYMBOL(cdev_alloc); EXPORT_SYMBOL(cdev_del); EXPORT_SYMBOL(cdev_add); +EXPORT_SYMBOL(cdev_set_parent); +EXPORT_SYMBOL(cdev_device_add); +EXPORT_SYMBOL(cdev_device_del); EXPORT_SYMBOL(__register_chrdev); EXPORT_SYMBOL(__unregister_chrdev); diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h index 07ed81cf1552..cbd216b57239 100644 --- a/fs/cifs/cifs_fs_sb.h +++ b/fs/cifs/cifs_fs_sb.h @@ -68,7 +68,6 @@ struct cifs_sb_info { umode_t mnt_dir_mode; unsigned int mnt_cifs_flags; char *mountdata; /* options received at mount time or via DFS refs */ - struct backing_dev_info bdi; struct delayed_work prune_tlinks; struct rcu_head rcu; char *prepath; diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c index 02b071bf3732..a0b3e7d1be48 100644 --- a/fs/cifs/cifs_unicode.c +++ b/fs/cifs/cifs_unicode.c @@ -83,6 +83,9 @@ convert_sfm_char(const __u16 src_char, char *target) case SFM_COLON: *target = ':'; break; + case SFM_DOUBLEQUOTE: + *target = '"'; + break; case SFM_ASTERISK: *target = '*'; break; @@ -418,6 +421,9 @@ static __le16 convert_to_sfm_char(char src_char, bool end_of_string) case ':': dest_char = cpu_to_le16(SFM_COLON); break; + case '"': + dest_char = cpu_to_le16(SFM_DOUBLEQUOTE); + break; case '*': dest_char = cpu_to_le16(SFM_ASTERISK); break; diff --git a/fs/cifs/cifs_unicode.h b/fs/cifs/cifs_unicode.h index 3d7298cc0aeb..8a79a34e66b8 100644 --- a/fs/cifs/cifs_unicode.h +++ b/fs/cifs/cifs_unicode.h @@ -57,6 +57,7 @@ * not conflict (although almost does) with the mapping above. */ +#define SFM_DOUBLEQUOTE ((__u16) 0xF020) #define SFM_ASTERISK ((__u16) 0xF021) #define SFM_QUESTION ((__u16) 0xF025) #define SFM_COLON ((__u16) 0xF022) @@ -64,8 +65,8 @@ #define SFM_LESSTHAN ((__u16) 0xF023) #define SFM_PIPE ((__u16) 0xF027) #define SFM_SLASH ((__u16) 0xF026) -#define SFM_PERIOD ((__u16) 0xF028) -#define SFM_SPACE ((__u16) 0xF029) +#define SFM_SPACE ((__u16) 0xF028) +#define SFM_PERIOD ((__u16) 0xF029) /* * Mapping mechanism to use when one of the seven reserved characters is diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index dd3f5fabfdf6..9a1667e0e8d6 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -37,6 +37,7 @@ #include <linux/freezer.h> #include <linux/namei.h> #include <linux/random.h> +#include <linux/uuid.h> #include <linux/xattr.h> #include <net/ipv6.h> #include "cifsfs.h" @@ -87,6 +88,7 @@ extern mempool_t *cifs_req_poolp; extern mempool_t *cifs_mid_poolp; struct workqueue_struct *cifsiod_wq; +struct workqueue_struct *cifsoplockd_wq; __u32 cifs_lock_secret; /* @@ -138,7 +140,12 @@ cifs_read_super(struct super_block *sb) sb->s_magic = CIFS_MAGIC_NUMBER; sb->s_op = &cifs_super_ops; sb->s_xattr = cifs_xattr_handlers; - sb->s_bdi = &cifs_sb->bdi; + rc = super_setup_bdi(sb); + if (rc) + goto out_no_root; + /* tune readahead according to rsize */ + sb->s_bdi->ra_pages = cifs_sb->rsize / PAGE_SIZE; + sb->s_blocksize = CIFS_MAX_MSGSIZE; sb->s_blocksize_bits = 14; /* default 2**14 = CIFS_MAX_MSGSIZE */ inode = cifs_root_iget(sb); @@ -1369,9 +1376,16 @@ init_cifs(void) goto out_clean_proc; } + cifsoplockd_wq = alloc_workqueue("cifsoplockd", + WQ_FREEZABLE|WQ_MEM_RECLAIM, 0); + if (!cifsoplockd_wq) { + rc = -ENOMEM; + goto out_destroy_cifsiod_wq; + } + rc = cifs_fscache_register(); if (rc) - goto out_destroy_wq; + goto out_destroy_cifsoplockd_wq; rc = cifs_init_inodecache(); if (rc) @@ -1419,7 +1433,9 @@ out_destroy_inodecache: cifs_destroy_inodecache(); out_unreg_fscache: cifs_fscache_unregister(); -out_destroy_wq: +out_destroy_cifsoplockd_wq: + destroy_workqueue(cifsoplockd_wq); +out_destroy_cifsiod_wq: destroy_workqueue(cifsiod_wq); out_clean_proc: cifs_proc_clean(); @@ -1442,6 +1458,7 @@ exit_cifs(void) cifs_destroy_mids(); cifs_destroy_inodecache(); cifs_fscache_unregister(); + destroy_workqueue(cifsoplockd_wq); destroy_workqueue(cifsiod_wq); cifs_proc_clean(); } diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index d07f13a63369..8be55be70faf 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -948,7 +948,6 @@ struct cifs_tcon { bool use_persistent:1; /* use persistent instead of durable handles */ #ifdef CONFIG_CIFS_SMB2 bool print:1; /* set if connection to printer share */ - bool bad_network_name:1; /* set if ret status STATUS_BAD_NETWORK_NAME */ __le32 capabilities; __u32 share_flags; __u32 maximal_access; @@ -1116,6 +1115,23 @@ struct cifs_io_parms { struct cifs_tcon *tcon; }; +struct cifs_aio_ctx { + struct kref refcount; + struct list_head list; + struct mutex aio_mutex; + struct completion done; + struct iov_iter iter; + struct kiocb *iocb; + struct cifsFileInfo *cfile; + struct bio_vec *bv; + loff_t pos; + unsigned int npages; + ssize_t rc; + unsigned int len; + unsigned int total_len; + bool should_dirty; +}; + struct cifs_readdata; /* asynchronous read support */ @@ -1125,6 +1141,7 @@ struct cifs_readdata { struct completion done; struct cifsFileInfo *cfile; struct address_space *mapping; + struct cifs_aio_ctx *ctx; __u64 offset; unsigned int bytes; unsigned int got_bytes; @@ -1155,6 +1172,7 @@ struct cifs_writedata { enum writeback_sync_modes sync_mode; struct work_struct work; struct cifsFileInfo *cfile; + struct cifs_aio_ctx *ctx; __u64 offset; pid_t pid; unsigned int bytes; @@ -1684,6 +1702,7 @@ void cifs_oplock_break(struct work_struct *work); extern const struct slow_work_ops cifs_oplock_break_ops; extern struct workqueue_struct *cifsiod_wq; +extern struct workqueue_struct *cifsoplockd_wq; extern __u32 cifs_lock_secret; extern mempool_t *cifs_mid_poolp; diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index ec5e5e514fdd..e49958c3f8bb 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -79,8 +79,7 @@ extern void cifs_delete_mid(struct mid_q_entry *mid); extern void cifs_wake_up_task(struct mid_q_entry *mid); extern int cifs_handle_standard(struct TCP_Server_Info *server, struct mid_q_entry *mid); -extern int cifs_discard_remaining_data(struct TCP_Server_Info *server, - char *buf); +extern int cifs_discard_remaining_data(struct TCP_Server_Info *server); extern int cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst, mid_receive_t *receive, mid_callback_t *callback, @@ -536,4 +535,7 @@ int __cifs_calc_signature(struct smb_rqst *rqst, struct shash_desc *shash); enum securityEnum cifs_select_sectype(struct TCP_Server_Info *, enum securityEnum); +struct cifs_aio_ctx *cifs_aio_ctx_alloc(void); +void cifs_aio_ctx_release(struct kref *refcount); +int setup_aio_ctx_iter(struct cifs_aio_ctx *ctx, struct iov_iter *iter, int rw); #endif /* _CIFSPROTO_H */ diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 967b92631807..205fd94f52fd 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -718,6 +718,9 @@ CIFSSMBEcho(struct TCP_Server_Info *server) if (rc) return rc; + if (server->capabilities & CAP_UNICODE) + smb->hdr.Flags2 |= SMBFLG2_UNICODE; + /* set up echo request */ smb->hdr.Tid = 0xffff; smb->hdr.WordCount = 1; @@ -1400,9 +1403,9 @@ openRetry: * current bigbuf. */ int -cifs_discard_remaining_data(struct TCP_Server_Info *server, char *buf) +cifs_discard_remaining_data(struct TCP_Server_Info *server) { - unsigned int rfclen = get_rfc1002_length(buf); + unsigned int rfclen = get_rfc1002_length(server->smallbuf); int remaining = rfclen + 4 - server->total_read; while (remaining > 0) { @@ -1426,8 +1429,10 @@ cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid) int length; struct cifs_readdata *rdata = mid->callback_data; - length = cifs_discard_remaining_data(server, mid->resp_buf); + length = cifs_discard_remaining_data(server); dequeue_mid(mid, rdata->result); + mid->resp_buf = server->smallbuf; + server->smallbuf = NULL; return length; } @@ -1459,7 +1464,7 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) if (server->ops->is_status_pending && server->ops->is_status_pending(buf, server, 0)) { - cifs_discard_remaining_data(server, buf); + cifs_discard_remaining_data(server); return -1; } @@ -1519,9 +1524,6 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) cifs_dbg(FYI, "0: iov_base=%p iov_len=%u\n", rdata->iov[0].iov_base, server->total_read); - mid->resp_buf = server->smallbuf; - server->smallbuf = NULL; - /* how much data is in the response? */ data_len = server->ops->read_data_length(buf); if (data_offset + data_len > buflen) { @@ -1544,6 +1546,8 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) return cifs_readv_discard(server, mid); dequeue_mid(mid, false); + mid->resp_buf = server->smallbuf; + server->smallbuf = NULL; return length; } diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 0c7596cef4b8..9365c0cf77ad 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -35,6 +35,7 @@ #include <linux/pagevec.h> #include <linux/freezer.h> #include <linux/namei.h> +#include <linux/uuid.h> #include <linux/uaccess.h> #include <asm/processor.h> #include <linux/inet.h> @@ -1945,9 +1946,14 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, } if (!got_ip) { + int len; + const char *slash; + /* No ip= option specified? Try to get it from UNC */ - if (!cifs_convert_address(dstaddr, &vol->UNC[2], - strlen(&vol->UNC[2]))) { + /* Use the address part of the UNC. */ + slash = strchr(&vol->UNC[2], '\\'); + len = slash - &vol->UNC[2]; + if (!cifs_convert_address(dstaddr, &vol->UNC[2], len)) { pr_err("Unable to determine destination address.\n"); goto cifs_parse_mount_err; } @@ -2912,16 +2918,14 @@ match_prepath(struct super_block *sb, struct cifs_mnt_data *mnt_data) { struct cifs_sb_info *old = CIFS_SB(sb); struct cifs_sb_info *new = mnt_data->cifs_sb; + bool old_set = old->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH; + bool new_set = new->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH; - if (old->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH) { - if (!(new->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH)) - return 0; - /* The prepath should be null terminated strings */ - if (strcmp(new->prepath, old->prepath)) - return 0; - + if (old_set && new_set && !strcmp(new->prepath, old->prepath)) return 1; - } + else if (!old_set && !new_set) + return 1; + return 0; } @@ -3692,10 +3696,6 @@ cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *volume_info) int referral_walks_count = 0; #endif - rc = bdi_setup_and_register(&cifs_sb->bdi, "cifs"); - if (rc) - return rc; - #ifdef CONFIG_CIFS_DFS_UPCALL try_mount_again: /* cleanup activities if we're chasing a referral */ @@ -3723,7 +3723,6 @@ try_mount_again: server = cifs_get_tcp_session(volume_info); if (IS_ERR(server)) { rc = PTR_ERR(server); - bdi_destroy(&cifs_sb->bdi); goto out; } if ((volume_info->max_credits < 20) || @@ -3753,6 +3752,9 @@ try_mount_again: if (IS_ERR(tcon)) { rc = PTR_ERR(tcon); tcon = NULL; + if (rc == -EACCES) + goto mount_fail_check; + goto remote_path_check; } @@ -3777,9 +3779,6 @@ try_mount_again: cifs_sb->wsize = server->ops->negotiate_wsize(tcon, volume_info); cifs_sb->rsize = server->ops->negotiate_rsize(tcon, volume_info); - /* tune readahead according to rsize */ - cifs_sb->bdi.ra_pages = cifs_sb->rsize / PAGE_SIZE; - remote_path_check: #ifdef CONFIG_CIFS_DFS_UPCALL /* @@ -3896,7 +3895,6 @@ mount_fail_check: cifs_put_smb_ses(ses); else cifs_put_tcp_session(server, 0); - bdi_destroy(&cifs_sb->bdi); } out: @@ -4099,7 +4097,6 @@ cifs_umount(struct cifs_sb_info *cifs_sb) } spin_unlock(&cifs_sb->tlink_tree_lock); - bdi_destroy(&cifs_sb->bdi); kfree(cifs_sb->mountdata); kfree(cifs_sb->prepath); call_rcu(&cifs_sb->rcu, delayed_free); diff --git a/fs/cifs/file.c b/fs/cifs/file.c index aa3debbba826..6ef78ad838e6 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -2458,11 +2458,14 @@ cifs_uncached_writedata_release(struct kref *refcount) struct cifs_writedata *wdata = container_of(refcount, struct cifs_writedata, refcount); + kref_put(&wdata->ctx->refcount, cifs_aio_ctx_release); for (i = 0; i < wdata->nr_pages; i++) put_page(wdata->pages[i]); cifs_writedata_release(refcount); } +static void collect_uncached_write_data(struct cifs_aio_ctx *ctx); + static void cifs_uncached_writev_complete(struct work_struct *work) { @@ -2478,7 +2481,8 @@ cifs_uncached_writev_complete(struct work_struct *work) spin_unlock(&inode->i_lock); complete(&wdata->done); - + collect_uncached_write_data(wdata->ctx); + /* the below call can possibly free the last ref to aio ctx */ kref_put(&wdata->refcount, cifs_uncached_writedata_release); } @@ -2527,7 +2531,8 @@ wdata_fill_from_iovec(struct cifs_writedata *wdata, struct iov_iter *from, static int cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from, struct cifsFileInfo *open_file, - struct cifs_sb_info *cifs_sb, struct list_head *wdata_list) + struct cifs_sb_info *cifs_sb, struct list_head *wdata_list, + struct cifs_aio_ctx *ctx) { int rc = 0; size_t cur_len; @@ -2595,9 +2600,11 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from, wdata->pagesz = PAGE_SIZE; wdata->tailsz = cur_len - ((nr_pages - 1) * PAGE_SIZE); wdata->credits = credits; + wdata->ctx = ctx; + kref_get(&ctx->refcount); if (!wdata->cfile->invalidHandle || - !cifs_reopen_file(wdata->cfile, false)) + !(rc = cifs_reopen_file(wdata->cfile, false))) rc = server->ops->async_writev(wdata, cifs_uncached_writedata_release); if (rc) { @@ -2620,81 +2627,61 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from, return rc; } -ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from) +static void collect_uncached_write_data(struct cifs_aio_ctx *ctx) { - struct file *file = iocb->ki_filp; - ssize_t total_written = 0; - struct cifsFileInfo *open_file; + struct cifs_writedata *wdata, *tmp; struct cifs_tcon *tcon; struct cifs_sb_info *cifs_sb; - struct cifs_writedata *wdata, *tmp; - struct list_head wdata_list; - struct iov_iter saved_from = *from; + struct dentry *dentry = ctx->cfile->dentry; + unsigned int i; int rc; - /* - * BB - optimize the way when signing is disabled. We can drop this - * extra memory-to-memory copying and use iovec buffers for constructing - * write request. - */ - - rc = generic_write_checks(iocb, from); - if (rc <= 0) - return rc; - - INIT_LIST_HEAD(&wdata_list); - cifs_sb = CIFS_FILE_SB(file); - open_file = file->private_data; - tcon = tlink_tcon(open_file->tlink); - - if (!tcon->ses->server->ops->async_writev) - return -ENOSYS; + tcon = tlink_tcon(ctx->cfile->tlink); + cifs_sb = CIFS_SB(dentry->d_sb); - rc = cifs_write_from_iter(iocb->ki_pos, iov_iter_count(from), from, - open_file, cifs_sb, &wdata_list); + mutex_lock(&ctx->aio_mutex); - /* - * If at least one write was successfully sent, then discard any rc - * value from the later writes. If the other write succeeds, then - * we'll end up returning whatever was written. If it fails, then - * we'll get a new rc value from that. - */ - if (!list_empty(&wdata_list)) - rc = 0; + if (list_empty(&ctx->list)) { + mutex_unlock(&ctx->aio_mutex); + return; + } + rc = ctx->rc; /* * Wait for and collect replies for any successful sends in order of - * increasing offset. Once an error is hit or we get a fatal signal - * while waiting, then return without waiting for any more replies. + * increasing offset. Once an error is hit, then return without waiting + * for any more replies. */ restart_loop: - list_for_each_entry_safe(wdata, tmp, &wdata_list, list) { + list_for_each_entry_safe(wdata, tmp, &ctx->list, list) { if (!rc) { - /* FIXME: freezable too? */ - rc = wait_for_completion_killable(&wdata->done); - if (rc) - rc = -EINTR; - else if (wdata->result) + if (!try_wait_for_completion(&wdata->done)) { + mutex_unlock(&ctx->aio_mutex); + return; + } + + if (wdata->result) rc = wdata->result; else - total_written += wdata->bytes; + ctx->total_len += wdata->bytes; /* resend call if it's a retryable error */ if (rc == -EAGAIN) { struct list_head tmp_list; - struct iov_iter tmp_from = saved_from; + struct iov_iter tmp_from = ctx->iter; INIT_LIST_HEAD(&tmp_list); list_del_init(&wdata->list); iov_iter_advance(&tmp_from, - wdata->offset - iocb->ki_pos); + wdata->offset - ctx->pos); rc = cifs_write_from_iter(wdata->offset, wdata->bytes, &tmp_from, - open_file, cifs_sb, &tmp_list); + ctx->cfile, cifs_sb, &tmp_list, + ctx); - list_splice(&tmp_list, &wdata_list); + list_splice(&tmp_list, &ctx->list); kref_put(&wdata->refcount, cifs_uncached_writedata_release); @@ -2705,12 +2692,111 @@ restart_loop: kref_put(&wdata->refcount, cifs_uncached_writedata_release); } + for (i = 0; i < ctx->npages; i++) + put_page(ctx->bv[i].bv_page); + + cifs_stats_bytes_written(tcon, ctx->total_len); + set_bit(CIFS_INO_INVALID_MAPPING, &CIFS_I(dentry->d_inode)->flags); + + ctx->rc = (rc == 0) ? ctx->total_len : rc; + + mutex_unlock(&ctx->aio_mutex); + + if (ctx->iocb && ctx->iocb->ki_complete) + ctx->iocb->ki_complete(ctx->iocb, ctx->rc, 0); + else + complete(&ctx->done); +} + +ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + ssize_t total_written = 0; + struct cifsFileInfo *cfile; + struct cifs_tcon *tcon; + struct cifs_sb_info *cifs_sb; + struct cifs_aio_ctx *ctx; + struct iov_iter saved_from = *from; + int rc; + + /* + * BB - optimize the way when signing is disabled. We can drop this + * extra memory-to-memory copying and use iovec buffers for constructing + * write request. + */ + + rc = generic_write_checks(iocb, from); + if (rc <= 0) + return rc; + + cifs_sb = CIFS_FILE_SB(file); + cfile = file->private_data; + tcon = tlink_tcon(cfile->tlink); + + if (!tcon->ses->server->ops->async_writev) + return -ENOSYS; + + ctx = cifs_aio_ctx_alloc(); + if (!ctx) + return -ENOMEM; + + ctx->cfile = cifsFileInfo_get(cfile); + + if (!is_sync_kiocb(iocb)) + ctx->iocb = iocb; + + ctx->pos = iocb->ki_pos; + + rc = setup_aio_ctx_iter(ctx, from, WRITE); + if (rc) { + kref_put(&ctx->refcount, cifs_aio_ctx_release); + return rc; + } + + /* grab a lock here due to read response handlers can access ctx */ + mutex_lock(&ctx->aio_mutex); + + rc = cifs_write_from_iter(iocb->ki_pos, ctx->len, &saved_from, + cfile, cifs_sb, &ctx->list, ctx); + + /* + * If at least one write was successfully sent, then discard any rc + * value from the later writes. If the other write succeeds, then + * we'll end up returning whatever was written. If it fails, then + * we'll get a new rc value from that. + */ + if (!list_empty(&ctx->list)) + rc = 0; + + mutex_unlock(&ctx->aio_mutex); + + if (rc) { + kref_put(&ctx->refcount, cifs_aio_ctx_release); + return rc; + } + + if (!is_sync_kiocb(iocb)) { + kref_put(&ctx->refcount, cifs_aio_ctx_release); + return -EIOCBQUEUED; + } + + rc = wait_for_completion_killable(&ctx->done); + if (rc) { + mutex_lock(&ctx->aio_mutex); + ctx->rc = rc = -EINTR; + total_written = ctx->total_len; + mutex_unlock(&ctx->aio_mutex); + } else { + rc = ctx->rc; + total_written = ctx->total_len; + } + + kref_put(&ctx->refcount, cifs_aio_ctx_release); + if (unlikely(!total_written)) return rc; iocb->ki_pos += total_written; - set_bit(CIFS_INO_INVALID_MAPPING, &CIFS_I(file_inode(file))->flags); - cifs_stats_bytes_written(tcon, total_written); return total_written; } @@ -2859,6 +2945,7 @@ cifs_uncached_readdata_release(struct kref *refcount) struct cifs_readdata, refcount); unsigned int i; + kref_put(&rdata->ctx->refcount, cifs_aio_ctx_release); for (i = 0; i < rdata->nr_pages; i++) { put_page(rdata->pages[i]); rdata->pages[i] = NULL; @@ -2900,6 +2987,8 @@ cifs_readdata_to_iov(struct cifs_readdata *rdata, struct iov_iter *iter) return remaining ? -EFAULT : 0; } +static void collect_uncached_read_data(struct cifs_aio_ctx *ctx); + static void cifs_uncached_readv_complete(struct work_struct *work) { @@ -2907,6 +2996,8 @@ cifs_uncached_readv_complete(struct work_struct *work) struct cifs_readdata, work); complete(&rdata->done); + collect_uncached_read_data(rdata->ctx); + /* the below call can possibly free the last ref to aio ctx */ kref_put(&rdata->refcount, cifs_uncached_readdata_release); } @@ -2973,7 +3064,8 @@ cifs_uncached_copy_into_pages(struct TCP_Server_Info *server, static int cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file, - struct cifs_sb_info *cifs_sb, struct list_head *rdata_list) + struct cifs_sb_info *cifs_sb, struct list_head *rdata_list, + struct cifs_aio_ctx *ctx) { struct cifs_readdata *rdata; unsigned int npages, rsize, credits; @@ -3020,9 +3112,11 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file, rdata->read_into_pages = cifs_uncached_read_into_pages; rdata->copy_into_pages = cifs_uncached_copy_into_pages; rdata->credits = credits; + rdata->ctx = ctx; + kref_get(&ctx->refcount); if (!rdata->cfile->invalidHandle || - !cifs_reopen_file(rdata->cfile, true)) + !(rc = cifs_reopen_file(rdata->cfile, true))) rc = server->ops->async_readv(rdata); error: if (rc) { @@ -3042,50 +3136,37 @@ error: return rc; } -ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to) +static void +collect_uncached_read_data(struct cifs_aio_ctx *ctx) { - struct file *file = iocb->ki_filp; - ssize_t rc; - size_t len; - ssize_t total_read = 0; - loff_t offset = iocb->ki_pos; + struct cifs_readdata *rdata, *tmp; + struct iov_iter *to = &ctx->iter; struct cifs_sb_info *cifs_sb; struct cifs_tcon *tcon; - struct cifsFileInfo *open_file; - struct cifs_readdata *rdata, *tmp; - struct list_head rdata_list; - - len = iov_iter_count(to); - if (!len) - return 0; - - INIT_LIST_HEAD(&rdata_list); - cifs_sb = CIFS_FILE_SB(file); - open_file = file->private_data; - tcon = tlink_tcon(open_file->tlink); - - if (!tcon->ses->server->ops->async_readv) - return -ENOSYS; + unsigned int i; + int rc; - if ((file->f_flags & O_ACCMODE) == O_WRONLY) - cifs_dbg(FYI, "attempting read on write only file instance\n"); + tcon = tlink_tcon(ctx->cfile->tlink); + cifs_sb = CIFS_SB(ctx->cfile->dentry->d_sb); - rc = cifs_send_async_read(offset, len, open_file, cifs_sb, &rdata_list); + mutex_lock(&ctx->aio_mutex); - /* if at least one read request send succeeded, then reset rc */ - if (!list_empty(&rdata_list)) - rc = 0; + if (list_empty(&ctx->list)) { + mutex_unlock(&ctx->aio_mutex); + return; + } - len = iov_iter_count(to); + rc = ctx->rc; /* the loop below should proceed in the order of increasing offsets */ again: - list_for_each_entry_safe(rdata, tmp, &rdata_list, list) { + list_for_each_entry_safe(rdata, tmp, &ctx->list, list) { if (!rc) { - /* FIXME: freezable sleep too? */ - rc = wait_for_completion_killable(&rdata->done); - if (rc) - rc = -EINTR; - else if (rdata->result == -EAGAIN) { + if (!try_wait_for_completion(&rdata->done)) { + mutex_unlock(&ctx->aio_mutex); + return; + } + + if (rdata->result == -EAGAIN) { /* resend call if it's a retryable error */ struct list_head tmp_list; unsigned int got_bytes = rdata->got_bytes; @@ -3111,9 +3192,9 @@ again: rdata->offset + got_bytes, rdata->bytes - got_bytes, rdata->cfile, cifs_sb, - &tmp_list); + &tmp_list, ctx); - list_splice(&tmp_list, &rdata_list); + list_splice(&tmp_list, &ctx->list); kref_put(&rdata->refcount, cifs_uncached_readdata_release); @@ -3131,14 +3212,110 @@ again: kref_put(&rdata->refcount, cifs_uncached_readdata_release); } - total_read = len - iov_iter_count(to); + for (i = 0; i < ctx->npages; i++) { + if (ctx->should_dirty) + set_page_dirty(ctx->bv[i].bv_page); + put_page(ctx->bv[i].bv_page); + } + + ctx->total_len = ctx->len - iov_iter_count(to); - cifs_stats_bytes_read(tcon, total_read); + cifs_stats_bytes_read(tcon, ctx->total_len); /* mask nodata case */ if (rc == -ENODATA) rc = 0; + ctx->rc = (rc == 0) ? ctx->total_len : rc; + + mutex_unlock(&ctx->aio_mutex); + + if (ctx->iocb && ctx->iocb->ki_complete) + ctx->iocb->ki_complete(ctx->iocb, ctx->rc, 0); + else + complete(&ctx->done); +} + +ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to) +{ + struct file *file = iocb->ki_filp; + ssize_t rc; + size_t len; + ssize_t total_read = 0; + loff_t offset = iocb->ki_pos; + struct cifs_sb_info *cifs_sb; + struct cifs_tcon *tcon; + struct cifsFileInfo *cfile; + struct cifs_aio_ctx *ctx; + + len = iov_iter_count(to); + if (!len) + return 0; + + cifs_sb = CIFS_FILE_SB(file); + cfile = file->private_data; + tcon = tlink_tcon(cfile->tlink); + + if (!tcon->ses->server->ops->async_readv) + return -ENOSYS; + + if ((file->f_flags & O_ACCMODE) == O_WRONLY) + cifs_dbg(FYI, "attempting read on write only file instance\n"); + + ctx = cifs_aio_ctx_alloc(); + if (!ctx) + return -ENOMEM; + + ctx->cfile = cifsFileInfo_get(cfile); + + if (!is_sync_kiocb(iocb)) + ctx->iocb = iocb; + + if (to->type & ITER_IOVEC) + ctx->should_dirty = true; + + rc = setup_aio_ctx_iter(ctx, to, READ); + if (rc) { + kref_put(&ctx->refcount, cifs_aio_ctx_release); + return rc; + } + + len = ctx->len; + + /* grab a lock here due to read response handlers can access ctx */ + mutex_lock(&ctx->aio_mutex); + + rc = cifs_send_async_read(offset, len, cfile, cifs_sb, &ctx->list, ctx); + + /* if at least one read request send succeeded, then reset rc */ + if (!list_empty(&ctx->list)) + rc = 0; + + mutex_unlock(&ctx->aio_mutex); + + if (rc) { + kref_put(&ctx->refcount, cifs_aio_ctx_release); + return rc; + } + + if (!is_sync_kiocb(iocb)) { + kref_put(&ctx->refcount, cifs_aio_ctx_release); + return -EIOCBQUEUED; + } + + rc = wait_for_completion_killable(&ctx->done); + if (rc) { + mutex_lock(&ctx->aio_mutex); + ctx->rc = rc = -EINTR; + total_read = ctx->total_len; + mutex_unlock(&ctx->aio_mutex); + } else { + rc = ctx->rc; + total_read = ctx->total_len; + } + + kref_put(&ctx->refcount, cifs_aio_ctx_release); + if (total_read) { iocb->ki_pos += total_read; return total_read; @@ -3617,7 +3794,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, } if (!rdata->cfile->invalidHandle || - !cifs_reopen_file(rdata->cfile, true)) + !(rc = cifs_reopen_file(rdata->cfile, true))) rc = server->ops->async_readv(rdata); if (rc) { add_credits_and_wake_if(server, rdata->credits, 0); diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c index 265c45fe4ea5..76fb0917dc8c 100644 --- a/fs/cifs/ioctl.c +++ b/fs/cifs/ioctl.c @@ -74,7 +74,8 @@ static long cifs_ioctl_copychunk(unsigned int xid, struct file *dst_file, rc = cifs_file_copychunk_range(xid, src_file.file, 0, dst_file, 0, src_inode->i_size, 0); - + if (rc > 0) + rc = 0; out_fput: fdput(src_file); out_drop_write: @@ -208,10 +209,14 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) rc = -EOPNOTSUPP; break; case CIFS_IOC_GET_MNT_INFO: + if (pSMBFile == NULL) + break; tcon = tlink_tcon(pSMBFile->tlink); rc = smb_mnt_get_fsinfo(xid, tcon, (void __user *)arg); break; case CIFS_ENUMERATE_SNAPSHOTS: + if (pSMBFile == NULL) + break; if (arg == 0) { rc = -EINVAL; goto cifs_ioc_exit; diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index d3fb11529ed9..b08531977daa 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -22,6 +22,7 @@ #include <linux/slab.h> #include <linux/ctype.h> #include <linux/mempool.h> +#include <linux/vmalloc.h> #include "cifspdu.h" #include "cifsglob.h" #include "cifsproto.h" @@ -167,13 +168,11 @@ cifs_buf_get(void) /* clear the first few header bytes */ /* for most paths, more is cleared in header_assemble */ - if (ret_buf) { - memset(ret_buf, 0, buf_size + 3); - atomic_inc(&bufAllocCount); + memset(ret_buf, 0, buf_size + 3); + atomic_inc(&bufAllocCount); #ifdef CONFIG_CIFS_STATS2 - atomic_inc(&totBufAllocCount); + atomic_inc(&totBufAllocCount); #endif /* CONFIG_CIFS_STATS2 */ - } return ret_buf; } @@ -201,15 +200,13 @@ cifs_small_buf_get(void) albeit slightly larger than necessary and maxbuffersize defaults to this and can not be bigger */ ret_buf = mempool_alloc(cifs_sm_req_poolp, GFP_NOFS); - if (ret_buf) { /* No need to clear memory here, cleared in header assemble */ /* memset(ret_buf, 0, sizeof(struct smb_hdr) + 27);*/ - atomic_inc(&smBufAllocCount); + atomic_inc(&smBufAllocCount); #ifdef CONFIG_CIFS_STATS2 - atomic_inc(&totSmBufAllocCount); + atomic_inc(&totSmBufAllocCount); #endif /* CONFIG_CIFS_STATS2 */ - } return ret_buf; } @@ -492,7 +489,7 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv) CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, &pCifsInode->flags); - queue_work(cifsiod_wq, + queue_work(cifsoplockd_wq, &netfile->oplock_break); netfile->oplock_break_cancelled = false; @@ -745,3 +742,122 @@ parse_DFS_referrals_exit: } return rc; } + +struct cifs_aio_ctx * +cifs_aio_ctx_alloc(void) +{ + struct cifs_aio_ctx *ctx; + + ctx = kzalloc(sizeof(struct cifs_aio_ctx), GFP_KERNEL); + if (!ctx) + return NULL; + + INIT_LIST_HEAD(&ctx->list); + mutex_init(&ctx->aio_mutex); + init_completion(&ctx->done); + kref_init(&ctx->refcount); + return ctx; +} + +void +cifs_aio_ctx_release(struct kref *refcount) +{ + struct cifs_aio_ctx *ctx = container_of(refcount, + struct cifs_aio_ctx, refcount); + + cifsFileInfo_put(ctx->cfile); + kvfree(ctx->bv); + kfree(ctx); +} + +#define CIFS_AIO_KMALLOC_LIMIT (1024 * 1024) + +int +setup_aio_ctx_iter(struct cifs_aio_ctx *ctx, struct iov_iter *iter, int rw) +{ + ssize_t rc; + unsigned int cur_npages; + unsigned int npages = 0; + unsigned int i; + size_t len; + size_t count = iov_iter_count(iter); + unsigned int saved_len; + size_t start; + unsigned int max_pages = iov_iter_npages(iter, INT_MAX); + struct page **pages = NULL; + struct bio_vec *bv = NULL; + + if (iter->type & ITER_KVEC) { + memcpy(&ctx->iter, iter, sizeof(struct iov_iter)); + ctx->len = count; + iov_iter_advance(iter, count); + return 0; + } + + if (max_pages * sizeof(struct bio_vec) <= CIFS_AIO_KMALLOC_LIMIT) + bv = kmalloc_array(max_pages, sizeof(struct bio_vec), + GFP_KERNEL); + + if (!bv) { + bv = vmalloc(max_pages * sizeof(struct bio_vec)); + if (!bv) + return -ENOMEM; + } + + if (max_pages * sizeof(struct page *) <= CIFS_AIO_KMALLOC_LIMIT) + pages = kmalloc_array(max_pages, sizeof(struct page *), + GFP_KERNEL); + + if (!pages) { + pages = vmalloc(max_pages * sizeof(struct page *)); + if (!bv) { + kvfree(bv); + return -ENOMEM; + } + } + + saved_len = count; + + while (count && npages < max_pages) { + rc = iov_iter_get_pages(iter, pages, count, max_pages, &start); + if (rc < 0) { + cifs_dbg(VFS, "couldn't get user pages (rc=%zd)\n", rc); + break; + } + + if (rc > count) { + cifs_dbg(VFS, "get pages rc=%zd more than %zu\n", rc, + count); + break; + } + + iov_iter_advance(iter, rc); + count -= rc; + rc += start; + cur_npages = DIV_ROUND_UP(rc, PAGE_SIZE); + + if (npages + cur_npages > max_pages) { + cifs_dbg(VFS, "out of vec array capacity (%u vs %u)\n", + npages + cur_npages, max_pages); + break; + } + + for (i = 0; i < cur_npages; i++) { + len = rc > PAGE_SIZE ? PAGE_SIZE : rc; + bv[npages + i].bv_page = pages[i]; + bv[npages + i].bv_offset = start; + bv[npages + i].bv_len = len - start; + rc -= len; + start = 0; + } + + npages += cur_npages; + } + + kvfree(pages); + ctx->bv = bv; + ctx->len = saved_len - count; + ctx->npages = npages; + iov_iter_bvec(&ctx->iter, ITER_BVEC | rw, ctx->bv, npages, ctx->len); + return 0; +} diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c index abae6dd2c6b9..cc88f4f0325e 100644 --- a/fs/cifs/netmisc.c +++ b/fs/cifs/netmisc.c @@ -980,10 +980,10 @@ struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, int offset) cifs_dbg(VFS, "illegal hours %d\n", st->Hours); days = sd->Day; month = sd->Month; - if ((days > 31) || (month > 12)) { + if (days < 1 || days > 31 || month < 1 || month > 12) { cifs_dbg(VFS, "illegal date, month %d day: %d\n", month, days); - if (month > 12) - month = 12; + days = clamp(days, 1, 31); + month = clamp(month, 1, 12); } month -= 1; days += total_days_of_prev_months[month]; diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index cc93ba4da9b5..27bc360c7ffd 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -1015,6 +1015,15 @@ cifs_dir_needs_close(struct cifsFileInfo *cfile) return !cfile->srch_inf.endOfSearch && !cfile->invalidHandle; } +static bool +cifs_can_echo(struct TCP_Server_Info *server) +{ + if (server->tcpStatus == CifsGood) + return true; + + return false; +} + struct smb_version_operations smb1_operations = { .send_cancel = send_nt_cancel, .compare_fids = cifs_compare_fids, @@ -1049,6 +1058,7 @@ struct smb_version_operations smb1_operations = { .get_dfs_refer = CIFSGetDFSRefer, .qfs_tcon = cifs_qfs_tcon, .is_path_accessible = cifs_is_path_accessible, + .can_echo = cifs_can_echo, .query_path_info = cifs_query_path_info, .query_file_info = cifs_query_file_info, .get_srv_inum = cifs_get_srv_inum, diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index 1a04b3a5beb1..7b08a1446a7f 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -499,7 +499,7 @@ smb2_tcon_has_lease(struct cifs_tcon *tcon, struct smb2_lease_break *rsp, else cfile->oplock_break_cancelled = true; - queue_work(cifsiod_wq, &cfile->oplock_break); + queue_work(cifsoplockd_wq, &cfile->oplock_break); kfree(lw); return true; } @@ -643,7 +643,8 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, &cinode->flags); spin_unlock(&cfile->file_info_lock); - queue_work(cifsiod_wq, &cfile->oplock_break); + queue_work(cifsoplockd_wq, + &cfile->oplock_break); spin_unlock(&tcon->open_file_lock); spin_unlock(&cifs_tcp_ses_lock); diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 7b12a727947e..c58691834eb2 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -942,6 +942,7 @@ smb3_enum_snapshots(const unsigned int xid, struct cifs_tcon *tcon, } if (snapshot_in.snapshot_array_size < sizeof(struct smb_snapshot_array)) { rc = -ERANGE; + kfree(retbuf); return rc; } @@ -2195,7 +2196,7 @@ receive_encrypted_read(struct TCP_Server_Info *server, struct mid_q_entry **mid) if (rc) goto free_pages; - rc = cifs_discard_remaining_data(server, buf); + rc = cifs_discard_remaining_data(server); if (rc) goto free_pages; @@ -2221,7 +2222,7 @@ free_pages: kfree(pages); return rc; discard_data: - cifs_discard_remaining_data(server, buf); + cifs_discard_remaining_data(server); goto free_pages; } diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 66fa1b941cdf..48ff7703b919 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -33,6 +33,7 @@ #include <linux/vfs.h> #include <linux/task_io_accounting_ops.h> #include <linux/uaccess.h> +#include <linux/uuid.h> #include <linux/pagemap.h> #include <linux/xattr.h> #include "smb2pdu.h" @@ -562,8 +563,10 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) * but for time being this is our only auth choice so doesn't matter. * We just found a server which sets blob length to zero expecting raw. */ - if (blob_length == 0) + if (blob_length == 0) { cifs_dbg(FYI, "missing security blob on negprot\n"); + server->sec_ntlmssp = true; + } rc = cifs_enable_signing(server, ses->sign); if (rc) @@ -630,8 +633,12 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon) } if (rsplen != sizeof(struct validate_negotiate_info_rsp)) { - cifs_dbg(VFS, "invalid size of protocol negotiate response\n"); - return -EIO; + cifs_dbg(VFS, "invalid protocol negotiate response size: %d\n", + rsplen); + + /* relax check since Mac returns max bufsize allowed on ioctl */ + if (rsplen > CIFSMaxBufSize) + return -EIO; } /* check validate negotiate info response matches what we got earlier */ @@ -1171,9 +1178,6 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, else return -EIO; - if (tcon && tcon->bad_network_name) - return -ENOENT; - unc_path = kmalloc(MAX_SHARENAME_LENGTH * 2, GFP_KERNEL); if (unc_path == NULL) return -ENOMEM; @@ -1277,8 +1281,6 @@ tcon_exit: tcon_error_exit: if (rsp->hdr.sync_hdr.Status == STATUS_BAD_NETWORK_NAME) { cifs_dbg(VFS, "BAD_NETWORK_NAME: %s\n", tree); - if (tcon) - tcon->bad_network_name = true; } goto tcon_exit; } @@ -1856,8 +1858,12 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, * than one credit. Windows typically sets this smaller, but for some * ioctls it may be useful to allow server to send more. No point * limiting what the server can send as long as fits in one credit + * Unfortunately - we can not handle more than CIFS_MAX_MSG_SIZE + * (by default, note that it can be overridden to make max larger) + * in responses (except for read responses which can be bigger. + * We may want to bump this limit up */ - req->MaxOutputResponse = cpu_to_le32(0xFF00); /* < 64K uses 1 credit */ + req->MaxOutputResponse = cpu_to_le32(CIFSMaxBufSize); if (is_fsctl) req->Flags = cpu_to_le32(SMB2_0_IOCTL_IS_FSCTL); @@ -2181,6 +2187,9 @@ void smb2_reconnect_server(struct work_struct *work) struct cifs_tcon *tcon, *tcon2; struct list_head tmp_list; int tcon_exist = false; + int rc; + int resched = false; + /* Prevent simultaneous reconnects that can corrupt tcon->rlist list */ mutex_lock(&server->reconnect_mutex); @@ -2208,13 +2217,18 @@ void smb2_reconnect_server(struct work_struct *work) spin_unlock(&cifs_tcp_ses_lock); list_for_each_entry_safe(tcon, tcon2, &tmp_list, rlist) { - if (!smb2_reconnect(SMB2_INTERNAL_CMD, tcon)) + rc = smb2_reconnect(SMB2_INTERNAL_CMD, tcon); + if (!rc) cifs_reopen_persistent_handles(tcon); + else + resched = true; list_del_init(&tcon->rlist); cifs_put_tcon(tcon); } cifs_dbg(FYI, "Reconnecting tcons finished\n"); + if (resched) + queue_delayed_work(cifsiod_wq, &server->reconnect, 2 * HZ); mutex_unlock(&server->reconnect_mutex); /* now we can safely release srv struct */ diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c index 506b67fc93d9..c69ec96e92ac 100644 --- a/fs/cifs/smb2transport.c +++ b/fs/cifs/smb2transport.c @@ -538,23 +538,19 @@ smb2_mid_entry_alloc(const struct smb2_sync_hdr *shdr, } temp = mempool_alloc(cifs_mid_poolp, GFP_NOFS); - if (temp == NULL) - return temp; - else { - memset(temp, 0, sizeof(struct mid_q_entry)); - temp->mid = le64_to_cpu(shdr->MessageId); - temp->pid = current->pid; - temp->command = shdr->Command; /* Always LE */ - temp->when_alloc = jiffies; - temp->server = server; - - /* - * The default is for the mid to be synchronous, so the - * default callback just wakes up the current task. - */ - temp->callback = cifs_wake_up_task; - temp->callback_data = current; - } + memset(temp, 0, sizeof(struct mid_q_entry)); + temp->mid = le64_to_cpu(shdr->MessageId); + temp->pid = current->pid; + temp->command = shdr->Command; /* Always LE */ + temp->when_alloc = jiffies; + temp->server = server; + + /* + * The default is for the mid to be synchronous, so the + * default callback just wakes up the current task. + */ + temp->callback = cifs_wake_up_task; + temp->callback_data = current; atomic_inc(&midCount); temp->mid_state = MID_REQUEST_ALLOCATED; diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index f6e13a977fc8..4d64b5b8fc9c 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -55,26 +55,22 @@ AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server) } temp = mempool_alloc(cifs_mid_poolp, GFP_NOFS); - if (temp == NULL) - return temp; - else { - memset(temp, 0, sizeof(struct mid_q_entry)); - temp->mid = get_mid(smb_buffer); - temp->pid = current->pid; - temp->command = cpu_to_le16(smb_buffer->Command); - cifs_dbg(FYI, "For smb_command %d\n", smb_buffer->Command); + memset(temp, 0, sizeof(struct mid_q_entry)); + temp->mid = get_mid(smb_buffer); + temp->pid = current->pid; + temp->command = cpu_to_le16(smb_buffer->Command); + cifs_dbg(FYI, "For smb_command %d\n", smb_buffer->Command); /* do_gettimeofday(&temp->when_sent);*/ /* easier to use jiffies */ - /* when mid allocated can be before when sent */ - temp->when_alloc = jiffies; - temp->server = server; + /* when mid allocated can be before when sent */ + temp->when_alloc = jiffies; + temp->server = server; - /* - * The default is for the mid to be synchronous, so the - * default callback just wakes up the current task. - */ - temp->callback = cifs_wake_up_task; - temp->callback_data = current; - } + /* + * The default is for the mid to be synchronous, so the + * default callback just wakes up the current task. + */ + temp->callback = cifs_wake_up_task; + temp->callback_data = current; atomic_inc(&midCount); temp->mid_state = MID_REQUEST_ALLOCATED; diff --git a/fs/coda/inode.c b/fs/coda/inode.c index 2dea594da199..6058df380cc0 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -183,10 +183,6 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent) goto unlock_out; } - error = bdi_setup_and_register(&vc->bdi, "coda"); - if (error) - goto unlock_out; - vc->vc_sb = sb; mutex_unlock(&vc->vc_mutex); @@ -197,7 +193,10 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent) sb->s_magic = CODA_SUPER_MAGIC; sb->s_op = &coda_super_operations; sb->s_d_op = &coda_dentry_operations; - sb->s_bdi = &vc->bdi; + + error = super_setup_bdi(sb); + if (error) + goto error; /* get root fid from Venus: this needs the root inode */ error = venus_rootfid(sb, &fid); @@ -228,7 +227,6 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent) error: mutex_lock(&vc->vc_mutex); - bdi_destroy(&vc->bdi); vc->vc_sb = NULL; sb->s_fs_info = NULL; unlock_out: @@ -240,7 +238,6 @@ static void coda_put_super(struct super_block *sb) { struct venus_comm *vcp = coda_vcp(sb); mutex_lock(&vcp->vc_mutex); - bdi_destroy(&vcp->bdi); vcp->vc_sb = NULL; sb->s_fs_info = NULL; mutex_unlock(&vcp->vc_mutex); diff --git a/fs/compat.c b/fs/compat.c index c61b506f5bc9..190b38b39d9e 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -15,555 +15,14 @@ * published by the Free Software Foundation. */ -#include <linux/stddef.h> -#include <linux/kernel.h> -#include <linux/linkage.h> #include <linux/compat.h> -#include <linux/errno.h> -#include <linux/time.h> -#include <linux/cred.h> -#include <linux/fs.h> -#include <linux/fcntl.h> -#include <linux/namei.h> -#include <linux/file.h> -#include <linux/fdtable.h> -#include <linux/vfs.h> -#include <linux/ioctl.h> -#include <linux/init.h> #include <linux/ncp_mount.h> #include <linux/nfs4_mount.h> #include <linux/syscalls.h> -#include <linux/ctype.h> -#include <linux/dirent.h> -#include <linux/fsnotify.h> -#include <linux/highuid.h> -#include <linux/personality.h> -#include <linux/rwsem.h> -#include <linux/tsacct_kern.h> -#include <linux/security.h> -#include <linux/highmem.h> -#include <linux/signal.h> -#include <linux/poll.h> -#include <linux/mm.h> -#include <linux/fs_struct.h> #include <linux/slab.h> -#include <linux/pagemap.h> -#include <linux/aio.h> - #include <linux/uaccess.h> -#include <asm/mmu_context.h> -#include <asm/ioctls.h> #include "internal.h" -/* - * Not all architectures have sys_utime, so implement this in terms - * of sys_utimes. - */ -COMPAT_SYSCALL_DEFINE2(utime, const char __user *, filename, - struct compat_utimbuf __user *, t) -{ - struct timespec tv[2]; - - if (t) { - if (get_user(tv[0].tv_sec, &t->actime) || - get_user(tv[1].tv_sec, &t->modtime)) - return -EFAULT; - tv[0].tv_nsec = 0; - tv[1].tv_nsec = 0; - } - return do_utimes(AT_FDCWD, filename, t ? tv : NULL, 0); -} - -COMPAT_SYSCALL_DEFINE4(utimensat, unsigned int, dfd, const char __user *, filename, struct compat_timespec __user *, t, int, flags) -{ - struct timespec tv[2]; - - if (t) { - if (compat_get_timespec(&tv[0], &t[0]) || - compat_get_timespec(&tv[1], &t[1])) - return -EFAULT; - - if (tv[0].tv_nsec == UTIME_OMIT && tv[1].tv_nsec == UTIME_OMIT) - return 0; - } - return do_utimes(dfd, filename, t ? tv : NULL, flags); -} - -COMPAT_SYSCALL_DEFINE3(futimesat, unsigned int, dfd, const char __user *, filename, struct compat_timeval __user *, t) -{ - struct timespec tv[2]; - - if (t) { - if (get_user(tv[0].tv_sec, &t[0].tv_sec) || - get_user(tv[0].tv_nsec, &t[0].tv_usec) || - get_user(tv[1].tv_sec, &t[1].tv_sec) || - get_user(tv[1].tv_nsec, &t[1].tv_usec)) - return -EFAULT; - if (tv[0].tv_nsec >= 1000000 || tv[0].tv_nsec < 0 || - tv[1].tv_nsec >= 1000000 || tv[1].tv_nsec < 0) - return -EINVAL; - tv[0].tv_nsec *= 1000; - tv[1].tv_nsec *= 1000; - } - return do_utimes(dfd, filename, t ? tv : NULL, 0); -} - -COMPAT_SYSCALL_DEFINE2(utimes, const char __user *, filename, struct compat_timeval __user *, t) -{ - return compat_sys_futimesat(AT_FDCWD, filename, t); -} - -static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf) -{ - struct compat_stat tmp; - - if (!old_valid_dev(stat->dev) || !old_valid_dev(stat->rdev)) - return -EOVERFLOW; - - memset(&tmp, 0, sizeof(tmp)); - tmp.st_dev = old_encode_dev(stat->dev); - tmp.st_ino = stat->ino; - if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino) - return -EOVERFLOW; - tmp.st_mode = stat->mode; - tmp.st_nlink = stat->nlink; - if (tmp.st_nlink != stat->nlink) - return -EOVERFLOW; - SET_UID(tmp.st_uid, from_kuid_munged(current_user_ns(), stat->uid)); - SET_GID(tmp.st_gid, from_kgid_munged(current_user_ns(), stat->gid)); - tmp.st_rdev = old_encode_dev(stat->rdev); - if ((u64) stat->size > MAX_NON_LFS) - return -EOVERFLOW; - tmp.st_size = stat->size; - tmp.st_atime = stat->atime.tv_sec; - tmp.st_atime_nsec = stat->atime.tv_nsec; - tmp.st_mtime = stat->mtime.tv_sec; - tmp.st_mtime_nsec = stat->mtime.tv_nsec; - tmp.st_ctime = stat->ctime.tv_sec; - tmp.st_ctime_nsec = stat->ctime.tv_nsec; - tmp.st_blocks = stat->blocks; - tmp.st_blksize = stat->blksize; - return copy_to_user(ubuf, &tmp, sizeof(tmp)) ? -EFAULT : 0; -} - -COMPAT_SYSCALL_DEFINE2(newstat, const char __user *, filename, - struct compat_stat __user *, statbuf) -{ - struct kstat stat; - int error; - - error = vfs_stat(filename, &stat); - if (error) - return error; - return cp_compat_stat(&stat, statbuf); -} - -COMPAT_SYSCALL_DEFINE2(newlstat, const char __user *, filename, - struct compat_stat __user *, statbuf) -{ - struct kstat stat; - int error; - - error = vfs_lstat(filename, &stat); - if (error) - return error; - return cp_compat_stat(&stat, statbuf); -} - -#ifndef __ARCH_WANT_STAT64 -COMPAT_SYSCALL_DEFINE4(newfstatat, unsigned int, dfd, - const char __user *, filename, - struct compat_stat __user *, statbuf, int, flag) -{ - struct kstat stat; - int error; - - error = vfs_fstatat(dfd, filename, &stat, flag); - if (error) - return error; - return cp_compat_stat(&stat, statbuf); -} -#endif - -COMPAT_SYSCALL_DEFINE2(newfstat, unsigned int, fd, - struct compat_stat __user *, statbuf) -{ - struct kstat stat; - int error = vfs_fstat(fd, &stat); - - if (!error) - error = cp_compat_stat(&stat, statbuf); - return error; -} - -static int put_compat_statfs(struct compat_statfs __user *ubuf, struct kstatfs *kbuf) -{ - - if (sizeof ubuf->f_blocks == 4) { - if ((kbuf->f_blocks | kbuf->f_bfree | kbuf->f_bavail | - kbuf->f_bsize | kbuf->f_frsize) & 0xffffffff00000000ULL) - return -EOVERFLOW; - /* f_files and f_ffree may be -1; it's okay - * to stuff that into 32 bits */ - if (kbuf->f_files != 0xffffffffffffffffULL - && (kbuf->f_files & 0xffffffff00000000ULL)) - return -EOVERFLOW; - if (kbuf->f_ffree != 0xffffffffffffffffULL - && (kbuf->f_ffree & 0xffffffff00000000ULL)) - return -EOVERFLOW; - } - if (!access_ok(VERIFY_WRITE, ubuf, sizeof(*ubuf)) || - __put_user(kbuf->f_type, &ubuf->f_type) || - __put_user(kbuf->f_bsize, &ubuf->f_bsize) || - __put_user(kbuf->f_blocks, &ubuf->f_blocks) || - __put_user(kbuf->f_bfree, &ubuf->f_bfree) || - __put_user(kbuf->f_bavail, &ubuf->f_bavail) || - __put_user(kbuf->f_files, &ubuf->f_files) || - __put_user(kbuf->f_ffree, &ubuf->f_ffree) || - __put_user(kbuf->f_namelen, &ubuf->f_namelen) || - __put_user(kbuf->f_fsid.val[0], &ubuf->f_fsid.val[0]) || - __put_user(kbuf->f_fsid.val[1], &ubuf->f_fsid.val[1]) || - __put_user(kbuf->f_frsize, &ubuf->f_frsize) || - __put_user(kbuf->f_flags, &ubuf->f_flags) || - __clear_user(ubuf->f_spare, sizeof(ubuf->f_spare))) - return -EFAULT; - return 0; -} - -/* - * The following statfs calls are copies of code from fs/statfs.c and - * should be checked against those from time to time - */ -COMPAT_SYSCALL_DEFINE2(statfs, const char __user *, pathname, struct compat_statfs __user *, buf) -{ - struct kstatfs tmp; - int error = user_statfs(pathname, &tmp); - if (!error) - error = put_compat_statfs(buf, &tmp); - return error; -} - -COMPAT_SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct compat_statfs __user *, buf) -{ - struct kstatfs tmp; - int error = fd_statfs(fd, &tmp); - if (!error) - error = put_compat_statfs(buf, &tmp); - return error; -} - -static int put_compat_statfs64(struct compat_statfs64 __user *ubuf, struct kstatfs *kbuf) -{ - if (sizeof(ubuf->f_bsize) == 4) { - if ((kbuf->f_type | kbuf->f_bsize | kbuf->f_namelen | - kbuf->f_frsize | kbuf->f_flags) & 0xffffffff00000000ULL) - return -EOVERFLOW; - /* f_files and f_ffree may be -1; it's okay - * to stuff that into 32 bits */ - if (kbuf->f_files != 0xffffffffffffffffULL - && (kbuf->f_files & 0xffffffff00000000ULL)) - return -EOVERFLOW; - if (kbuf->f_ffree != 0xffffffffffffffffULL - && (kbuf->f_ffree & 0xffffffff00000000ULL)) - return -EOVERFLOW; - } - if (!access_ok(VERIFY_WRITE, ubuf, sizeof(*ubuf)) || - __put_user(kbuf->f_type, &ubuf->f_type) || - __put_user(kbuf->f_bsize, &ubuf->f_bsize) || - __put_user(kbuf->f_blocks, &ubuf->f_blocks) || - __put_user(kbuf->f_bfree, &ubuf->f_bfree) || - __put_user(kbuf->f_bavail, &ubuf->f_bavail) || - __put_user(kbuf->f_files, &ubuf->f_files) || - __put_user(kbuf->f_ffree, &ubuf->f_ffree) || - __put_user(kbuf->f_namelen, &ubuf->f_namelen) || - __put_user(kbuf->f_fsid.val[0], &ubuf->f_fsid.val[0]) || - __put_user(kbuf->f_fsid.val[1], &ubuf->f_fsid.val[1]) || - __put_user(kbuf->f_frsize, &ubuf->f_frsize) || - __put_user(kbuf->f_flags, &ubuf->f_flags) || - __clear_user(ubuf->f_spare, sizeof(ubuf->f_spare))) - return -EFAULT; - return 0; -} - -COMPAT_SYSCALL_DEFINE3(statfs64, const char __user *, pathname, compat_size_t, sz, struct compat_statfs64 __user *, buf) -{ - struct kstatfs tmp; - int error; - - if (sz != sizeof(*buf)) - return -EINVAL; - - error = user_statfs(pathname, &tmp); - if (!error) - error = put_compat_statfs64(buf, &tmp); - return error; -} - -COMPAT_SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, compat_size_t, sz, struct compat_statfs64 __user *, buf) -{ - struct kstatfs tmp; - int error; - - if (sz != sizeof(*buf)) - return -EINVAL; - - error = fd_statfs(fd, &tmp); - if (!error) - error = put_compat_statfs64(buf, &tmp); - return error; -} - -/* - * This is a copy of sys_ustat, just dealing with a structure layout. - * Given how simple this syscall is that apporach is more maintainable - * than the various conversion hacks. - */ -COMPAT_SYSCALL_DEFINE2(ustat, unsigned, dev, struct compat_ustat __user *, u) -{ - struct compat_ustat tmp; - struct kstatfs sbuf; - int err = vfs_ustat(new_decode_dev(dev), &sbuf); - if (err) - return err; - - memset(&tmp, 0, sizeof(struct compat_ustat)); - tmp.f_tfree = sbuf.f_bfree; - tmp.f_tinode = sbuf.f_ffree; - if (copy_to_user(u, &tmp, sizeof(struct compat_ustat))) - return -EFAULT; - return 0; -} - -static int get_compat_flock(struct flock *kfl, struct compat_flock __user *ufl) -{ - if (!access_ok(VERIFY_READ, ufl, sizeof(*ufl)) || - __get_user(kfl->l_type, &ufl->l_type) || - __get_user(kfl->l_whence, &ufl->l_whence) || - __get_user(kfl->l_start, &ufl->l_start) || - __get_user(kfl->l_len, &ufl->l_len) || - __get_user(kfl->l_pid, &ufl->l_pid)) - return -EFAULT; - return 0; -} - -static int put_compat_flock(struct flock *kfl, struct compat_flock __user *ufl) -{ - if (!access_ok(VERIFY_WRITE, ufl, sizeof(*ufl)) || - __put_user(kfl->l_type, &ufl->l_type) || - __put_user(kfl->l_whence, &ufl->l_whence) || - __put_user(kfl->l_start, &ufl->l_start) || - __put_user(kfl->l_len, &ufl->l_len) || - __put_user(kfl->l_pid, &ufl->l_pid)) - return -EFAULT; - return 0; -} - -#ifndef HAVE_ARCH_GET_COMPAT_FLOCK64 -static int get_compat_flock64(struct flock *kfl, struct compat_flock64 __user *ufl) -{ - if (!access_ok(VERIFY_READ, ufl, sizeof(*ufl)) || - __get_user(kfl->l_type, &ufl->l_type) || - __get_user(kfl->l_whence, &ufl->l_whence) || - __get_user(kfl->l_start, &ufl->l_start) || - __get_user(kfl->l_len, &ufl->l_len) || - __get_user(kfl->l_pid, &ufl->l_pid)) - return -EFAULT; - return 0; -} -#endif - -#ifndef HAVE_ARCH_PUT_COMPAT_FLOCK64 -static int put_compat_flock64(struct flock *kfl, struct compat_flock64 __user *ufl) -{ - if (!access_ok(VERIFY_WRITE, ufl, sizeof(*ufl)) || - __put_user(kfl->l_type, &ufl->l_type) || - __put_user(kfl->l_whence, &ufl->l_whence) || - __put_user(kfl->l_start, &ufl->l_start) || - __put_user(kfl->l_len, &ufl->l_len) || - __put_user(kfl->l_pid, &ufl->l_pid)) - return -EFAULT; - return 0; -} -#endif - -static unsigned int -convert_fcntl_cmd(unsigned int cmd) -{ - switch (cmd) { - case F_GETLK64: - return F_GETLK; - case F_SETLK64: - return F_SETLK; - case F_SETLKW64: - return F_SETLKW; - } - - return cmd; -} - -COMPAT_SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd, - compat_ulong_t, arg) -{ - mm_segment_t old_fs; - struct flock f; - long ret; - unsigned int conv_cmd; - - switch (cmd) { - case F_GETLK: - case F_SETLK: - case F_SETLKW: - ret = get_compat_flock(&f, compat_ptr(arg)); - if (ret != 0) - break; - old_fs = get_fs(); - set_fs(KERNEL_DS); - ret = sys_fcntl(fd, cmd, (unsigned long)&f); - set_fs(old_fs); - if (cmd == F_GETLK && ret == 0) { - /* GETLK was successful and we need to return the data... - * but it needs to fit in the compat structure. - * l_start shouldn't be too big, unless the original - * start + end is greater than COMPAT_OFF_T_MAX, in which - * case the app was asking for trouble, so we return - * -EOVERFLOW in that case. - * l_len could be too big, in which case we just truncate it, - * and only allow the app to see that part of the conflicting - * lock that might make sense to it anyway - */ - - if (f.l_start > COMPAT_OFF_T_MAX) - ret = -EOVERFLOW; - if (f.l_len > COMPAT_OFF_T_MAX) - f.l_len = COMPAT_OFF_T_MAX; - if (ret == 0) - ret = put_compat_flock(&f, compat_ptr(arg)); - } - break; - - case F_GETLK64: - case F_SETLK64: - case F_SETLKW64: - case F_OFD_GETLK: - case F_OFD_SETLK: - case F_OFD_SETLKW: - ret = get_compat_flock64(&f, compat_ptr(arg)); - if (ret != 0) - break; - old_fs = get_fs(); - set_fs(KERNEL_DS); - conv_cmd = convert_fcntl_cmd(cmd); - ret = sys_fcntl(fd, conv_cmd, (unsigned long)&f); - set_fs(old_fs); - if ((conv_cmd == F_GETLK || conv_cmd == F_OFD_GETLK) && ret == 0) { - /* need to return lock information - see above for commentary */ - if (f.l_start > COMPAT_LOFF_T_MAX) - ret = -EOVERFLOW; - if (f.l_len > COMPAT_LOFF_T_MAX) - f.l_len = COMPAT_LOFF_T_MAX; - if (ret == 0) - ret = put_compat_flock64(&f, compat_ptr(arg)); - } - break; - - default: - ret = sys_fcntl(fd, cmd, arg); - break; - } - return ret; -} - -COMPAT_SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, - compat_ulong_t, arg) -{ - switch (cmd) { - case F_GETLK64: - case F_SETLK64: - case F_SETLKW64: - case F_OFD_GETLK: - case F_OFD_SETLK: - case F_OFD_SETLKW: - return -EINVAL; - } - return compat_sys_fcntl64(fd, cmd, arg); -} - -/* A write operation does a read from user space and vice versa */ -#define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ) - -ssize_t compat_rw_copy_check_uvector(int type, - const struct compat_iovec __user *uvector, unsigned long nr_segs, - unsigned long fast_segs, struct iovec *fast_pointer, - struct iovec **ret_pointer) -{ - compat_ssize_t tot_len; - struct iovec *iov = *ret_pointer = fast_pointer; - ssize_t ret = 0; - int seg; - - /* - * SuS says "The readv() function *may* fail if the iovcnt argument - * was less than or equal to 0, or greater than {IOV_MAX}. Linux has - * traditionally returned zero for zero segments, so... - */ - if (nr_segs == 0) - goto out; - - ret = -EINVAL; - if (nr_segs > UIO_MAXIOV) - goto out; - if (nr_segs > fast_segs) { - ret = -ENOMEM; - iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL); - if (iov == NULL) - goto out; - } - *ret_pointer = iov; - - ret = -EFAULT; - if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector))) - goto out; - - /* - * Single unix specification: - * We should -EINVAL if an element length is not >= 0 and fitting an - * ssize_t. - * - * In Linux, the total length is limited to MAX_RW_COUNT, there is - * no overflow possibility. - */ - tot_len = 0; - ret = -EINVAL; - for (seg = 0; seg < nr_segs; seg++) { - compat_uptr_t buf; - compat_ssize_t len; - - if (__get_user(len, &uvector->iov_len) || - __get_user(buf, &uvector->iov_base)) { - ret = -EFAULT; - goto out; - } - if (len < 0) /* size_t not fitting in compat_ssize_t .. */ - goto out; - if (type >= 0 && - !access_ok(vrfy_dir(type), compat_ptr(buf), len)) { - ret = -EFAULT; - goto out; - } - if (len > MAX_RW_COUNT - tot_len) - len = MAX_RW_COUNT - tot_len; - tot_len += len; - iov->iov_base = compat_ptr(buf); - iov->iov_len = (compat_size_t) len; - uvector++; - iov++; - } - ret = tot_len; - -out: - return ret; -} - struct compat_ncp_mount_data { compat_int_t version; compat_uint_t ncp_fd; @@ -744,653 +203,3 @@ COMPAT_SYSCALL_DEFINE5(mount, const char __user *, dev_name, out: return retval; } - -struct compat_old_linux_dirent { - compat_ulong_t d_ino; - compat_ulong_t d_offset; - unsigned short d_namlen; - char d_name[1]; -}; - -struct compat_readdir_callback { - struct dir_context ctx; - struct compat_old_linux_dirent __user *dirent; - int result; -}; - -static int compat_fillonedir(struct dir_context *ctx, const char *name, - int namlen, loff_t offset, u64 ino, - unsigned int d_type) -{ - struct compat_readdir_callback *buf = - container_of(ctx, struct compat_readdir_callback, ctx); - struct compat_old_linux_dirent __user *dirent; - compat_ulong_t d_ino; - - if (buf->result) - return -EINVAL; - d_ino = ino; - if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) { - buf->result = -EOVERFLOW; - return -EOVERFLOW; - } - buf->result++; - dirent = buf->dirent; - if (!access_ok(VERIFY_WRITE, dirent, - (unsigned long)(dirent->d_name + namlen + 1) - - (unsigned long)dirent)) - goto efault; - if ( __put_user(d_ino, &dirent->d_ino) || - __put_user(offset, &dirent->d_offset) || - __put_user(namlen, &dirent->d_namlen) || - __copy_to_user(dirent->d_name, name, namlen) || - __put_user(0, dirent->d_name + namlen)) - goto efault; - return 0; -efault: - buf->result = -EFAULT; - return -EFAULT; -} - -COMPAT_SYSCALL_DEFINE3(old_readdir, unsigned int, fd, - struct compat_old_linux_dirent __user *, dirent, unsigned int, count) -{ - int error; - struct fd f = fdget_pos(fd); - struct compat_readdir_callback buf = { - .ctx.actor = compat_fillonedir, - .dirent = dirent - }; - - if (!f.file) - return -EBADF; - - error = iterate_dir(f.file, &buf.ctx); - if (buf.result) - error = buf.result; - - fdput_pos(f); - return error; -} - -struct compat_linux_dirent { - compat_ulong_t d_ino; - compat_ulong_t d_off; - unsigned short d_reclen; - char d_name[1]; -}; - -struct compat_getdents_callback { - struct dir_context ctx; - struct compat_linux_dirent __user *current_dir; - struct compat_linux_dirent __user *previous; - int count; - int error; -}; - -static int compat_filldir(struct dir_context *ctx, const char *name, int namlen, - loff_t offset, u64 ino, unsigned int d_type) -{ - struct compat_linux_dirent __user * dirent; - struct compat_getdents_callback *buf = - container_of(ctx, struct compat_getdents_callback, ctx); - compat_ulong_t d_ino; - int reclen = ALIGN(offsetof(struct compat_linux_dirent, d_name) + - namlen + 2, sizeof(compat_long_t)); - - buf->error = -EINVAL; /* only used if we fail.. */ - if (reclen > buf->count) - return -EINVAL; - d_ino = ino; - if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) { - buf->error = -EOVERFLOW; - return -EOVERFLOW; - } - dirent = buf->previous; - if (dirent) { - if (signal_pending(current)) - return -EINTR; - if (__put_user(offset, &dirent->d_off)) - goto efault; - } - dirent = buf->current_dir; - if (__put_user(d_ino, &dirent->d_ino)) - goto efault; - if (__put_user(reclen, &dirent->d_reclen)) - goto efault; - if (copy_to_user(dirent->d_name, name, namlen)) - goto efault; - if (__put_user(0, dirent->d_name + namlen)) - goto efault; - if (__put_user(d_type, (char __user *) dirent + reclen - 1)) - goto efault; - buf->previous = dirent; - dirent = (void __user *)dirent + reclen; - buf->current_dir = dirent; - buf->count -= reclen; - return 0; -efault: - buf->error = -EFAULT; - return -EFAULT; -} - -COMPAT_SYSCALL_DEFINE3(getdents, unsigned int, fd, - struct compat_linux_dirent __user *, dirent, unsigned int, count) -{ - struct fd f; - struct compat_linux_dirent __user * lastdirent; - struct compat_getdents_callback buf = { - .ctx.actor = compat_filldir, - .current_dir = dirent, - .count = count - }; - int error; - - if (!access_ok(VERIFY_WRITE, dirent, count)) - return -EFAULT; - - f = fdget_pos(fd); - if (!f.file) - return -EBADF; - - error = iterate_dir(f.file, &buf.ctx); - if (error >= 0) - error = buf.error; - lastdirent = buf.previous; - if (lastdirent) { - if (put_user(buf.ctx.pos, &lastdirent->d_off)) - error = -EFAULT; - else - error = count - buf.count; - } - fdput_pos(f); - return error; -} - -#ifdef __ARCH_WANT_COMPAT_SYS_GETDENTS64 - -struct compat_getdents_callback64 { - struct dir_context ctx; - struct linux_dirent64 __user *current_dir; - struct linux_dirent64 __user *previous; - int count; - int error; -}; - -static int compat_filldir64(struct dir_context *ctx, const char *name, - int namlen, loff_t offset, u64 ino, - unsigned int d_type) -{ - struct linux_dirent64 __user *dirent; - struct compat_getdents_callback64 *buf = - container_of(ctx, struct compat_getdents_callback64, ctx); - int reclen = ALIGN(offsetof(struct linux_dirent64, d_name) + namlen + 1, - sizeof(u64)); - u64 off; - - buf->error = -EINVAL; /* only used if we fail.. */ - if (reclen > buf->count) - return -EINVAL; - dirent = buf->previous; - - if (dirent) { - if (signal_pending(current)) - return -EINTR; - if (__put_user_unaligned(offset, &dirent->d_off)) - goto efault; - } - dirent = buf->current_dir; - if (__put_user_unaligned(ino, &dirent->d_ino)) - goto efault; - off = 0; - if (__put_user_unaligned(off, &dirent->d_off)) - goto efault; - if (__put_user(reclen, &dirent->d_reclen)) - goto efault; - if (__put_user(d_type, &dirent->d_type)) - goto efault; - if (copy_to_user(dirent->d_name, name, namlen)) - goto efault; - if (__put_user(0, dirent->d_name + namlen)) - goto efault; - buf->previous = dirent; - dirent = (void __user *)dirent + reclen; - buf->current_dir = dirent; - buf->count -= reclen; - return 0; -efault: - buf->error = -EFAULT; - return -EFAULT; -} - -COMPAT_SYSCALL_DEFINE3(getdents64, unsigned int, fd, - struct linux_dirent64 __user *, dirent, unsigned int, count) -{ - struct fd f; - struct linux_dirent64 __user * lastdirent; - struct compat_getdents_callback64 buf = { - .ctx.actor = compat_filldir64, - .current_dir = dirent, - .count = count - }; - int error; - - if (!access_ok(VERIFY_WRITE, dirent, count)) - return -EFAULT; - - f = fdget_pos(fd); - if (!f.file) - return -EBADF; - - error = iterate_dir(f.file, &buf.ctx); - if (error >= 0) - error = buf.error; - lastdirent = buf.previous; - if (lastdirent) { - typeof(lastdirent->d_off) d_off = buf.ctx.pos; - if (__put_user_unaligned(d_off, &lastdirent->d_off)) - error = -EFAULT; - else - error = count - buf.count; - } - fdput_pos(f); - return error; -} -#endif /* __ARCH_WANT_COMPAT_SYS_GETDENTS64 */ - -/* - * Exactly like fs/open.c:sys_open(), except that it doesn't set the - * O_LARGEFILE flag. - */ -COMPAT_SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode) -{ - return do_sys_open(AT_FDCWD, filename, flags, mode); -} - -/* - * Exactly like fs/open.c:sys_openat(), except that it doesn't set the - * O_LARGEFILE flag. - */ -COMPAT_SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags, umode_t, mode) -{ - return do_sys_open(dfd, filename, flags, mode); -} - -#define __COMPAT_NFDBITS (8 * sizeof(compat_ulong_t)) - -static int poll_select_copy_remaining(struct timespec *end_time, void __user *p, - int timeval, int ret) -{ - struct timespec ts; - - if (!p) - return ret; - - if (current->personality & STICKY_TIMEOUTS) - goto sticky; - - /* No update for zero timeout */ - if (!end_time->tv_sec && !end_time->tv_nsec) - return ret; - - ktime_get_ts(&ts); - ts = timespec_sub(*end_time, ts); - if (ts.tv_sec < 0) - ts.tv_sec = ts.tv_nsec = 0; - - if (timeval) { - struct compat_timeval rtv; - - rtv.tv_sec = ts.tv_sec; - rtv.tv_usec = ts.tv_nsec / NSEC_PER_USEC; - - if (!copy_to_user(p, &rtv, sizeof(rtv))) - return ret; - } else { - struct compat_timespec rts; - - rts.tv_sec = ts.tv_sec; - rts.tv_nsec = ts.tv_nsec; - - if (!copy_to_user(p, &rts, sizeof(rts))) - return ret; - } - /* - * If an application puts its timeval in read-only memory, we - * don't want the Linux-specific update to the timeval to - * cause a fault after the select has completed - * successfully. However, because we're not updating the - * timeval, we can't restart the system call. - */ - -sticky: - if (ret == -ERESTARTNOHAND) - ret = -EINTR; - return ret; -} - -/* - * Ooo, nasty. We need here to frob 32-bit unsigned longs to - * 64-bit unsigned longs. - */ -static -int compat_get_fd_set(unsigned long nr, compat_ulong_t __user *ufdset, - unsigned long *fdset) -{ - nr = DIV_ROUND_UP(nr, __COMPAT_NFDBITS); - if (ufdset) { - unsigned long odd; - - if (!access_ok(VERIFY_WRITE, ufdset, nr*sizeof(compat_ulong_t))) - return -EFAULT; - - odd = nr & 1UL; - nr &= ~1UL; - while (nr) { - unsigned long h, l; - if (__get_user(l, ufdset) || __get_user(h, ufdset+1)) - return -EFAULT; - ufdset += 2; - *fdset++ = h << 32 | l; - nr -= 2; - } - if (odd && __get_user(*fdset, ufdset)) - return -EFAULT; - } else { - /* Tricky, must clear full unsigned long in the - * kernel fdset at the end, this makes sure that - * actually happens. - */ - memset(fdset, 0, ((nr + 1) & ~1)*sizeof(compat_ulong_t)); - } - return 0; -} - -static -int compat_set_fd_set(unsigned long nr, compat_ulong_t __user *ufdset, - unsigned long *fdset) -{ - unsigned long odd; - nr = DIV_ROUND_UP(nr, __COMPAT_NFDBITS); - - if (!ufdset) - return 0; - - odd = nr & 1UL; - nr &= ~1UL; - while (nr) { - unsigned long h, l; - l = *fdset++; - h = l >> 32; - if (__put_user(l, ufdset) || __put_user(h, ufdset+1)) - return -EFAULT; - ufdset += 2; - nr -= 2; - } - if (odd && __put_user(*fdset, ufdset)) - return -EFAULT; - return 0; -} - - -/* - * This is a virtual copy of sys_select from fs/select.c and probably - * should be compared to it from time to time - */ - -/* - * We can actually return ERESTARTSYS instead of EINTR, but I'd - * like to be certain this leads to no problems. So I return - * EINTR just for safety. - * - * Update: ERESTARTSYS breaks at least the xview clock binary, so - * I'm trying ERESTARTNOHAND which restart only when you want to. - */ -int compat_core_sys_select(int n, compat_ulong_t __user *inp, - compat_ulong_t __user *outp, compat_ulong_t __user *exp, - struct timespec *end_time) -{ - fd_set_bits fds; - void *bits; - int size, max_fds, ret = -EINVAL; - struct fdtable *fdt; - long stack_fds[SELECT_STACK_ALLOC/sizeof(long)]; - - if (n < 0) - goto out_nofds; - - /* max_fds can increase, so grab it once to avoid race */ - rcu_read_lock(); - fdt = files_fdtable(current->files); - max_fds = fdt->max_fds; - rcu_read_unlock(); - if (n > max_fds) - n = max_fds; - - /* - * We need 6 bitmaps (in/out/ex for both incoming and outgoing), - * since we used fdset we need to allocate memory in units of - * long-words. - */ - size = FDS_BYTES(n); - bits = stack_fds; - if (size > sizeof(stack_fds) / 6) { - bits = kmalloc(6 * size, GFP_KERNEL); - ret = -ENOMEM; - if (!bits) - goto out_nofds; - } - fds.in = (unsigned long *) bits; - fds.out = (unsigned long *) (bits + size); - fds.ex = (unsigned long *) (bits + 2*size); - fds.res_in = (unsigned long *) (bits + 3*size); - fds.res_out = (unsigned long *) (bits + 4*size); - fds.res_ex = (unsigned long *) (bits + 5*size); - - if ((ret = compat_get_fd_set(n, inp, fds.in)) || - (ret = compat_get_fd_set(n, outp, fds.out)) || - (ret = compat_get_fd_set(n, exp, fds.ex))) - goto out; - zero_fd_set(n, fds.res_in); - zero_fd_set(n, fds.res_out); - zero_fd_set(n, fds.res_ex); - - ret = do_select(n, &fds, end_time); - - if (ret < 0) - goto out; - if (!ret) { - ret = -ERESTARTNOHAND; - if (signal_pending(current)) - goto out; - ret = 0; - } - - if (compat_set_fd_set(n, inp, fds.res_in) || - compat_set_fd_set(n, outp, fds.res_out) || - compat_set_fd_set(n, exp, fds.res_ex)) - ret = -EFAULT; -out: - if (bits != stack_fds) - kfree(bits); -out_nofds: - return ret; -} - -COMPAT_SYSCALL_DEFINE5(select, int, n, compat_ulong_t __user *, inp, - compat_ulong_t __user *, outp, compat_ulong_t __user *, exp, - struct compat_timeval __user *, tvp) -{ - struct timespec end_time, *to = NULL; - struct compat_timeval tv; - int ret; - - if (tvp) { - if (copy_from_user(&tv, tvp, sizeof(tv))) - return -EFAULT; - - to = &end_time; - if (poll_select_set_timeout(to, - tv.tv_sec + (tv.tv_usec / USEC_PER_SEC), - (tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC)) - return -EINVAL; - } - - ret = compat_core_sys_select(n, inp, outp, exp, to); - ret = poll_select_copy_remaining(&end_time, tvp, 1, ret); - - return ret; -} - -struct compat_sel_arg_struct { - compat_ulong_t n; - compat_uptr_t inp; - compat_uptr_t outp; - compat_uptr_t exp; - compat_uptr_t tvp; -}; - -COMPAT_SYSCALL_DEFINE1(old_select, struct compat_sel_arg_struct __user *, arg) -{ - struct compat_sel_arg_struct a; - - if (copy_from_user(&a, arg, sizeof(a))) - return -EFAULT; - return compat_sys_select(a.n, compat_ptr(a.inp), compat_ptr(a.outp), - compat_ptr(a.exp), compat_ptr(a.tvp)); -} - -static long do_compat_pselect(int n, compat_ulong_t __user *inp, - compat_ulong_t __user *outp, compat_ulong_t __user *exp, - struct compat_timespec __user *tsp, compat_sigset_t __user *sigmask, - compat_size_t sigsetsize) -{ - compat_sigset_t ss32; - sigset_t ksigmask, sigsaved; - struct compat_timespec ts; - struct timespec end_time, *to = NULL; - int ret; - - if (tsp) { - if (copy_from_user(&ts, tsp, sizeof(ts))) - return -EFAULT; - - to = &end_time; - if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) - return -EINVAL; - } - - if (sigmask) { - if (sigsetsize != sizeof(compat_sigset_t)) - return -EINVAL; - if (copy_from_user(&ss32, sigmask, sizeof(ss32))) - return -EFAULT; - sigset_from_compat(&ksigmask, &ss32); - - sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP)); - sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); - } - - ret = compat_core_sys_select(n, inp, outp, exp, to); - ret = poll_select_copy_remaining(&end_time, tsp, 0, ret); - - if (ret == -ERESTARTNOHAND) { - /* - * Don't restore the signal mask yet. Let do_signal() deliver - * the signal on the way back to userspace, before the signal - * mask is restored. - */ - if (sigmask) { - memcpy(¤t->saved_sigmask, &sigsaved, - sizeof(sigsaved)); - set_restore_sigmask(); - } - } else if (sigmask) - sigprocmask(SIG_SETMASK, &sigsaved, NULL); - - return ret; -} - -COMPAT_SYSCALL_DEFINE6(pselect6, int, n, compat_ulong_t __user *, inp, - compat_ulong_t __user *, outp, compat_ulong_t __user *, exp, - struct compat_timespec __user *, tsp, void __user *, sig) -{ - compat_size_t sigsetsize = 0; - compat_uptr_t up = 0; - - if (sig) { - if (!access_ok(VERIFY_READ, sig, - sizeof(compat_uptr_t)+sizeof(compat_size_t)) || - __get_user(up, (compat_uptr_t __user *)sig) || - __get_user(sigsetsize, - (compat_size_t __user *)(sig+sizeof(up)))) - return -EFAULT; - } - return do_compat_pselect(n, inp, outp, exp, tsp, compat_ptr(up), - sigsetsize); -} - -COMPAT_SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, - unsigned int, nfds, struct compat_timespec __user *, tsp, - const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize) -{ - compat_sigset_t ss32; - sigset_t ksigmask, sigsaved; - struct compat_timespec ts; - struct timespec end_time, *to = NULL; - int ret; - - if (tsp) { - if (copy_from_user(&ts, tsp, sizeof(ts))) - return -EFAULT; - - to = &end_time; - if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) - return -EINVAL; - } - - if (sigmask) { - if (sigsetsize != sizeof(compat_sigset_t)) - return -EINVAL; - if (copy_from_user(&ss32, sigmask, sizeof(ss32))) - return -EFAULT; - sigset_from_compat(&ksigmask, &ss32); - - sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP)); - sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); - } - - ret = do_sys_poll(ufds, nfds, to); - - /* We can restart this syscall, usually */ - if (ret == -EINTR) { - /* - * Don't restore the signal mask yet. Let do_signal() deliver - * the signal on the way back to userspace, before the signal - * mask is restored. - */ - if (sigmask) { - memcpy(¤t->saved_sigmask, &sigsaved, - sizeof(sigsaved)); - set_restore_sigmask(); - } - ret = -ERESTARTNOHAND; - } else if (sigmask) - sigprocmask(SIG_SETMASK, &sigsaved, NULL); - - ret = poll_select_copy_remaining(&end_time, tsp, 0, ret); - - return ret; -} - -#ifdef CONFIG_FHANDLE -/* - * Exactly like fs/open.c:sys_open_by_handle_at(), except that it - * doesn't set the O_LARGEFILE flag. - */ -COMPAT_SYSCALL_DEFINE3(open_by_handle_at, int, mountdirfd, - struct file_handle __user *, handle, int, flags) -{ - return do_handle_open(mountdirfd, handle, flags); -} -#endif @@ -55,32 +55,6 @@ static int __init init_dax_wait_table(void) } fs_initcall(init_dax_wait_table); -static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax) -{ - struct request_queue *q = bdev->bd_queue; - long rc = -EIO; - - dax->addr = ERR_PTR(-EIO); - if (blk_queue_enter(q, true) != 0) - return rc; - - rc = bdev_direct_access(bdev, dax); - if (rc < 0) { - dax->addr = ERR_PTR(rc); - blk_queue_exit(q); - return rc; - } - return rc; -} - -static void dax_unmap_atomic(struct block_device *bdev, - const struct blk_dax_ctl *dax) -{ - if (IS_ERR(dax->addr)) - return; - blk_queue_exit(bdev->bd_queue); -} - static int dax_is_pmd_entry(void *entry) { return (unsigned long)entry & RADIX_DAX_PMD; @@ -101,26 +75,6 @@ static int dax_is_empty_entry(void *entry) return (unsigned long)entry & RADIX_DAX_EMPTY; } -struct page *read_dax_sector(struct block_device *bdev, sector_t n) -{ - struct page *page = alloc_pages(GFP_KERNEL, 0); - struct blk_dax_ctl dax = { - .size = PAGE_SIZE, - .sector = n & ~((((int) PAGE_SIZE) / 512) - 1), - }; - long rc; - - if (!page) - return ERR_PTR(-ENOMEM); - - rc = dax_map_atomic(bdev, &dax); - if (rc < 0) - return ERR_PTR(rc); - memcpy_from_pmem(page_address(page), dax.addr, PAGE_SIZE); - dax_unmap_atomic(bdev, &dax); - return page; -} - /* * DAX radix tree locking */ @@ -582,21 +536,30 @@ static int dax_load_hole(struct address_space *mapping, void **entry, return ret; } -static int copy_user_dax(struct block_device *bdev, sector_t sector, size_t size, - struct page *to, unsigned long vaddr) +static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev, + sector_t sector, size_t size, struct page *to, + unsigned long vaddr) { - struct blk_dax_ctl dax = { - .sector = sector, - .size = size, - }; - void *vto; - - if (dax_map_atomic(bdev, &dax) < 0) - return PTR_ERR(dax.addr); + void *vto, *kaddr; + pgoff_t pgoff; + pfn_t pfn; + long rc; + int id; + + rc = bdev_dax_pgoff(bdev, sector, size, &pgoff); + if (rc) + return rc; + + id = dax_read_lock(); + rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn); + if (rc < 0) { + dax_read_unlock(id); + return rc; + } vto = kmap_atomic(to); - copy_user_page(vto, (void __force *)dax.addr, vaddr, to); + copy_user_page(vto, (void __force *)kaddr, vaddr, to); kunmap_atomic(vto); - dax_unmap_atomic(bdev, &dax); + dax_read_unlock(id); return 0; } @@ -764,12 +727,16 @@ unlock_pte: } static int dax_writeback_one(struct block_device *bdev, - struct address_space *mapping, pgoff_t index, void *entry) + struct dax_device *dax_dev, struct address_space *mapping, + pgoff_t index, void *entry) { struct radix_tree_root *page_tree = &mapping->page_tree; - struct blk_dax_ctl dax; - void *entry2, **slot; - int ret = 0; + void *entry2, **slot, *kaddr; + long ret = 0, id; + sector_t sector; + pgoff_t pgoff; + size_t size; + pfn_t pfn; /* * A page got tagged dirty in DAX mapping? Something is seriously @@ -818,26 +785,29 @@ static int dax_writeback_one(struct block_device *bdev, * 'entry'. This allows us to flush for PMD_SIZE and not have to * worry about partial PMD writebacks. */ - dax.sector = dax_radix_sector(entry); - dax.size = PAGE_SIZE << dax_radix_order(entry); + sector = dax_radix_sector(entry); + size = PAGE_SIZE << dax_radix_order(entry); + + id = dax_read_lock(); + ret = bdev_dax_pgoff(bdev, sector, size, &pgoff); + if (ret) + goto dax_unlock; /* - * We cannot hold tree_lock while calling dax_map_atomic() because it - * eventually calls cond_resched(). + * dax_direct_access() may sleep, so cannot hold tree_lock over + * its invocation. */ - ret = dax_map_atomic(bdev, &dax); - if (ret < 0) { - put_locked_mapping_entry(mapping, index, entry); - return ret; - } + ret = dax_direct_access(dax_dev, pgoff, size / PAGE_SIZE, &kaddr, &pfn); + if (ret < 0) + goto dax_unlock; - if (WARN_ON_ONCE(ret < dax.size)) { + if (WARN_ON_ONCE(ret < size / PAGE_SIZE)) { ret = -EIO; - goto unmap; + goto dax_unlock; } - dax_mapping_entry_mkclean(mapping, index, pfn_t_to_pfn(dax.pfn)); - wb_cache_pmem(dax.addr, dax.size); + dax_mapping_entry_mkclean(mapping, index, pfn_t_to_pfn(pfn)); + wb_cache_pmem(kaddr, size); /* * After we have flushed the cache, we can clear the dirty tag. There * cannot be new dirty data in the pfn after the flush has completed as @@ -847,8 +817,8 @@ static int dax_writeback_one(struct block_device *bdev, spin_lock_irq(&mapping->tree_lock); radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_DIRTY); spin_unlock_irq(&mapping->tree_lock); - unmap: - dax_unmap_atomic(bdev, &dax); + dax_unlock: + dax_read_unlock(id); put_locked_mapping_entry(mapping, index, entry); return ret; @@ -869,6 +839,7 @@ int dax_writeback_mapping_range(struct address_space *mapping, struct inode *inode = mapping->host; pgoff_t start_index, end_index; pgoff_t indices[PAGEVEC_SIZE]; + struct dax_device *dax_dev; struct pagevec pvec; bool done = false; int i, ret = 0; @@ -879,6 +850,10 @@ int dax_writeback_mapping_range(struct address_space *mapping, if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL) return 0; + dax_dev = dax_get_by_host(bdev->bd_disk->disk_name); + if (!dax_dev) + return -EIO; + start_index = wbc->range_start >> PAGE_SHIFT; end_index = wbc->range_end >> PAGE_SHIFT; @@ -899,38 +874,49 @@ int dax_writeback_mapping_range(struct address_space *mapping, break; } - ret = dax_writeback_one(bdev, mapping, indices[i], - pvec.pages[i]); - if (ret < 0) + ret = dax_writeback_one(bdev, dax_dev, mapping, + indices[i], pvec.pages[i]); + if (ret < 0) { + put_dax(dax_dev); return ret; + } } } + put_dax(dax_dev); return 0; } EXPORT_SYMBOL_GPL(dax_writeback_mapping_range); static int dax_insert_mapping(struct address_space *mapping, - struct block_device *bdev, sector_t sector, size_t size, - void **entryp, struct vm_area_struct *vma, struct vm_fault *vmf) + struct block_device *bdev, struct dax_device *dax_dev, + sector_t sector, size_t size, void **entryp, + struct vm_area_struct *vma, struct vm_fault *vmf) { unsigned long vaddr = vmf->address; - struct blk_dax_ctl dax = { - .sector = sector, - .size = size, - }; - void *ret; void *entry = *entryp; + void *ret, *kaddr; + pgoff_t pgoff; + int id, rc; + pfn_t pfn; - if (dax_map_atomic(bdev, &dax) < 0) - return PTR_ERR(dax.addr); - dax_unmap_atomic(bdev, &dax); + rc = bdev_dax_pgoff(bdev, sector, size, &pgoff); + if (rc) + return rc; - ret = dax_insert_mapping_entry(mapping, vmf, entry, dax.sector, 0); + id = dax_read_lock(); + rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn); + if (rc < 0) { + dax_read_unlock(id); + return rc; + } + dax_read_unlock(id); + + ret = dax_insert_mapping_entry(mapping, vmf, entry, sector, 0); if (IS_ERR(ret)) return PTR_ERR(ret); *entryp = ret; - return vm_insert_mixed(vma, vaddr, dax.pfn); + return vm_insert_mixed(vma, vaddr, pfn); } /** @@ -979,24 +965,34 @@ static bool dax_range_is_aligned(struct block_device *bdev, return true; } -int __dax_zero_page_range(struct block_device *bdev, sector_t sector, - unsigned int offset, unsigned int length) +int __dax_zero_page_range(struct block_device *bdev, + struct dax_device *dax_dev, sector_t sector, + unsigned int offset, unsigned int size) { - struct blk_dax_ctl dax = { - .sector = sector, - .size = PAGE_SIZE, - }; - - if (dax_range_is_aligned(bdev, offset, length)) { - sector_t start_sector = dax.sector + (offset >> 9); + if (dax_range_is_aligned(bdev, offset, size)) { + sector_t start_sector = sector + (offset >> 9); return blkdev_issue_zeroout(bdev, start_sector, - length >> 9, GFP_NOFS, true); + size >> 9, GFP_NOFS, 0); } else { - if (dax_map_atomic(bdev, &dax) < 0) - return PTR_ERR(dax.addr); - clear_pmem(dax.addr + offset, length); - dax_unmap_atomic(bdev, &dax); + pgoff_t pgoff; + long rc, id; + void *kaddr; + pfn_t pfn; + + rc = bdev_dax_pgoff(bdev, sector, size, &pgoff); + if (rc) + return rc; + + id = dax_read_lock(); + rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, + &pfn); + if (rc < 0) { + dax_read_unlock(id); + return rc; + } + clear_pmem(kaddr + offset, size); + dax_read_unlock(id); } return 0; } @@ -1011,9 +1007,12 @@ static loff_t dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, struct iomap *iomap) { + struct block_device *bdev = iomap->bdev; + struct dax_device *dax_dev = iomap->dax_dev; struct iov_iter *iter = data; loff_t end = pos + length, done = 0; ssize_t ret = 0; + int id; if (iov_iter_rw(iter) == READ) { end = min(end, i_size_read(inode)); @@ -1038,34 +1037,42 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, (end - 1) >> PAGE_SHIFT); } + id = dax_read_lock(); while (pos < end) { unsigned offset = pos & (PAGE_SIZE - 1); - struct blk_dax_ctl dax = { 0 }; + const size_t size = ALIGN(length + offset, PAGE_SIZE); + const sector_t sector = dax_iomap_sector(iomap, pos); ssize_t map_len; + pgoff_t pgoff; + void *kaddr; + pfn_t pfn; if (fatal_signal_pending(current)) { ret = -EINTR; break; } - dax.sector = dax_iomap_sector(iomap, pos); - dax.size = (length + offset + PAGE_SIZE - 1) & PAGE_MASK; - map_len = dax_map_atomic(iomap->bdev, &dax); + ret = bdev_dax_pgoff(bdev, sector, size, &pgoff); + if (ret) + break; + + map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), + &kaddr, &pfn); if (map_len < 0) { ret = map_len; break; } - dax.addr += offset; + map_len = PFN_PHYS(map_len); + kaddr += offset; map_len -= offset; if (map_len > end - pos) map_len = end - pos; if (iov_iter_rw(iter) == WRITE) - map_len = copy_from_iter_pmem(dax.addr, map_len, iter); + map_len = copy_from_iter_pmem(kaddr, map_len, iter); else - map_len = copy_to_iter(dax.addr, map_len, iter); - dax_unmap_atomic(iomap->bdev, &dax); + map_len = copy_to_iter(kaddr, map_len, iter); if (map_len <= 0) { ret = map_len ? map_len : -EFAULT; break; @@ -1075,6 +1082,7 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, length -= map_len; done += map_len; } + dax_read_unlock(id); return done ? done : ret; } @@ -1181,8 +1189,8 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, clear_user_highpage(vmf->cow_page, vaddr); break; case IOMAP_MAPPED: - error = copy_user_dax(iomap.bdev, sector, PAGE_SIZE, - vmf->cow_page, vaddr); + error = copy_user_dax(iomap.bdev, iomap.dax_dev, + sector, PAGE_SIZE, vmf->cow_page, vaddr); break; default: WARN_ON_ONCE(1); @@ -1207,8 +1215,8 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, mem_cgroup_count_vm_event(vmf->vma->vm_mm, PGMAJFAULT); major = VM_FAULT_MAJOR; } - error = dax_insert_mapping(mapping, iomap.bdev, sector, - PAGE_SIZE, &entry, vmf->vma, vmf); + error = dax_insert_mapping(mapping, iomap.bdev, iomap.dax_dev, + sector, PAGE_SIZE, &entry, vmf->vma, vmf); /* -EBUSY is fine, somebody else faulted on the same PTE */ if (error == -EBUSY) error = 0; @@ -1258,41 +1266,48 @@ static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap, loff_t pos, void **entryp) { struct address_space *mapping = vmf->vma->vm_file->f_mapping; + const sector_t sector = dax_iomap_sector(iomap, pos); + struct dax_device *dax_dev = iomap->dax_dev; struct block_device *bdev = iomap->bdev; struct inode *inode = mapping->host; - struct blk_dax_ctl dax = { - .sector = dax_iomap_sector(iomap, pos), - .size = PMD_SIZE, - }; - long length = dax_map_atomic(bdev, &dax); - void *ret = NULL; - - if (length < 0) /* dax_map_atomic() failed */ + const size_t size = PMD_SIZE; + void *ret = NULL, *kaddr; + long length = 0; + pgoff_t pgoff; + pfn_t pfn; + int id; + + if (bdev_dax_pgoff(bdev, sector, size, &pgoff) != 0) goto fallback; - if (length < PMD_SIZE) - goto unmap_fallback; - if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR) - goto unmap_fallback; - if (!pfn_t_devmap(dax.pfn)) - goto unmap_fallback; - - dax_unmap_atomic(bdev, &dax); - ret = dax_insert_mapping_entry(mapping, vmf, *entryp, dax.sector, + id = dax_read_lock(); + length = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn); + if (length < 0) + goto unlock_fallback; + length = PFN_PHYS(length); + + if (length < size) + goto unlock_fallback; + if (pfn_t_to_pfn(pfn) & PG_PMD_COLOUR) + goto unlock_fallback; + if (!pfn_t_devmap(pfn)) + goto unlock_fallback; + dax_read_unlock(id); + + ret = dax_insert_mapping_entry(mapping, vmf, *entryp, sector, RADIX_DAX_PMD); if (IS_ERR(ret)) goto fallback; *entryp = ret; - trace_dax_pmd_insert_mapping(inode, vmf, length, dax.pfn, ret); + trace_dax_pmd_insert_mapping(inode, vmf, length, pfn, ret); return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, - dax.pfn, vmf->flags & FAULT_FLAG_WRITE); + pfn, vmf->flags & FAULT_FLAG_WRITE); - unmap_fallback: - dax_unmap_atomic(bdev, &dax); +unlock_fallback: + dax_read_unlock(id); fallback: - trace_dax_pmd_insert_mapping_fallback(inode, vmf, length, - dax.pfn, ret); + trace_dax_pmd_insert_mapping_fallback(inode, vmf, length, pfn, ret); return VM_FAULT_FALLBACK; } diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index 95c1c8d34539..9c351bf757b2 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -349,7 +349,6 @@ struct ecryptfs_mount_crypt_stat { struct ecryptfs_sb_info { struct super_block *wsi_sb; struct ecryptfs_mount_crypt_stat mount_crypt_stat; - struct backing_dev_info bdi; }; /* file private data. */ diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 151872dcc1f4..9014479d0160 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -519,12 +519,11 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags goto out; } - rc = bdi_setup_and_register(&sbi->bdi, "ecryptfs"); + rc = super_setup_bdi(s); if (rc) goto out1; ecryptfs_set_superblock_private(s, sbi); - s->s_bdi = &sbi->bdi; /* ->kill_sb() will take care of sbi after that point */ sbi = NULL; @@ -633,7 +632,6 @@ static void ecryptfs_kill_block_super(struct super_block *sb) if (!sb_info) return; ecryptfs_destroy_mount_crypt_stat(&sb_info->mount_crypt_stat); - bdi_destroy(&sb_info->bdi); kmem_cache_free(ecryptfs_sb_info_cache, sb_info); } diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 341251421ced..5420767c9b68 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -42,6 +42,7 @@ #include <linux/seq_file.h> #include <linux/compat.h> #include <linux/rculist.h> +#include <net/busy_poll.h> /* * LOCKING: @@ -224,6 +225,11 @@ struct eventpoll { /* used to optimize loop detection check */ int visited; struct list_head visited_list_link; + +#ifdef CONFIG_NET_RX_BUSY_POLL + /* used to track busy poll napi_id */ + unsigned int napi_id; +#endif }; /* Wait structure used by the poll hooks */ @@ -384,6 +390,77 @@ static inline int ep_events_available(struct eventpoll *ep) return !list_empty(&ep->rdllist) || ep->ovflist != EP_UNACTIVE_PTR; } +#ifdef CONFIG_NET_RX_BUSY_POLL +static bool ep_busy_loop_end(void *p, unsigned long start_time) +{ + struct eventpoll *ep = p; + + return ep_events_available(ep) || busy_loop_timeout(start_time); +} +#endif /* CONFIG_NET_RX_BUSY_POLL */ + +/* + * Busy poll if globally on and supporting sockets found && no events, + * busy loop will return if need_resched or ep_events_available. + * + * we must do our busy polling with irqs enabled + */ +static void ep_busy_loop(struct eventpoll *ep, int nonblock) +{ +#ifdef CONFIG_NET_RX_BUSY_POLL + unsigned int napi_id = READ_ONCE(ep->napi_id); + + if ((napi_id >= MIN_NAPI_ID) && net_busy_loop_on()) + napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, ep); +#endif +} + +static inline void ep_reset_busy_poll_napi_id(struct eventpoll *ep) +{ +#ifdef CONFIG_NET_RX_BUSY_POLL + if (ep->napi_id) + ep->napi_id = 0; +#endif +} + +/* + * Set epoll busy poll NAPI ID from sk. + */ +static inline void ep_set_busy_poll_napi_id(struct epitem *epi) +{ +#ifdef CONFIG_NET_RX_BUSY_POLL + struct eventpoll *ep; + unsigned int napi_id; + struct socket *sock; + struct sock *sk; + int err; + + if (!net_busy_loop_on()) + return; + + sock = sock_from_file(epi->ffd.file, &err); + if (!sock) + return; + + sk = sock->sk; + if (!sk) + return; + + napi_id = READ_ONCE(sk->sk_napi_id); + ep = epi->ep; + + /* Non-NAPI IDs can be rejected + * or + * Nothing to do if we already have this ID + */ + if (napi_id < MIN_NAPI_ID || napi_id == ep->napi_id) + return; + + /* record NAPI ID for use in next busy poll */ + ep->napi_id = napi_id; +#endif +} + /** * ep_call_nested - Perform a bound (possibly) nested call, by checking * that the recursion limit is not exceeded, and that @@ -1022,6 +1099,8 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k spin_lock_irqsave(&ep->lock, flags); + ep_set_busy_poll_napi_id(epi); + /* * If the event mask does not contain any poll(2) event, we consider the * descriptor to be disabled. This condition is likely the effect of the @@ -1363,6 +1442,9 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, /* We have to drop the new item inside our item list to keep track of it */ spin_lock_irqsave(&ep->lock, flags); + /* record NAPI ID of new item if present */ + ep_set_busy_poll_napi_id(epi); + /* If the file is already "ready" we drop it inside the ready list */ if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) { list_add_tail(&epi->rdllink, &ep->rdllist); @@ -1637,10 +1719,21 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, } fetch_events: + + if (!ep_events_available(ep)) + ep_busy_loop(ep, timed_out); + spin_lock_irqsave(&ep->lock, flags); if (!ep_events_available(ep)) { /* + * Busy poll timed out. Drop NAPI ID for now, we can add + * it back in when we have moved a socket with a valid NAPI + * ID onto the ready list. + */ + ep_reset_busy_poll_napi_id(ep); + + /* * We don't have any available event to return to the caller. * We need to sleep here, and we will be wake up by * ep_poll_callback() when events will become available. diff --git a/fs/exec.c b/fs/exec.c index 65145a3df065..72934df68471 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1320,6 +1320,7 @@ void setup_new_exec(struct linux_binprm * bprm) else set_dumpable(current->mm, suid_dumpable); + arch_setup_new_exec(); perf_event_exec(); __set_task_comm(current, kbasename(bprm->filename), true); diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h index 2e86086bc940..5dc392404559 100644 --- a/fs/exofs/exofs.h +++ b/fs/exofs/exofs.h @@ -64,7 +64,6 @@ struct exofs_dev { * our extension to the in-memory superblock */ struct exofs_sb_info { - struct backing_dev_info bdi; /* register our bdi with VFS */ struct exofs_sb_stats s_ess; /* Written often, pre-allocate*/ int s_timeout; /* timeout for OSD operations */ uint64_t s_nextid; /* highest object ID used */ diff --git a/fs/exofs/super.c b/fs/exofs/super.c index 1076a4233b39..819624cfc8da 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -464,7 +464,6 @@ static void exofs_put_super(struct super_block *sb) sbi->one_comp.obj.partition); exofs_sysfs_sb_del(sbi); - bdi_destroy(&sbi->bdi); exofs_free_sbi(sbi); sb->s_fs_info = NULL; } @@ -809,8 +808,12 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) __sbi_read_stats(sbi); /* set up operation vectors */ - sbi->bdi.ra_pages = __ra_pages(&sbi->layout); - sb->s_bdi = &sbi->bdi; + ret = super_setup_bdi(sb); + if (ret) { + EXOFS_DBGMSG("Failed to super_setup_bdi\n"); + goto free_sbi; + } + sb->s_bdi->ra_pages = __ra_pages(&sbi->layout); sb->s_fs_info = sbi; sb->s_op = &exofs_sops; sb->s_export_op = &exofs_export_ops; @@ -836,14 +839,6 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) goto free_sbi; } - ret = bdi_setup_and_register(&sbi->bdi, "exofs"); - if (ret) { - EXOFS_DBGMSG("Failed to bdi_setup_and_register\n"); - dput(sb->s_root); - sb->s_root = NULL; - goto free_sbi; - } - exofs_sysfs_dbg_print(); _exofs_print_device("Mounting", opts->dev_name, ore_comp_dev(&sbi->oc, 0), diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index 5e64de9c5093..03f5ce1d3dbe 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -779,7 +779,6 @@ extern void ext2_evict_inode(struct inode *); extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int); extern int ext2_setattr (struct dentry *, struct iattr *); extern void ext2_set_inode_flags(struct inode *inode); -extern void ext2_get_inode_flags(struct ext2_inode_info *); extern int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len); @@ -796,7 +795,8 @@ void ext2_error(struct super_block *, const char *, const char *, ...); extern __printf(3, 4) void ext2_msg(struct super_block *, const char *, const char *, ...); extern void ext2_update_dynamic_rev (struct super_block *sb); -extern void ext2_write_super (struct super_block *); +extern void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es, + int wait); /* * Inodes and files operations diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 128cce540645..26d77f9f8c12 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -799,6 +799,7 @@ int ext2_get_block(struct inode *inode, sector_t iblock, static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length, unsigned flags, struct iomap *iomap) { + struct block_device *bdev; unsigned int blkbits = inode->i_blkbits; unsigned long first_block = offset >> blkbits; unsigned long max_blocks = (length + (1 << blkbits) - 1) >> blkbits; @@ -812,8 +813,13 @@ static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length, return ret; iomap->flags = 0; - iomap->bdev = inode->i_sb->s_bdev; + bdev = inode->i_sb->s_bdev; + iomap->bdev = bdev; iomap->offset = (u64)first_block << blkbits; + if (blk_queue_dax(bdev->bd_queue)) + iomap->dax_dev = dax_get_by_host(bdev->bd_disk->disk_name); + else + iomap->dax_dev = NULL; if (ret == 0) { iomap->type = IOMAP_HOLE; @@ -835,6 +841,7 @@ static int ext2_iomap_end(struct inode *inode, loff_t offset, loff_t length, ssize_t written, unsigned flags, struct iomap *iomap) { + put_dax(iomap->dax_dev); if (iomap->type == IOMAP_MAPPED && written < length && (flags & IOMAP_WRITE)) @@ -1384,25 +1391,6 @@ void ext2_set_inode_flags(struct inode *inode) inode->i_flags |= S_DAX; } -/* Propagate flags from i_flags to EXT2_I(inode)->i_flags */ -void ext2_get_inode_flags(struct ext2_inode_info *ei) -{ - unsigned int flags = ei->vfs_inode.i_flags; - - ei->i_flags &= ~(EXT2_SYNC_FL|EXT2_APPEND_FL| - EXT2_IMMUTABLE_FL|EXT2_NOATIME_FL|EXT2_DIRSYNC_FL); - if (flags & S_SYNC) - ei->i_flags |= EXT2_SYNC_FL; - if (flags & S_APPEND) - ei->i_flags |= EXT2_APPEND_FL; - if (flags & S_IMMUTABLE) - ei->i_flags |= EXT2_IMMUTABLE_FL; - if (flags & S_NOATIME) - ei->i_flags |= EXT2_NOATIME_FL; - if (flags & S_DIRSYNC) - ei->i_flags |= EXT2_DIRSYNC_FL; -} - struct inode *ext2_iget (struct super_block *sb, unsigned long ino) { struct ext2_inode_info *ei; @@ -1563,7 +1551,6 @@ static int __ext2_write_inode(struct inode *inode, int do_sync) if (ei->i_state & EXT2_STATE_NEW) memset(raw_inode, 0, EXT2_SB(sb)->s_inode_size); - ext2_get_inode_flags(ei); raw_inode->i_mode = cpu_to_le16(inode->i_mode); if (!(test_opt(sb, NO_UID32))) { raw_inode->i_uid_low = cpu_to_le16(low_16_bits(uid)); @@ -1615,7 +1602,7 @@ static int __ext2_write_inode(struct inode *inode, int do_sync) EXT2_SET_RO_COMPAT_FEATURE(sb, EXT2_FEATURE_RO_COMPAT_LARGE_FILE); spin_unlock(&EXT2_SB(sb)->s_lock); - ext2_write_super(sb); + ext2_sync_super(sb, EXT2_SB(sb)->s_es, 1); } } } diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c index 191e02b28ce8..087f122cca42 100644 --- a/fs/ext2/ioctl.c +++ b/fs/ext2/ioctl.c @@ -29,7 +29,6 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) switch (cmd) { case EXT2_IOC_GETFLAGS: - ext2_get_inode_flags(ei); flags = ei->i_flags & EXT2_FL_USER_VISIBLE; return put_user(flags, (int __user *) arg); case EXT2_IOC_SETFLAGS: { diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 9e25a71fe1a2..8ac673c71a36 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -36,8 +36,7 @@ #include "xattr.h" #include "acl.h" -static void ext2_sync_super(struct super_block *sb, - struct ext2_super_block *es, int wait); +static void ext2_write_super(struct super_block *sb); static int ext2_remount (struct super_block * sb, int * flags, char * data); static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf); static int ext2_sync_fs(struct super_block *sb, int wait); @@ -123,13 +122,29 @@ void ext2_update_dynamic_rev(struct super_block *sb) */ } +#ifdef CONFIG_QUOTA +static int ext2_quota_off(struct super_block *sb, int type); + +static void ext2_quota_off_umount(struct super_block *sb) +{ + int type; + + for (type = 0; type < MAXQUOTAS; type++) + ext2_quota_off(sb, type); +} +#else +static inline void ext2_quota_off_umount(struct super_block *sb) +{ +} +#endif + static void ext2_put_super (struct super_block * sb) { int db_count; int i; struct ext2_sb_info *sbi = EXT2_SB(sb); - dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); + ext2_quota_off_umount(sb); if (sbi->s_mb_cache) { ext2_xattr_destroy_cache(sbi->s_mb_cache); @@ -314,10 +329,23 @@ static int ext2_show_options(struct seq_file *seq, struct dentry *root) #ifdef CONFIG_QUOTA static ssize_t ext2_quota_read(struct super_block *sb, int type, char *data, size_t len, loff_t off); static ssize_t ext2_quota_write(struct super_block *sb, int type, const char *data, size_t len, loff_t off); +static int ext2_quota_on(struct super_block *sb, int type, int format_id, + const struct path *path); static struct dquot **ext2_get_dquots(struct inode *inode) { return EXT2_I(inode)->i_dquot; } + +static const struct quotactl_ops ext2_quotactl_ops = { + .quota_on = ext2_quota_on, + .quota_off = ext2_quota_off, + .quota_sync = dquot_quota_sync, + .get_state = dquot_get_state, + .set_info = dquot_set_dqinfo, + .get_dqblk = dquot_get_dqblk, + .set_dqblk = dquot_set_dqblk, + .get_nextdqblk = dquot_get_next_dqblk, +}; #endif static const struct super_operations ext2_sops = { @@ -1117,7 +1145,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) #ifdef CONFIG_QUOTA sb->dq_op = &dquot_operations; - sb->s_qcop = &dquot_quotactl_ops; + sb->s_qcop = &ext2_quotactl_ops; sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP; #endif @@ -1194,8 +1222,8 @@ static void ext2_clear_super_error(struct super_block *sb) } } -static void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es, - int wait) +void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es, + int wait) { ext2_clear_super_error(sb); spin_lock(&EXT2_SB(sb)->s_lock); @@ -1270,7 +1298,7 @@ static int ext2_unfreeze(struct super_block *sb) return 0; } -void ext2_write_super(struct super_block *sb) +static void ext2_write_super(struct super_block *sb) { if (!(sb->s_flags & MS_RDONLY)) ext2_sync_fs(sb, 1); @@ -1548,6 +1576,51 @@ out: return len - towrite; } +static int ext2_quota_on(struct super_block *sb, int type, int format_id, + const struct path *path) +{ + int err; + struct inode *inode; + + err = dquot_quota_on(sb, type, format_id, path); + if (err) + return err; + + inode = d_inode(path->dentry); + inode_lock(inode); + EXT2_I(inode)->i_flags |= EXT2_NOATIME_FL | EXT2_IMMUTABLE_FL; + inode_set_flags(inode, S_NOATIME | S_IMMUTABLE, + S_NOATIME | S_IMMUTABLE); + inode_unlock(inode); + mark_inode_dirty(inode); + + return 0; +} + +static int ext2_quota_off(struct super_block *sb, int type) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + int err; + + if (!inode || !igrab(inode)) + goto out; + + err = dquot_quota_off(sb, type); + if (err) + goto out_put; + + inode_lock(inode); + EXT2_I(inode)->i_flags &= ~(EXT2_NOATIME_FL | EXT2_IMMUTABLE_FL); + inode_set_flags(inode, 0, S_NOATIME | S_IMMUTABLE); + inode_unlock(inode); + mark_inode_dirty(inode); +out_put: + iput(inode); + return err; +out: + return dquot_quota_off(sb, type); +} + #endif static struct file_system_type ext2_fs_type = { diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 13742d6c83d2..8e8046104f4d 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2476,7 +2476,6 @@ extern int ext4_truncate(struct inode *); extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length); extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks); extern void ext4_set_inode_flags(struct inode *); -extern void ext4_get_inode_flags(struct ext4_inode_info *); extern int ext4_alloc_da_blocks(struct inode *inode); extern void ext4_set_aops(struct inode *inode); extern int ext4_writepage_trans_blocks(struct inode *); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 2a4507deb925..5834c4d76be8 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3340,6 +3340,7 @@ static int ext4_releasepage(struct page *page, gfp_t wait) static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, unsigned flags, struct iomap *iomap) { + struct block_device *bdev; unsigned int blkbits = inode->i_blkbits; unsigned long first_block = offset >> blkbits; unsigned long last_block = (offset + length - 1) >> blkbits; @@ -3408,7 +3409,12 @@ retry: } iomap->flags = 0; - iomap->bdev = inode->i_sb->s_bdev; + bdev = inode->i_sb->s_bdev; + iomap->bdev = bdev; + if (blk_queue_dax(bdev->bd_queue)) + iomap->dax_dev = dax_get_by_host(bdev->bd_disk->disk_name); + else + iomap->dax_dev = NULL; iomap->offset = first_block << blkbits; if (ret == 0) { @@ -3441,6 +3447,7 @@ static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length, int blkbits = inode->i_blkbits; bool truncate = false; + put_dax(iomap->dax_dev); if (!(flags & IOMAP_WRITE) || (flags & IOMAP_FAULT)) return 0; @@ -4537,31 +4544,6 @@ void ext4_set_inode_flags(struct inode *inode) S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX); } -/* Propagate flags from i_flags to EXT4_I(inode)->i_flags */ -void ext4_get_inode_flags(struct ext4_inode_info *ei) -{ - unsigned int vfs_fl; - unsigned long old_fl, new_fl; - - do { - vfs_fl = ei->vfs_inode.i_flags; - old_fl = ei->i_flags; - new_fl = old_fl & ~(EXT4_SYNC_FL|EXT4_APPEND_FL| - EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL| - EXT4_DIRSYNC_FL); - if (vfs_fl & S_SYNC) - new_fl |= EXT4_SYNC_FL; - if (vfs_fl & S_APPEND) - new_fl |= EXT4_APPEND_FL; - if (vfs_fl & S_IMMUTABLE) - new_fl |= EXT4_IMMUTABLE_FL; - if (vfs_fl & S_NOATIME) - new_fl |= EXT4_NOATIME_FL; - if (vfs_fl & S_DIRSYNC) - new_fl |= EXT4_DIRSYNC_FL; - } while (cmpxchg(&ei->i_flags, old_fl, new_fl) != old_fl); -} - static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode, struct ext4_inode_info *ei) { @@ -4998,7 +4980,6 @@ static int ext4_do_update_inode(handle_t *handle, if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); - ext4_get_inode_flags(ei); raw_inode->i_mode = cpu_to_le16(inode->i_mode); i_uid = i_uid_read(inode); i_gid = i_gid_read(inode); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 2779d5f291a4..0c21e22acd74 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -589,7 +589,6 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) case FS_IOC_GETFSMAP: return ext4_ioc_getfsmap(sb, (void __user *)arg); case EXT4_IOC_GETFLAGS: - ext4_get_inode_flags(ei); flags = ei->i_flags & EXT4_FL_USER_VISIBLE; return put_user(flags, (int __user *) arg); case EXT4_IOC_SETFLAGS: { @@ -977,7 +976,6 @@ resizefs_out: struct fsxattr fa; memset(&fa, 0, sizeof(struct fsxattr)); - ext4_get_inode_flags(ei); fa.fsx_xflags = ext4_iflags_to_xflags(ei->i_flags & EXT4_FL_USER_VISIBLE); if (ext4_has_feature_project(inode->i_sb)) { diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 931053cf20d6..96973ee74147 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -840,6 +840,28 @@ static void dump_orphan_list(struct super_block *sb, struct ext4_sb_info *sbi) } } +#ifdef CONFIG_QUOTA +static int ext4_quota_off(struct super_block *sb, int type); + +static inline void ext4_quota_off_umount(struct super_block *sb) +{ + int type; + + if (ext4_has_feature_quota(sb)) { + dquot_disable(sb, -1, + DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); + } else { + /* Use our quota_off function to clear inode flags etc. */ + for (type = 0; type < EXT4_MAXQUOTAS; type++) + ext4_quota_off(sb, type); + } +} +#else +static inline void ext4_quota_off_umount(struct super_block *sb) +{ +} +#endif + static void ext4_put_super(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -848,7 +870,7 @@ static void ext4_put_super(struct super_block *sb) int i, err; ext4_unregister_li_request(sb); - dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); + ext4_quota_off_umount(sb); flush_workqueue(sbi->rsv_conversion_wq); destroy_workqueue(sbi->rsv_conversion_wq); @@ -1219,7 +1241,6 @@ static int ext4_mark_dquot_dirty(struct dquot *dquot); static int ext4_write_info(struct super_block *sb, int type); static int ext4_quota_on(struct super_block *sb, int type, int format_id, const struct path *path); -static int ext4_quota_off(struct super_block *sb, int type); static int ext4_quota_on_mount(struct super_block *sb, int type); static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, size_t len, loff_t off); @@ -5352,11 +5373,33 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id, if (err) return err; } + lockdep_set_quota_inode(path->dentry->d_inode, I_DATA_SEM_QUOTA); err = dquot_quota_on(sb, type, format_id, path); - if (err) + if (err) { lockdep_set_quota_inode(path->dentry->d_inode, I_DATA_SEM_NORMAL); + } else { + struct inode *inode = d_inode(path->dentry); + handle_t *handle; + + /* + * Set inode flags to prevent userspace from messing with quota + * files. If this fails, we return success anyway since quotas + * are already enabled and this is not a hard failure. + */ + inode_lock(inode); + handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 1); + if (IS_ERR(handle)) + goto unlock_inode; + EXT4_I(inode)->i_flags |= EXT4_NOATIME_FL | EXT4_IMMUTABLE_FL; + inode_set_flags(inode, S_NOATIME | S_IMMUTABLE, + S_NOATIME | S_IMMUTABLE); + ext4_mark_inode_dirty(handle, inode); + ext4_journal_stop(handle); + unlock_inode: + inode_unlock(inode); + } return err; } @@ -5430,24 +5473,39 @@ static int ext4_quota_off(struct super_block *sb, int type) { struct inode *inode = sb_dqopt(sb)->files[type]; handle_t *handle; + int err; /* Force all delayed allocation blocks to be allocated. * Caller already holds s_umount sem */ if (test_opt(sb, DELALLOC)) sync_filesystem(sb); - if (!inode) + if (!inode || !igrab(inode)) goto out; - /* Update modification times of quota files when userspace can - * start looking at them */ + err = dquot_quota_off(sb, type); + if (err) + goto out_put; + + inode_lock(inode); + /* + * Update modification times of quota files when userspace can + * start looking at them. If we fail, we return success anyway since + * this is not a hard failure and quotas are already disabled. + */ handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 1); if (IS_ERR(handle)) - goto out; + goto out_unlock; + EXT4_I(inode)->i_flags &= ~(EXT4_NOATIME_FL | EXT4_IMMUTABLE_FL); + inode_set_flags(inode, 0, S_NOATIME | S_IMMUTABLE); inode->i_mtime = inode->i_ctime = current_time(inode); ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); - +out_unlock: + inode_unlock(inode); +out_put: + iput(inode); + return err; out: return dquot_quota_off(sb, type); } diff --git a/fs/fcntl.c b/fs/fcntl.c index be8fbe289087..8bd81c2e89b2 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -23,6 +23,7 @@ #include <linux/pid_namespace.h> #include <linux/user_namespace.h> #include <linux/shmem_fs.h> +#include <linux/compat.h> #include <asm/poll.h> #include <asm/siginfo.h> @@ -420,6 +421,162 @@ out: } #endif +#ifdef CONFIG_COMPAT +static int get_compat_flock(struct flock *kfl, struct compat_flock __user *ufl) +{ + if (!access_ok(VERIFY_READ, ufl, sizeof(*ufl)) || + __get_user(kfl->l_type, &ufl->l_type) || + __get_user(kfl->l_whence, &ufl->l_whence) || + __get_user(kfl->l_start, &ufl->l_start) || + __get_user(kfl->l_len, &ufl->l_len) || + __get_user(kfl->l_pid, &ufl->l_pid)) + return -EFAULT; + return 0; +} + +static int put_compat_flock(struct flock *kfl, struct compat_flock __user *ufl) +{ + if (!access_ok(VERIFY_WRITE, ufl, sizeof(*ufl)) || + __put_user(kfl->l_type, &ufl->l_type) || + __put_user(kfl->l_whence, &ufl->l_whence) || + __put_user(kfl->l_start, &ufl->l_start) || + __put_user(kfl->l_len, &ufl->l_len) || + __put_user(kfl->l_pid, &ufl->l_pid)) + return -EFAULT; + return 0; +} + +#ifndef HAVE_ARCH_GET_COMPAT_FLOCK64 +static int get_compat_flock64(struct flock *kfl, struct compat_flock64 __user *ufl) +{ + if (!access_ok(VERIFY_READ, ufl, sizeof(*ufl)) || + __get_user(kfl->l_type, &ufl->l_type) || + __get_user(kfl->l_whence, &ufl->l_whence) || + __get_user(kfl->l_start, &ufl->l_start) || + __get_user(kfl->l_len, &ufl->l_len) || + __get_user(kfl->l_pid, &ufl->l_pid)) + return -EFAULT; + return 0; +} +#endif + +#ifndef HAVE_ARCH_PUT_COMPAT_FLOCK64 +static int put_compat_flock64(struct flock *kfl, struct compat_flock64 __user *ufl) +{ + if (!access_ok(VERIFY_WRITE, ufl, sizeof(*ufl)) || + __put_user(kfl->l_type, &ufl->l_type) || + __put_user(kfl->l_whence, &ufl->l_whence) || + __put_user(kfl->l_start, &ufl->l_start) || + __put_user(kfl->l_len, &ufl->l_len) || + __put_user(kfl->l_pid, &ufl->l_pid)) + return -EFAULT; + return 0; +} +#endif + +static unsigned int +convert_fcntl_cmd(unsigned int cmd) +{ + switch (cmd) { + case F_GETLK64: + return F_GETLK; + case F_SETLK64: + return F_SETLK; + case F_SETLKW64: + return F_SETLKW; + } + + return cmd; +} + +COMPAT_SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd, + compat_ulong_t, arg) +{ + mm_segment_t old_fs; + struct flock f; + long ret; + unsigned int conv_cmd; + + switch (cmd) { + case F_GETLK: + case F_SETLK: + case F_SETLKW: + ret = get_compat_flock(&f, compat_ptr(arg)); + if (ret != 0) + break; + old_fs = get_fs(); + set_fs(KERNEL_DS); + ret = sys_fcntl(fd, cmd, (unsigned long)&f); + set_fs(old_fs); + if (cmd == F_GETLK && ret == 0) { + /* GETLK was successful and we need to return the data... + * but it needs to fit in the compat structure. + * l_start shouldn't be too big, unless the original + * start + end is greater than COMPAT_OFF_T_MAX, in which + * case the app was asking for trouble, so we return + * -EOVERFLOW in that case. + * l_len could be too big, in which case we just truncate it, + * and only allow the app to see that part of the conflicting + * lock that might make sense to it anyway + */ + + if (f.l_start > COMPAT_OFF_T_MAX) + ret = -EOVERFLOW; + if (f.l_len > COMPAT_OFF_T_MAX) + f.l_len = COMPAT_OFF_T_MAX; + if (ret == 0) + ret = put_compat_flock(&f, compat_ptr(arg)); + } + break; + + case F_GETLK64: + case F_SETLK64: + case F_SETLKW64: + case F_OFD_GETLK: + case F_OFD_SETLK: + case F_OFD_SETLKW: + ret = get_compat_flock64(&f, compat_ptr(arg)); + if (ret != 0) + break; + old_fs = get_fs(); + set_fs(KERNEL_DS); + conv_cmd = convert_fcntl_cmd(cmd); + ret = sys_fcntl(fd, conv_cmd, (unsigned long)&f); + set_fs(old_fs); + if ((conv_cmd == F_GETLK || conv_cmd == F_OFD_GETLK) && ret == 0) { + /* need to return lock information - see above for commentary */ + if (f.l_start > COMPAT_LOFF_T_MAX) + ret = -EOVERFLOW; + if (f.l_len > COMPAT_LOFF_T_MAX) + f.l_len = COMPAT_LOFF_T_MAX; + if (ret == 0) + ret = put_compat_flock64(&f, compat_ptr(arg)); + } + break; + + default: + ret = sys_fcntl(fd, cmd, arg); + break; + } + return ret; +} + +COMPAT_SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, + compat_ulong_t, arg) +{ + switch (cmd) { + case F_GETLK64: + case F_SETLK64: + case F_SETLKW64: + case F_OFD_GETLK: + case F_OFD_SETLK: + case F_OFD_SETLKW: + return -EINVAL; + } + return compat_sys_fcntl64(fd, cmd, arg); +} +#endif + /* Table to convert sigio signal codes into poll band bitmaps */ static const long band_table[NSIGPOLL] = { diff --git a/fs/fhandle.c b/fs/fhandle.c index 5559168d5637..58a61f55e0d0 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -9,6 +9,7 @@ #include <linux/fsnotify.h> #include <linux/personality.h> #include <linux/uaccess.h> +#include <linux/compat.h> #include "internal.h" #include "mount.h" @@ -264,3 +265,15 @@ SYSCALL_DEFINE3(open_by_handle_at, int, mountdirfd, ret = do_handle_open(mountdirfd, handle, flags); return ret; } + +#ifdef CONFIG_COMPAT +/* + * Exactly like fs/open.c:sys_open_by_handle_at(), except that it + * doesn't set the O_LARGEFILE flag. + */ +COMPAT_SYSCALL_DEFINE3(open_by_handle_at, int, mountdirfd, + struct file_handle __user *, handle, int, flags) +{ + return do_handle_open(mountdirfd, handle, flags); +} +#endif diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index b681b43c766e..c2d7f3a92679 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -382,9 +382,9 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req) wake_up(&fc->blocked_waitq); if (fc->num_background == fc->congestion_threshold && - fc->connected && fc->bdi_initialized) { - clear_bdi_congested(&fc->bdi, BLK_RW_SYNC); - clear_bdi_congested(&fc->bdi, BLK_RW_ASYNC); + fc->connected && fc->sb) { + clear_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC); + clear_bdi_congested(fc->sb->s_bdi, BLK_RW_ASYNC); } fc->num_background--; fc->active_background--; @@ -573,10 +573,9 @@ void fuse_request_send_background_locked(struct fuse_conn *fc, fc->num_background++; if (fc->num_background == fc->max_background) fc->blocked = 1; - if (fc->num_background == fc->congestion_threshold && - fc->bdi_initialized) { - set_bdi_congested(&fc->bdi, BLK_RW_SYNC); - set_bdi_congested(&fc->bdi, BLK_RW_ASYNC); + if (fc->num_background == fc->congestion_threshold && fc->sb) { + set_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC); + set_bdi_congested(fc->sb->s_bdi, BLK_RW_ASYNC); } list_add_tail(&req->list, &fc->bg_queue); flush_bg_queue(fc); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 32ac2c9b09c0..f33341d9501a 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -527,9 +527,6 @@ struct fuse_conn { /** Filesystem supports NFS exporting. Only set in INIT */ unsigned export_support:1; - /** Set if bdi is valid */ - unsigned bdi_initialized:1; - /** write-back cache policy (default is write-through) */ unsigned writeback_cache:1; @@ -631,9 +628,6 @@ struct fuse_conn { /** Negotiated minor version */ unsigned minor; - /** Backing dev info */ - struct backing_dev_info bdi; - /** Entry on the fuse_conn_list */ struct list_head entry; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 6fe6a88ecb4a..73cf05135252 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -386,12 +386,6 @@ static void fuse_send_destroy(struct fuse_conn *fc) } } -static void fuse_bdi_destroy(struct fuse_conn *fc) -{ - if (fc->bdi_initialized) - bdi_destroy(&fc->bdi); -} - static void fuse_put_super(struct super_block *sb) { struct fuse_conn *fc = get_fuse_conn_super(sb); @@ -403,7 +397,6 @@ static void fuse_put_super(struct super_block *sb) list_del(&fc->entry); fuse_ctl_remove_conn(fc); mutex_unlock(&fuse_mutex); - fuse_bdi_destroy(fc); fuse_conn_put(fc); } @@ -928,7 +921,8 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) fc->no_flock = 1; } - fc->bdi.ra_pages = min(fc->bdi.ra_pages, ra_pages); + fc->sb->s_bdi->ra_pages = + min(fc->sb->s_bdi->ra_pages, ra_pages); fc->minor = arg->minor; fc->max_write = arg->minor < 5 ? 4096 : arg->max_write; fc->max_write = max_t(unsigned, 4096, fc->max_write); @@ -944,7 +938,7 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) arg->major = FUSE_KERNEL_VERSION; arg->minor = FUSE_KERNEL_MINOR_VERSION; - arg->max_readahead = fc->bdi.ra_pages * PAGE_SIZE; + arg->max_readahead = fc->sb->s_bdi->ra_pages * PAGE_SIZE; arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC | FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK | FUSE_SPLICE_WRITE | FUSE_SPLICE_MOVE | FUSE_SPLICE_READ | @@ -976,27 +970,18 @@ static void fuse_free_conn(struct fuse_conn *fc) static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb) { int err; + char *suffix = ""; - fc->bdi.name = "fuse"; - fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_SIZE; - /* fuse does it's own writeback accounting */ - fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB | BDI_CAP_STRICTLIMIT; - - err = bdi_init(&fc->bdi); + if (sb->s_bdev) + suffix = "-fuseblk"; + err = super_setup_bdi_name(sb, "%u:%u%s", MAJOR(fc->dev), + MINOR(fc->dev), suffix); if (err) return err; - fc->bdi_initialized = 1; - - if (sb->s_bdev) { - err = bdi_register(&fc->bdi, NULL, "%u:%u-fuseblk", - MAJOR(fc->dev), MINOR(fc->dev)); - } else { - err = bdi_register_dev(&fc->bdi, fc->dev); - } - - if (err) - return err; + sb->s_bdi->ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_SIZE; + /* fuse does it's own writeback accounting */ + sb->s_bdi->capabilities = BDI_CAP_NO_ACCT_WB | BDI_CAP_STRICTLIMIT; /* * For a single fuse filesystem use max 1% of dirty + @@ -1010,7 +995,7 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb) * * /sys/class/bdi/<bdi>/max_ratio */ - bdi_set_max_ratio(&fc->bdi, 1); + bdi_set_max_ratio(sb->s_bdi, 1); return 0; } @@ -1113,8 +1098,6 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) if (err) goto err_dev_free; - sb->s_bdi = &fc->bdi; - /* Handle umasking inside the fuse code */ if (sb->s_flags & MS_POSIXACL) fc->dont_mask = 1; @@ -1182,7 +1165,6 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) err_dev_free: fuse_dev_free(fud); err_put_conn: - fuse_bdi_destroy(fc); fuse_conn_put(fc); err_fput: fput(file); diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 01b97c012c6e..3814a60e0aea 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -38,11 +38,6 @@ struct metapath { __u16 mp_list[GFS2_MAX_META_HEIGHT]; }; -struct strip_mine { - int sm_first; - unsigned int sm_height; -}; - /** * gfs2_unstuffer_page - unstuff a stuffed inode into a block cached by a page * @ip: the inode @@ -253,6 +248,19 @@ static inline unsigned int metapath_branch_start(const struct metapath *mp) } /** + * metaptr1 - Return the first possible metadata pointer in a metaath buffer + * @height: The metadata height (0 = dinode) + * @mp: The metapath + */ +static inline __be64 *metaptr1(unsigned int height, const struct metapath *mp) +{ + struct buffer_head *bh = mp->mp_bh[height]; + if (height == 0) + return ((__be64 *)(bh->b_data + sizeof(struct gfs2_dinode))); + return ((__be64 *)(bh->b_data + sizeof(struct gfs2_meta_header))); +} + +/** * metapointer - Return pointer to start of metadata in a buffer * @height: The metadata height (0 = dinode) * @mp: The metapath @@ -264,10 +272,8 @@ static inline unsigned int metapath_branch_start(const struct metapath *mp) static inline __be64 *metapointer(unsigned int height, const struct metapath *mp) { - struct buffer_head *bh = mp->mp_bh[height]; - unsigned int head_size = (height > 0) ? - sizeof(struct gfs2_meta_header) : sizeof(struct gfs2_dinode); - return ((__be64 *)(bh->b_data + head_size)) + mp->mp_list[height]; + __be64 *p = metaptr1(height, mp); + return p + mp->mp_list[height]; } static void gfs2_metapath_ra(struct gfs2_glock *gl, @@ -296,6 +302,23 @@ static void gfs2_metapath_ra(struct gfs2_glock *gl, } /** + * lookup_mp_height - helper function for lookup_metapath + * @ip: the inode + * @mp: the metapath + * @h: the height which needs looking up + */ +static int lookup_mp_height(struct gfs2_inode *ip, struct metapath *mp, int h) +{ + __be64 *ptr = metapointer(h, mp); + u64 dblock = be64_to_cpu(*ptr); + + if (!dblock) + return h + 1; + + return gfs2_meta_indirect_buffer(ip, h + 1, dblock, &mp->mp_bh[h + 1]); +} + +/** * lookup_metapath - Walk the metadata tree to a specific point * @ip: The inode * @mp: The metapath @@ -316,17 +339,10 @@ static int lookup_metapath(struct gfs2_inode *ip, struct metapath *mp) { unsigned int end_of_metadata = ip->i_height - 1; unsigned int x; - __be64 *ptr; - u64 dblock; int ret; for (x = 0; x < end_of_metadata; x++) { - ptr = metapointer(x, mp); - dblock = be64_to_cpu(*ptr); - if (!dblock) - return x + 1; - - ret = gfs2_meta_indirect_buffer(ip, x+1, dblock, &mp->mp_bh[x+1]); + ret = lookup_mp_height(ip, mp, x); if (ret) return ret; } @@ -334,6 +350,35 @@ static int lookup_metapath(struct gfs2_inode *ip, struct metapath *mp) return ip->i_height; } +/** + * fillup_metapath - fill up buffers for the metadata path to a specific height + * @ip: The inode + * @mp: The metapath + * @h: The height to which it should be mapped + * + * Similar to lookup_metapath, but does lookups for a range of heights + * + * Returns: error or height of metadata tree + */ + +static int fillup_metapath(struct gfs2_inode *ip, struct metapath *mp, int h) +{ + unsigned int start_h = h - 1; + int ret; + + if (h) { + /* find the first buffer we need to look up. */ + while (start_h > 0 && mp->mp_bh[start_h] == NULL) + start_h--; + for (; start_h < h; start_h++) { + ret = lookup_mp_height(ip, mp, start_h); + if (ret) + return ret; + } + } + return ip->i_height; +} + static inline void release_metapath(struct metapath *mp) { int i; @@ -422,6 +467,13 @@ enum alloc_state { /* ALLOC_UNSTUFF = 3, TBD and rather complicated */ }; +static inline unsigned int hptrs(struct gfs2_sbd *sdp, const unsigned int hgt) +{ + if (hgt) + return sdp->sd_inptrs; + return sdp->sd_diptrs; +} + /** * gfs2_bmap_alloc - Build a metadata tree of the requested height * @inode: The GFS2 inode @@ -620,7 +672,7 @@ int gfs2_block_map(struct inode *inode, sector_t lblock, BUG_ON(maxlen == 0); - memset(mp.mp_bh, 0, sizeof(mp.mp_bh)); + memset(&mp, 0, sizeof(mp)); bmap_lock(ip, create); clear_buffer_mapped(bh_map); clear_buffer_new(bh_map); @@ -702,252 +754,6 @@ int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsi } /** - * do_strip - Look for a layer a particular layer of the file and strip it off - * @ip: the inode - * @dibh: the dinode buffer - * @bh: A buffer of pointers - * @top: The first pointer in the buffer - * @bottom: One more than the last pointer - * @height: the height this buffer is at - * @sm: a pointer to a struct strip_mine - * - * Returns: errno - */ - -static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, - struct buffer_head *bh, __be64 *top, __be64 *bottom, - unsigned int height, struct strip_mine *sm) -{ - struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - struct gfs2_rgrp_list rlist; - struct gfs2_trans *tr; - u64 bn, bstart; - u32 blen, btotal; - __be64 *p; - unsigned int rg_blocks = 0; - int metadata; - unsigned int revokes = 0; - int x; - int error; - int jblocks_rqsted; - - error = gfs2_rindex_update(sdp); - if (error) - return error; - - if (!*top) - sm->sm_first = 0; - - if (height != sm->sm_height) - return 0; - - if (sm->sm_first) { - top++; - sm->sm_first = 0; - } - - metadata = (height != ip->i_height - 1); - if (metadata) - revokes = (height) ? sdp->sd_inptrs : sdp->sd_diptrs; - else if (ip->i_depth) - revokes = sdp->sd_inptrs; - - memset(&rlist, 0, sizeof(struct gfs2_rgrp_list)); - bstart = 0; - blen = 0; - - for (p = top; p < bottom; p++) { - if (!*p) - continue; - - bn = be64_to_cpu(*p); - - if (bstart + blen == bn) - blen++; - else { - if (bstart) - gfs2_rlist_add(ip, &rlist, bstart); - - bstart = bn; - blen = 1; - } - } - - if (bstart) - gfs2_rlist_add(ip, &rlist, bstart); - else - goto out; /* Nothing to do */ - - gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE); - - for (x = 0; x < rlist.rl_rgrps; x++) { - struct gfs2_rgrpd *rgd; - rgd = rlist.rl_ghs[x].gh_gl->gl_object; - rg_blocks += rgd->rd_length; - } - - error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs); - if (error) - goto out_rlist; - - if (gfs2_rs_active(&ip->i_res)) /* needs to be done with the rgrp glock held */ - gfs2_rs_deltree(&ip->i_res); - -restart: - jblocks_rqsted = rg_blocks + RES_DINODE + - RES_INDIRECT + RES_STATFS + RES_QUOTA + - gfs2_struct2blk(sdp, revokes, sizeof(u64)); - if (jblocks_rqsted > atomic_read(&sdp->sd_log_thresh2)) - jblocks_rqsted = atomic_read(&sdp->sd_log_thresh2); - error = gfs2_trans_begin(sdp, jblocks_rqsted, revokes); - if (error) - goto out_rg_gunlock; - - tr = current->journal_info; - down_write(&ip->i_rw_mutex); - - gfs2_trans_add_meta(ip->i_gl, dibh); - gfs2_trans_add_meta(ip->i_gl, bh); - - bstart = 0; - blen = 0; - btotal = 0; - - for (p = top; p < bottom; p++) { - if (!*p) - continue; - - /* check for max reasonable journal transaction blocks */ - if (tr->tr_num_buf_new + RES_STATFS + - RES_QUOTA >= atomic_read(&sdp->sd_log_thresh2)) { - if (rg_blocks >= tr->tr_num_buf_new) - rg_blocks -= tr->tr_num_buf_new; - else - rg_blocks = 0; - break; - } - - bn = be64_to_cpu(*p); - - if (bstart + blen == bn) - blen++; - else { - if (bstart) { - __gfs2_free_blocks(ip, bstart, blen, metadata); - btotal += blen; - } - - bstart = bn; - blen = 1; - } - - *p = 0; - gfs2_add_inode_blocks(&ip->i_inode, -1); - } - if (p == bottom) - rg_blocks = 0; - - if (bstart) { - __gfs2_free_blocks(ip, bstart, blen, metadata); - btotal += blen; - } - - gfs2_statfs_change(sdp, 0, +btotal, 0); - gfs2_quota_change(ip, -(s64)btotal, ip->i_inode.i_uid, - ip->i_inode.i_gid); - - ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode); - - gfs2_dinode_out(ip, dibh->b_data); - - up_write(&ip->i_rw_mutex); - - gfs2_trans_end(sdp); - - if (rg_blocks) - goto restart; - -out_rg_gunlock: - gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs); -out_rlist: - gfs2_rlist_free(&rlist); -out: - return error; -} - -/** - * recursive_scan - recursively scan through the end of a file - * @ip: the inode - * @dibh: the dinode buffer - * @mp: the path through the metadata to the point to start - * @height: the height the recursion is at - * @block: the indirect block to look at - * @first: 1 if this is the first block - * @sm: data opaque to this function to pass to @bc - * - * When this is first called @height and @block should be zero and - * @first should be 1. - * - * Returns: errno - */ - -static int recursive_scan(struct gfs2_inode *ip, struct buffer_head *dibh, - struct metapath *mp, unsigned int height, - u64 block, int first, struct strip_mine *sm) -{ - struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - struct buffer_head *bh = NULL; - __be64 *top, *bottom; - u64 bn; - int error; - int mh_size = sizeof(struct gfs2_meta_header); - - if (!height) { - error = gfs2_meta_inode_buffer(ip, &bh); - if (error) - return error; - dibh = bh; - - top = (__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + mp->mp_list[0]; - bottom = (__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + sdp->sd_diptrs; - } else { - error = gfs2_meta_indirect_buffer(ip, height, block, &bh); - if (error) - return error; - - top = (__be64 *)(bh->b_data + mh_size) + - (first ? mp->mp_list[height] : 0); - - bottom = (__be64 *)(bh->b_data + mh_size) + sdp->sd_inptrs; - } - - error = do_strip(ip, dibh, bh, top, bottom, height, sm); - if (error) - goto out; - - if (height < ip->i_height - 1) { - - gfs2_metapath_ra(ip->i_gl, bh, top); - - for (; top < bottom; top++, first = 0) { - if (!*top) - continue; - - bn = be64_to_cpu(*top); - - error = recursive_scan(ip, dibh, mp, height + 1, bn, - first, sm); - if (error) - break; - } - } -out: - brelse(bh); - return error; -} - - -/** * gfs2_block_truncate_page - Deal with zeroing out data for truncate * * This is partly borrowed from ext3. @@ -1106,41 +912,406 @@ out: return error; } -static int trunc_dealloc(struct gfs2_inode *ip, u64 size) +/** + * sweep_bh_for_rgrps - find an rgrp in a meta buffer and free blocks therein + * @ip: inode + * @rg_gh: holder of resource group glock + * @mp: current metapath fully populated with buffers + * @btotal: place to keep count of total blocks freed + * @hgt: height we're processing + * @first: true if this is the first call to this function for this height + * + * We sweep a metadata buffer (provided by the metapath) for blocks we need to + * free, and free them all. However, we do it one rgrp at a time. If this + * block has references to multiple rgrps, we break it into individual + * transactions. This allows other processes to use the rgrps while we're + * focused on a single one, for better concurrency / performance. + * At every transaction boundary, we rewrite the inode into the journal. + * That way the bitmaps are kept consistent with the inode and we can recover + * if we're interrupted by power-outages. + * + * Returns: 0, or return code if an error occurred. + * *btotal has the total number of blocks freed + */ +static int sweep_bh_for_rgrps(struct gfs2_inode *ip, struct gfs2_holder *rd_gh, + const struct metapath *mp, u32 *btotal, int hgt, + bool preserve1) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - unsigned int height = ip->i_height; - u64 lblock; - struct metapath mp; - int error; + struct gfs2_rgrpd *rgd; + struct gfs2_trans *tr; + struct buffer_head *bh = mp->mp_bh[hgt]; + __be64 *top, *bottom, *p; + int blks_outside_rgrp; + u64 bn, bstart, isize_blks; + s64 blen; /* needs to be s64 or gfs2_add_inode_blocks breaks */ + int meta = ((hgt != ip->i_height - 1) ? 1 : 0); + int ret = 0; + bool buf_in_tr = false; /* buffer was added to transaction */ + + if (gfs2_metatype_check(sdp, bh, + (hgt ? GFS2_METATYPE_IN : GFS2_METATYPE_DI))) + return -EIO; + +more_rgrps: + blks_outside_rgrp = 0; + bstart = 0; + blen = 0; + top = metapointer(hgt, mp); /* first ptr from metapath */ + /* If we're keeping some data at the truncation point, we've got to + preserve the metadata tree by adding 1 to the starting metapath. */ + if (preserve1) + top++; + + bottom = (__be64 *)(bh->b_data + bh->b_size); + + for (p = top; p < bottom; p++) { + if (!*p) + continue; + bn = be64_to_cpu(*p); + if (gfs2_holder_initialized(rd_gh)) { + rgd = (struct gfs2_rgrpd *)rd_gh->gh_gl->gl_object; + gfs2_assert_withdraw(sdp, + gfs2_glock_is_locked_by_me(rd_gh->gh_gl)); + } else { + rgd = gfs2_blk2rgrpd(sdp, bn, false); + ret = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, + 0, rd_gh); + if (ret) + goto out; + + /* Must be done with the rgrp glock held: */ + if (gfs2_rs_active(&ip->i_res) && + rgd == ip->i_res.rs_rbm.rgd) + gfs2_rs_deltree(&ip->i_res); + } + + if (!rgrp_contains_block(rgd, bn)) { + blks_outside_rgrp++; + continue; + } + + /* The size of our transactions will be unknown until we + actually process all the metadata blocks that relate to + the rgrp. So we estimate. We know it can't be more than + the dinode's i_blocks and we don't want to exceed the + journal flush threshold, sd_log_thresh2. */ + if (current->journal_info == NULL) { + unsigned int jblocks_rqsted, revokes; + + jblocks_rqsted = rgd->rd_length + RES_DINODE + + RES_INDIRECT; + isize_blks = gfs2_get_inode_blocks(&ip->i_inode); + if (isize_blks > atomic_read(&sdp->sd_log_thresh2)) + jblocks_rqsted += + atomic_read(&sdp->sd_log_thresh2); + else + jblocks_rqsted += isize_blks; + revokes = jblocks_rqsted; + if (meta) + revokes += hptrs(sdp, hgt); + else if (ip->i_depth) + revokes += sdp->sd_inptrs; + ret = gfs2_trans_begin(sdp, jblocks_rqsted, revokes); + if (ret) + goto out_unlock; + down_write(&ip->i_rw_mutex); + } + /* check if we will exceed the transaction blocks requested */ + tr = current->journal_info; + if (tr->tr_num_buf_new + RES_STATFS + + RES_QUOTA >= atomic_read(&sdp->sd_log_thresh2)) { + /* We set blks_outside_rgrp to ensure the loop will + be repeated for the same rgrp, but with a new + transaction. */ + blks_outside_rgrp++; + /* This next part is tricky. If the buffer was added + to the transaction, we've already set some block + pointers to 0, so we better follow through and free + them, or we will introduce corruption (so break). + This may be impossible, or at least rare, but I + decided to cover the case regardless. + + If the buffer was not added to the transaction + (this call), doing so would exceed our transaction + size, so we need to end the transaction and start a + new one (so goto). */ + + if (buf_in_tr) + break; + goto out_unlock; + } + + gfs2_trans_add_meta(ip->i_gl, bh); + buf_in_tr = true; + *p = 0; + if (bstart + blen == bn) { + blen++; + continue; + } + if (bstart) { + __gfs2_free_blocks(ip, bstart, (u32)blen, meta); + (*btotal) += blen; + gfs2_add_inode_blocks(&ip->i_inode, -blen); + } + bstart = bn; + blen = 1; + } + if (bstart) { + __gfs2_free_blocks(ip, bstart, (u32)blen, meta); + (*btotal) += blen; + gfs2_add_inode_blocks(&ip->i_inode, -blen); + } +out_unlock: + if (!ret && blks_outside_rgrp) { /* If buffer still has non-zero blocks + outside the rgrp we just processed, + do it all over again. */ + if (current->journal_info) { + struct buffer_head *dibh = mp->mp_bh[0]; + + /* Every transaction boundary, we rewrite the dinode + to keep its di_blocks current in case of failure. */ + ip->i_inode.i_mtime = ip->i_inode.i_ctime = + CURRENT_TIME; + gfs2_trans_add_meta(ip->i_gl, dibh); + gfs2_dinode_out(ip, dibh->b_data); + up_write(&ip->i_rw_mutex); + gfs2_trans_end(sdp); + } + gfs2_glock_dq_uninit(rd_gh); + cond_resched(); + goto more_rgrps; + } +out: + return ret; +} + +/** + * find_nonnull_ptr - find a non-null pointer given a metapath and height + * assumes the metapath is valid (with buffers) out to height h + * @mp: starting metapath + * @h: desired height to search + * + * Returns: true if a non-null pointer was found in the metapath buffer + * false if all remaining pointers are NULL in the buffer + */ +static bool find_nonnull_ptr(struct gfs2_sbd *sdp, struct metapath *mp, + unsigned int h) +{ + __be64 *ptr; + unsigned int ptrs = hptrs(sdp, h) - 1; + + while (true) { + ptr = metapointer(h, mp); + if (*ptr) /* if we have a non-null pointer */ + return true; + + if (mp->mp_list[h] < ptrs) + mp->mp_list[h]++; + else + return false; /* no more pointers in this buffer */ + } +} + +enum dealloc_states { + DEALLOC_MP_FULL = 0, /* Strip a metapath with all buffers read in */ + DEALLOC_MP_LOWER = 1, /* lower the metapath strip height */ + DEALLOC_FILL_MP = 2, /* Fill in the metapath to the given height. */ + DEALLOC_DONE = 3, /* process complete */ +}; - if (!size) +/** + * trunc_dealloc - truncate a file down to a desired size + * @ip: inode to truncate + * @newsize: The desired size of the file + * + * This function truncates a file to newsize. It works from the + * bottom up, and from the right to the left. In other words, it strips off + * the highest layer (data) before stripping any of the metadata. Doing it + * this way is best in case the operation is interrupted by power failure, etc. + * The dinode is rewritten in every transaction to guarantee integrity. + */ +static int trunc_dealloc(struct gfs2_inode *ip, u64 newsize) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct metapath mp; + struct buffer_head *dibh, *bh; + struct gfs2_holder rd_gh; + u64 lblock; + __u16 nbof[GFS2_MAX_META_HEIGHT]; /* new beginning of truncation */ + unsigned int strip_h = ip->i_height - 1; + u32 btotal = 0; + int ret, state; + int mp_h; /* metapath buffers are read in to this height */ + sector_t last_ra = 0; + u64 prev_bnr = 0; + bool preserve1; /* need to preserve the first meta pointer? */ + + if (!newsize) lblock = 0; else - lblock = (size - 1) >> sdp->sd_sb.sb_bsize_shift; + lblock = (newsize - 1) >> sdp->sd_sb.sb_bsize_shift; + memset(&mp, 0, sizeof(mp)); find_metapath(sdp, lblock, &mp, ip->i_height); - error = gfs2_rindex_update(sdp); - if (error) - return error; - error = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE); - if (error) - return error; + memcpy(&nbof, &mp.mp_list, sizeof(nbof)); + + ret = gfs2_meta_inode_buffer(ip, &dibh); + if (ret) + return ret; - while (height--) { - struct strip_mine sm; - sm.sm_first = !!size; - sm.sm_height = height; + mp.mp_bh[0] = dibh; + ret = lookup_metapath(ip, &mp); + if (ret == ip->i_height) + state = DEALLOC_MP_FULL; /* We have a complete metapath */ + else + state = DEALLOC_FILL_MP; /* deal with partial metapath */ - error = recursive_scan(ip, NULL, &mp, 0, 0, 1, &sm); - if (error) + ret = gfs2_rindex_update(sdp); + if (ret) + goto out_metapath; + + ret = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE); + if (ret) + goto out_metapath; + gfs2_holder_mark_uninitialized(&rd_gh); + + mp_h = strip_h; + + while (state != DEALLOC_DONE) { + switch (state) { + /* Truncate a full metapath at the given strip height. + * Note that strip_h == mp_h in order to be in this state. */ + case DEALLOC_MP_FULL: + if (mp_h > 0) { /* issue read-ahead on metadata */ + __be64 *top; + + bh = mp.mp_bh[mp_h - 1]; + if (bh->b_blocknr != last_ra) { + last_ra = bh->b_blocknr; + top = metaptr1(mp_h - 1, &mp); + gfs2_metapath_ra(ip->i_gl, bh, top); + } + } + /* If we're truncating to a non-zero size and the mp is + at the beginning of file for the strip height, we + need to preserve the first metadata pointer. */ + preserve1 = (newsize && + (mp.mp_list[mp_h] == nbof[mp_h])); + bh = mp.mp_bh[mp_h]; + gfs2_assert_withdraw(sdp, bh); + if (gfs2_assert_withdraw(sdp, + prev_bnr != bh->b_blocknr)) { + printk(KERN_EMERG "GFS2: fsid=%s:inode %llu, " + "block:%llu, i_h:%u, s_h:%u, mp_h:%u\n", + sdp->sd_fsname, + (unsigned long long)ip->i_no_addr, + prev_bnr, ip->i_height, strip_h, mp_h); + } + prev_bnr = bh->b_blocknr; + ret = sweep_bh_for_rgrps(ip, &rd_gh, &mp, &btotal, + mp_h, preserve1); + /* If we hit an error or just swept dinode buffer, + just exit. */ + if (ret || !mp_h) { + state = DEALLOC_DONE; + break; + } + state = DEALLOC_MP_LOWER; + break; + + /* lower the metapath strip height */ + case DEALLOC_MP_LOWER: + /* We're done with the current buffer, so release it, + unless it's the dinode buffer. Then back up to the + previous pointer. */ + if (mp_h) { + brelse(mp.mp_bh[mp_h]); + mp.mp_bh[mp_h] = NULL; + } + /* If we can't get any lower in height, we've stripped + off all we can. Next step is to back up and start + stripping the previous level of metadata. */ + if (mp_h == 0) { + strip_h--; + memcpy(&mp.mp_list, &nbof, sizeof(nbof)); + mp_h = strip_h; + state = DEALLOC_FILL_MP; + break; + } + mp.mp_list[mp_h] = 0; + mp_h--; /* search one metadata height down */ + if (mp.mp_list[mp_h] >= hptrs(sdp, mp_h) - 1) + break; /* loop around in the same state */ + mp.mp_list[mp_h]++; + /* Here we've found a part of the metapath that is not + * allocated. We need to search at that height for the + * next non-null pointer. */ + if (find_nonnull_ptr(sdp, &mp, mp_h)) { + state = DEALLOC_FILL_MP; + mp_h++; + } + /* No more non-null pointers at this height. Back up + to the previous height and try again. */ + break; /* loop around in the same state */ + + /* Fill the metapath with buffers to the given height. */ + case DEALLOC_FILL_MP: + /* Fill the buffers out to the current height. */ + ret = fillup_metapath(ip, &mp, mp_h); + if (ret < 0) + goto out; + + /* If buffers found for the entire strip height */ + if ((ret == ip->i_height) && (mp_h == strip_h)) { + state = DEALLOC_MP_FULL; + break; + } + if (ret < ip->i_height) /* We have a partial height */ + mp_h = ret - 1; + + /* If we find a non-null block pointer, crawl a bit + higher up in the metapath and try again, otherwise + we need to look lower for a new starting point. */ + if (find_nonnull_ptr(sdp, &mp, mp_h)) + mp_h++; + else + state = DEALLOC_MP_LOWER; break; + } } - gfs2_quota_unhold(ip); + if (btotal) { + if (current->journal_info == NULL) { + ret = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + + RES_QUOTA, 0); + if (ret) + goto out; + down_write(&ip->i_rw_mutex); + } + gfs2_statfs_change(sdp, 0, +btotal, 0); + gfs2_quota_change(ip, -(s64)btotal, ip->i_inode.i_uid, + ip->i_inode.i_gid); + ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; + gfs2_trans_add_meta(ip->i_gl, dibh); + gfs2_dinode_out(ip, dibh->b_data); + up_write(&ip->i_rw_mutex); + gfs2_trans_end(sdp); + } - return error; +out: + if (gfs2_holder_initialized(&rd_gh)) + gfs2_glock_dq_uninit(&rd_gh); + if (current->journal_info) { + up_write(&ip->i_rw_mutex); + gfs2_trans_end(sdp); + cond_resched(); + } + gfs2_quota_unhold(ip); +out_metapath: + release_metapath(&mp); + return ret; } static int trunc_end(struct gfs2_inode *ip) diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 6fe2a59c6a9a..c2062a108d19 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -911,11 +911,15 @@ out_qunlock: static long gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); + struct gfs2_sbd *sdp = GFS2_SB(inode); struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_holder gh; int ret; - if ((mode & ~FALLOC_FL_KEEP_SIZE) || gfs2_is_jdata(ip)) + if (mode & ~FALLOC_FL_KEEP_SIZE) + return -EOPNOTSUPP; + /* fallocate is needed by gfs2_grow to reserve space in the rindex */ + if (gfs2_is_jdata(ip) && inode != sdp->sd_rindex) return -EOPNOTSUPP; inode_lock(inode); diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index ec0848fcca02..959a19ced4d5 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -73,7 +73,7 @@ static DEFINE_SPINLOCK(lru_lock); static struct rhashtable_params ht_parms = { .nelem_hint = GFS2_GL_HASH_SIZE * 3 / 4, - .key_len = sizeof(struct lm_lockname), + .key_len = offsetofend(struct lm_lockname, ln_type), .key_offset = offsetof(struct gfs2_glock, gl_name), .head_offset = offsetof(struct gfs2_glock, gl_node), }; @@ -449,6 +449,9 @@ __acquires(&gl->gl_lockref.lock) unsigned int lck_flags = (unsigned int)(gh ? gh->gh_flags : 0); int ret; + if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) && + target != LM_ST_UNLOCKED) + return; lck_flags &= (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP | LM_FLAG_PRIORITY); GLOCK_BUG_ON(gl, gl->gl_state == target); @@ -484,7 +487,8 @@ __acquires(&gl->gl_lockref.lock) } else if (ret) { pr_err("lm_lock ret %d\n", ret); - GLOCK_BUG_ON(gl, 1); + GLOCK_BUG_ON(gl, !test_bit(SDF_SHUTDOWN, + &sdp->sd_flags)); } } else { /* lock_nolock */ finish_xmote(gl, target); @@ -653,10 +657,10 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, struct lm_lockname name = { .ln_number = number, .ln_type = glops->go_type, .ln_sbd = sdp }; - struct gfs2_glock *gl, *tmp = NULL; + struct gfs2_glock *gl, *tmp; struct address_space *mapping; struct kmem_cache *cachep; - int ret, tries = 0; + int ret = 0; rcu_read_lock(); gl = rhashtable_lookup_fast(&gl_hash_table, &name, ht_parms); @@ -721,35 +725,32 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, } again: - ret = rhashtable_lookup_insert_fast(&gl_hash_table, &gl->gl_node, - ht_parms); - if (ret == 0) { + rcu_read_lock(); + tmp = rhashtable_lookup_get_insert_fast(&gl_hash_table, &gl->gl_node, + ht_parms); + if (!tmp) { *glp = gl; - return 0; + goto out; } - - if (ret == -EEXIST) { - ret = 0; - rcu_read_lock(); - tmp = rhashtable_lookup_fast(&gl_hash_table, &name, ht_parms); - if (tmp == NULL || !lockref_get_not_dead(&tmp->gl_lockref)) { - if (++tries < 100) { - rcu_read_unlock(); - cond_resched(); - goto again; - } - tmp = NULL; - ret = -ENOMEM; - } - rcu_read_unlock(); - } else { - WARN_ON_ONCE(ret); + if (IS_ERR(tmp)) { + ret = PTR_ERR(tmp); + goto out_free; } + if (lockref_get_not_dead(&tmp->gl_lockref)) { + *glp = tmp; + goto out_free; + } + rcu_read_unlock(); + cond_resched(); + goto again; + +out_free: kfree(gl->gl_lksb.sb_lvbptr); kmem_cache_free(cachep, gl); atomic_dec(&sdp->sd_glock_disposal); - *glp = tmp; +out: + rcu_read_unlock(); return ret; } @@ -1918,10 +1919,10 @@ static const struct seq_operations gfs2_sbstats_seq_ops = { #define GFS2_SEQ_GOODSIZE min(PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER, 65536UL) -static int gfs2_glocks_open(struct inode *inode, struct file *file) +static int __gfs2_glocks_open(struct inode *inode, struct file *file, + const struct seq_operations *ops) { - int ret = seq_open_private(file, &gfs2_glock_seq_ops, - sizeof(struct gfs2_glock_iter)); + int ret = seq_open_private(file, ops, sizeof(struct gfs2_glock_iter)); if (ret == 0) { struct seq_file *seq = file->private_data; struct gfs2_glock_iter *gi = seq->private; @@ -1932,11 +1933,16 @@ static int gfs2_glocks_open(struct inode *inode, struct file *file) if (seq->buf) seq->size = GFS2_SEQ_GOODSIZE; gi->gl = NULL; - ret = rhashtable_walk_init(&gl_hash_table, &gi->hti, GFP_KERNEL); + rhashtable_walk_enter(&gl_hash_table, &gi->hti); } return ret; } +static int gfs2_glocks_open(struct inode *inode, struct file *file) +{ + return __gfs2_glocks_open(inode, file, &gfs2_glock_seq_ops); +} + static int gfs2_glocks_release(struct inode *inode, struct file *file) { struct seq_file *seq = file->private_data; @@ -1949,20 +1955,7 @@ static int gfs2_glocks_release(struct inode *inode, struct file *file) static int gfs2_glstats_open(struct inode *inode, struct file *file) { - int ret = seq_open_private(file, &gfs2_glstats_seq_ops, - sizeof(struct gfs2_glock_iter)); - if (ret == 0) { - struct seq_file *seq = file->private_data; - struct gfs2_glock_iter *gi = seq->private; - gi->sdp = inode->i_private; - gi->last_pos = 0; - seq->buf = kmalloc(GFS2_SEQ_GOODSIZE, GFP_KERNEL | __GFP_NOWARN); - if (seq->buf) - seq->size = GFS2_SEQ_GOODSIZE; - gi->gl = NULL; - ret = rhashtable_walk_init(&gl_hash_table, &gi->hti, GFP_KERNEL); - } - return ret; + return __gfs2_glocks_open(inode, file, &gfs2_glstats_seq_ops); } static int gfs2_sbstats_open(struct inode *inode, struct file *file) diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 511e1ed7e2de..b7cf65d13561 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -203,11 +203,15 @@ enum { DFL_DLM_RECOVERY = 6, }; +/* + * We are using struct lm_lockname as an rhashtable key. Avoid holes within + * the struct; padding at the end is fine. + */ struct lm_lockname { - struct gfs2_sbd *ln_sbd; u64 ln_number; + struct gfs2_sbd *ln_sbd; unsigned int ln_type; -} __packed __aligned(sizeof(int)); +}; #define lm_name_equal(name1, name2) \ (((name1)->ln_number == (name2)->ln_number) && \ diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index e279c3ce27be..9f605ea4810c 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -202,8 +202,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, fail_refresh: ip->i_iopen_gh.gh_flags |= GL_NOCACHE; ip->i_iopen_gh.gh_gl->gl_object = NULL; - gfs2_glock_dq_wait(&ip->i_iopen_gh); - gfs2_holder_uninit(&ip->i_iopen_gh); + gfs2_glock_dq_uninit(&ip->i_iopen_gh); fail_put: if (io_gl) gfs2_glock_put(io_gl); @@ -667,6 +666,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, ip->i_height = 0; ip->i_depth = 0; ip->i_entries = 0; + ip->i_no_addr = 0; /* Temporarily zero until real addr is assigned */ switch(mode & S_IFMT) { case S_IFREG: diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index b108e7ba81af..ed67548b286c 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -23,6 +23,7 @@ #include <linux/quotaops.h> #include <linux/lockdep.h> #include <linux/module.h> +#include <linux/backing-dev.h> #include "gfs2.h" #include "incore.h" @@ -1222,12 +1223,7 @@ static int set_gfs2_super(struct super_block *s, void *data) { s->s_bdev = data; s->s_dev = s->s_bdev->bd_dev; - - /* - * We set the bdi here to the queue backing, file systems can - * overwrite this in ->fill_super() - */ - s->s_bdi = bdev_get_queue(s->s_bdev)->backing_dev_info; + s->s_bdi = bdi_get(s->s_bdev->bd_bdi); return 0; } diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 86ccc0159393..83c9909ff14a 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -483,13 +483,6 @@ void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd) } } -static inline int rgrp_contains_block(struct gfs2_rgrpd *rgd, u64 block) -{ - u64 first = rgd->rd_data0; - u64 last = first + rgd->rd_data; - return first <= block && block < last; -} - /** * gfs2_blk2rgrpd - Find resource group for a given data/meta block number * @sdp: The GFS2 superblock diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h index 66b51cf66dfa..e90478e2f545 100644 --- a/fs/gfs2/rgrp.h +++ b/fs/gfs2/rgrp.h @@ -83,5 +83,12 @@ static inline bool gfs2_rs_active(const struct gfs2_blkreserv *rs) return rs && !RB_EMPTY_NODE(&rs->rs_node); } +static inline int rgrp_contains_block(struct gfs2_rgrpd *rgd, u64 block) +{ + u64 first = rgd->rd_data0; + u64 last = first + rgd->rd_data; + return first <= block && block < last; +} + extern void check_and_update_goal(struct gfs2_inode *ip); #endif /* __RGRP_DOT_H__ */ diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 361796a84fce..29b0473f6e74 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -793,7 +793,8 @@ static void gfs2_dirty_inode(struct inode *inode, int flags) if (!(flags & (I_DIRTY_DATASYNC|I_DIRTY_SYNC))) return; - + if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + return; if (!gfs2_glock_is_locked_by_me(ip->i_gl)) { ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); if (ret) { @@ -1538,8 +1539,7 @@ static void gfs2_evict_inode(struct inode *inode) error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, &gh); if (unlikely(error)) { ip->i_iopen_gh.gh_flags |= GL_NOCACHE; - gfs2_glock_dq_wait(&ip->i_iopen_gh); - gfs2_holder_uninit(&ip->i_iopen_gh); + gfs2_glock_dq_uninit(&ip->i_iopen_gh); goto out; } @@ -1617,7 +1617,7 @@ out_unlock: if (gfs2_holder_initialized(&ip->i_iopen_gh)) { if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) { ip->i_iopen_gh.gh_flags |= GL_NOCACHE; - gfs2_glock_dq_wait(&ip->i_iopen_gh); + gfs2_glock_dq(&ip->i_iopen_gh); } gfs2_holder_uninit(&ip->i_iopen_gh); } @@ -1639,8 +1639,7 @@ out: if (gfs2_holder_initialized(&ip->i_iopen_gh)) { ip->i_iopen_gh.gh_gl->gl_object = NULL; ip->i_iopen_gh.gh_flags |= GL_NOCACHE; - gfs2_glock_dq_wait(&ip->i_iopen_gh); - gfs2_holder_uninit(&ip->i_iopen_gh); + gfs2_glock_dq_uninit(&ip->i_iopen_gh); } } diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 7163fe014b57..dde861387a40 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -136,17 +136,26 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) vma->vm_flags |= VM_HUGETLB | VM_DONTEXPAND; vma->vm_ops = &hugetlb_vm_ops; + /* + * Offset passed to mmap (before page shift) could have been + * negative when represented as a (l)off_t. + */ + if (((loff_t)vma->vm_pgoff << PAGE_SHIFT) < 0) + return -EINVAL; + if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT)) return -EINVAL; vma_len = (loff_t)(vma->vm_end - vma->vm_start); + len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); + /* check for overflow */ + if (len < vma_len) + return -EINVAL; inode_lock(inode); file_accessed(file); ret = -ENOMEM; - len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); - if (hugetlb_reserve_pages(inode, vma->vm_pgoff >> huge_page_order(h), len >> huge_page_shift(h), vma, @@ -155,7 +164,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) ret = 0; if (vma->vm_flags & VM_WRITE && inode->i_size < len) - inode->i_size = len; + i_size_write(inode, len); out: inode_unlock(inode); diff --git a/fs/inode.c b/fs/inode.c index 88110fd0b282..131b2bcebc48 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -371,9 +371,6 @@ void inode_init_once(struct inode *inode) INIT_LIST_HEAD(&inode->i_lru); address_space_init_once(&inode->i_data); i_size_ordered_init(inode); -#ifdef CONFIG_FSNOTIFY - INIT_HLIST_HEAD(&inode->i_fsnotify_marks); -#endif } EXPORT_SYMBOL(inode_init_once); diff --git a/fs/internal.h b/fs/internal.h index 11c6d89dce9c..076751d90ba2 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -108,8 +108,6 @@ extern struct file *do_filp_open(int dfd, struct filename *pathname, extern struct file *do_file_open_root(struct dentry *, struct vfsmount *, const char *, const struct open_flags *); -extern long do_handle_open(int mountdirfd, - struct file_handle __user *ufh, int open_flag); extern int open_check_o_direct(struct file *f); extern int vfs_open(const struct path *, struct file *, const struct cred *); extern struct file *filp_clone_open(struct file *); diff --git a/fs/iomap.c b/fs/iomap.c index 141c3cd55a8b..1faabe09b8fd 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -360,7 +360,8 @@ static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes, sector_t sector = iomap->blkno + (((pos & ~(PAGE_SIZE - 1)) - iomap->offset) >> 9); - return __dax_zero_page_range(iomap->bdev, sector, offset, bytes); + return __dax_zero_page_range(iomap->bdev, iomap->dax_dev, sector, + offset, bytes); } static loff_t @@ -887,16 +888,14 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, flags |= IOMAP_WRITE; } - if (mapping->nrpages) { - ret = filemap_write_and_wait_range(mapping, start, end); - if (ret) - goto out_free_dio; + ret = filemap_write_and_wait_range(mapping, start, end); + if (ret) + goto out_free_dio; - ret = invalidate_inode_pages2_range(mapping, - start >> PAGE_SHIFT, end >> PAGE_SHIFT); - WARN_ON_ONCE(ret); - ret = 0; - } + ret = invalidate_inode_pages2_range(mapping, + start >> PAGE_SHIFT, end >> PAGE_SHIFT); + WARN_ON_ONCE(ret); + ret = 0; inode_dio_begin(inode); @@ -911,6 +910,9 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, break; } pos += ret; + + if (iov_iter_rw(iter) == READ && pos >= dio->i_size) + break; } while ((count = iov_iter_count(iter)) > 0); blk_finish_plug(&plug); @@ -951,7 +953,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, * one is a pretty crazy thing to do, so we don't support it 100%. If * this invalidation fails, tough, the write still worked... */ - if (iov_iter_rw(iter) == WRITE && mapping->nrpages) { + if (iov_iter_rw(iter) == WRITE) { int err = invalidate_inode_pages2_range(mapping, start >> PAGE_SHIFT, end >> PAGE_SHIFT); WARN_ON_ONCE(err); diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 1ce13479b10d..5a0245e36240 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -43,6 +43,7 @@ #include <linux/backing-dev.h> #include <linux/bitops.h> #include <linux/ratelimit.h> +#include <linux/sched/mm.h> #define CREATE_TRACE_POINTS #include <trace/events/jbd2.h> @@ -206,6 +207,14 @@ static int kjournald2(void *arg) wake_up(&journal->j_wait_done_commit); /* + * Make sure that no allocations from this kernel thread will ever + * recurse to the fs layer because we are responsible for the + * transaction commit and any fs involvement might get stuck waiting for + * the trasn. commit. + */ + memalloc_nofs_save(); + + /* * And now, wait forever for commit wakeup events. */ write_lock(&journal->j_state_lock); diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 5e659ee08d6a..9ee4832b6f8b 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -29,6 +29,7 @@ #include <linux/backing-dev.h> #include <linux/bug.h> #include <linux/module.h> +#include <linux/sched/mm.h> #include <trace/events/jbd2.h> @@ -388,6 +389,11 @@ repeat: rwsem_acquire_read(&journal->j_trans_commit_map, 0, 0, _THIS_IP_); jbd2_journal_free_transaction(new_transaction); + /* + * Ensure that no allocations done while the transaction is open are + * going to recurse back to the fs layer. + */ + handle->saved_alloc_context = memalloc_nofs_save(); return 0; } @@ -466,6 +472,7 @@ handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int rsv_blocks, trace_jbd2_handle_start(journal->j_fs_dev->bd_dev, handle->h_transaction->t_tid, type, line_no, nblocks); + return handle; } EXPORT_SYMBOL(jbd2__journal_start); @@ -1760,6 +1767,11 @@ int jbd2_journal_stop(handle_t *handle) if (handle->h_rsv_handle) jbd2_journal_free_reserved(handle->h_rsv_handle); free_and_exit: + /* + * Scope of the GFP_NOFS context is over here and so we can restore the + * original alloc context. + */ + memalloc_nofs_restore(handle->saved_alloc_context); jbd2_free_handle(handle); return err; } diff --git a/fs/jfs/ioctl.c b/fs/jfs/ioctl.c index fc89f9436784..5c5ac5b3aec3 100644 --- a/fs/jfs/ioctl.c +++ b/fs/jfs/ioctl.c @@ -64,7 +64,6 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) switch (cmd) { case JFS_IOC_GETFLAGS: - jfs_get_inode_flags(jfs_inode); flags = jfs_inode->mode2 & JFS_FL_USER_VISIBLE; flags = jfs_map_ext2(flags, 0); return put_user(flags, (int __user *) arg); @@ -98,7 +97,6 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) /* Lock against other parallel changes of flags */ inode_lock(inode); - jfs_get_inode_flags(jfs_inode); oldflags = jfs_inode->mode2; /* diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c index 6aca224a5d68..f36ef68905a7 100644 --- a/fs/jfs/jfs_imap.c +++ b/fs/jfs/jfs_imap.c @@ -3148,7 +3148,6 @@ static void copy_to_dinode(struct dinode * dip, struct inode *ip) else dip->di_gid = cpu_to_le32(from_kgid(&init_user_ns, jfs_ip->saved_gid)); - jfs_get_inode_flags(jfs_ip); /* * mode2 is only needed for storing the higher order bits. * Trust i_mode for the lower order ones diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c index 375dd257a34f..5e9b7bb3aabf 100644 --- a/fs/jfs/jfs_inode.c +++ b/fs/jfs/jfs_inode.c @@ -45,24 +45,6 @@ void jfs_set_inode_flags(struct inode *inode) S_DIRSYNC | S_SYNC); } -void jfs_get_inode_flags(struct jfs_inode_info *jfs_ip) -{ - unsigned int flags = jfs_ip->vfs_inode.i_flags; - - jfs_ip->mode2 &= ~(JFS_IMMUTABLE_FL | JFS_APPEND_FL | JFS_NOATIME_FL | - JFS_DIRSYNC_FL | JFS_SYNC_FL); - if (flags & S_IMMUTABLE) - jfs_ip->mode2 |= JFS_IMMUTABLE_FL; - if (flags & S_APPEND) - jfs_ip->mode2 |= JFS_APPEND_FL; - if (flags & S_NOATIME) - jfs_ip->mode2 |= JFS_NOATIME_FL; - if (flags & S_DIRSYNC) - jfs_ip->mode2 |= JFS_DIRSYNC_FL; - if (flags & S_SYNC) - jfs_ip->mode2 |= JFS_SYNC_FL; -} - /* * NAME: ialloc() * diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h index 9271cfe4a149..7b0b3a40788f 100644 --- a/fs/jfs/jfs_inode.h +++ b/fs/jfs/jfs_inode.h @@ -33,7 +33,6 @@ extern void jfs_truncate(struct inode *); extern void jfs_truncate_nolock(struct inode *, loff_t); extern void jfs_free_zero_link(struct inode *); extern struct dentry *jfs_get_parent(struct dentry *dentry); -extern void jfs_get_inode_flags(struct jfs_inode_info *); extern struct dentry *jfs_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type); extern struct dentry *jfs_fh_to_parent(struct super_block *sb, struct fid *fid, diff --git a/fs/jfs/super.c b/fs/jfs/super.c index c64c2574a0aa..e8aad7d87b8c 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -45,6 +45,7 @@ #include "jfs_acl.h" #include "jfs_debug.h" #include "jfs_xattr.h" +#include "jfs_dinode.h" MODULE_DESCRIPTION("The Journaled Filesystem (JFS)"); MODULE_AUTHOR("Steve Best/Dave Kleikamp/Barry Arndt, IBM"); @@ -181,6 +182,35 @@ static int jfs_statfs(struct dentry *dentry, struct kstatfs *buf) return 0; } +#ifdef CONFIG_QUOTA +static int jfs_quota_off(struct super_block *sb, int type); +static int jfs_quota_on(struct super_block *sb, int type, int format_id, + const struct path *path); + +static void jfs_quota_off_umount(struct super_block *sb) +{ + int type; + + for (type = 0; type < MAXQUOTAS; type++) + jfs_quota_off(sb, type); +} + +static const struct quotactl_ops jfs_quotactl_ops = { + .quota_on = jfs_quota_on, + .quota_off = jfs_quota_off, + .quota_sync = dquot_quota_sync, + .get_state = dquot_get_state, + .set_info = dquot_set_dqinfo, + .get_dqblk = dquot_get_dqblk, + .set_dqblk = dquot_set_dqblk, + .get_nextdqblk = dquot_get_next_dqblk, +}; +#else +static inline void jfs_quota_off_umount(struct super_block *sb) +{ +} +#endif + static void jfs_put_super(struct super_block *sb) { struct jfs_sb_info *sbi = JFS_SBI(sb); @@ -188,7 +218,7 @@ static void jfs_put_super(struct super_block *sb) jfs_info("In jfs_put_super"); - dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); + jfs_quota_off_umount(sb); rc = jfs_umount(sb); if (rc) @@ -536,7 +566,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent) sb->s_xattr = jfs_xattr_handlers; #ifdef CONFIG_QUOTA sb->dq_op = &dquot_operations; - sb->s_qcop = &dquot_quotactl_ops; + sb->s_qcop = &jfs_quotactl_ops; sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP; #endif @@ -840,6 +870,51 @@ static struct dquot **jfs_get_dquots(struct inode *inode) { return JFS_IP(inode)->i_dquot; } + +static int jfs_quota_on(struct super_block *sb, int type, int format_id, + const struct path *path) +{ + int err; + struct inode *inode; + + err = dquot_quota_on(sb, type, format_id, path); + if (err) + return err; + + inode = d_inode(path->dentry); + inode_lock(inode); + JFS_IP(inode)->mode2 |= JFS_NOATIME_FL | JFS_IMMUTABLE_FL; + inode_set_flags(inode, S_NOATIME | S_IMMUTABLE, + S_NOATIME | S_IMMUTABLE); + inode_unlock(inode); + mark_inode_dirty(inode); + + return 0; +} + +static int jfs_quota_off(struct super_block *sb, int type) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + int err; + + if (!inode || !igrab(inode)) + goto out; + + err = dquot_quota_off(sb, type); + if (err) + goto out_put; + + inode_lock(inode); + JFS_IP(inode)->mode2 &= ~(JFS_NOATIME_FL | JFS_IMMUTABLE_FL); + inode_set_flags(inode, 0, S_NOATIME | S_IMMUTABLE); + inode_unlock(inode); + mark_inode_dirty(inode); +out_put: + iput(inode); + return err; +out: + return dquot_quota_off(sb, type); +} #endif static const struct super_operations jfs_super_operations = { diff --git a/fs/mount.h b/fs/mount.h index 2826543a131d..bf1fda6eed8f 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -59,7 +59,7 @@ struct mount { struct mountpoint *mnt_mp; /* where is it mounted */ struct hlist_node mnt_mp_list; /* list mounts with the same mountpoint */ #ifdef CONFIG_FSNOTIFY - struct hlist_head mnt_fsnotify_marks; + struct fsnotify_mark_connector __rcu *mnt_fsnotify_marks; __u32 mnt_fsnotify_mask; #endif int mnt_id; /* mount identifier */ diff --git a/fs/namei.c b/fs/namei.c index d41fab78798b..9a7f8bd748d8 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -340,22 +340,14 @@ int generic_permission(struct inode *inode, int mask) if (S_ISDIR(inode->i_mode)) { /* DACs are overridable for directories */ - if (capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE)) - return 0; if (!(mask & MAY_WRITE)) if (capable_wrt_inode_uidgid(inode, CAP_DAC_READ_SEARCH)) return 0; - return -EACCES; - } - /* - * Read/write DACs are always overridable. - * Executable DACs are overridable when there is - * at least one exec bit set. - */ - if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO)) if (capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE)) return 0; + return -EACCES; + } /* * Searching includes executable on directories, else just read. @@ -364,6 +356,14 @@ int generic_permission(struct inode *inode, int mask) if (mask == MAY_READ) if (capable_wrt_inode_uidgid(inode, CAP_DAC_READ_SEARCH)) return 0; + /* + * Read/write DACs are always overridable. + * Executable DACs are overridable when there is + * at least one exec bit set. + */ + if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO)) + if (capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE)) + return 0; return -EACCES; } @@ -2145,6 +2145,9 @@ static const char *path_init(struct nameidata *nd, unsigned flags) int retval = 0; const char *s = nd->name->name; + if (!*s) + flags &= ~LOOKUP_RCU; + nd->last_type = LAST_ROOT; /* if there are only slashes... */ nd->flags = flags | LOOKUP_JUMPED | LOOKUP_PARENT; nd->depth = 0; diff --git a/fs/namespace.c b/fs/namespace.c index cc1375eff88c..b3b115bd4e1e 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -236,9 +236,6 @@ static struct mount *alloc_vfsmnt(const char *name) INIT_LIST_HEAD(&mnt->mnt_slave_list); INIT_LIST_HEAD(&mnt->mnt_slave); INIT_HLIST_NODE(&mnt->mnt_mp_list); -#ifdef CONFIG_FSNOTIFY - INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks); -#endif init_fs_pin(&mnt->mnt_umount, drop_mountpoint); } return mnt; diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index d5606099712a..6d0f14c86099 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -554,12 +554,11 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) sb->s_magic = NCP_SUPER_MAGIC; sb->s_op = &ncp_sops; sb->s_d_op = &ncp_dentry_operations; - sb->s_bdi = &server->bdi; server = NCP_SBP(sb); memset(server, 0, sizeof(*server)); - error = bdi_setup_and_register(&server->bdi, "ncpfs"); + error = super_setup_bdi(sb); if (error) goto out_fput; @@ -568,7 +567,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) if (data.info_fd != -1) { struct socket *info_sock = sockfd_lookup(data.info_fd, &error); if (!info_sock) - goto out_bdi; + goto out_fput; server->info_sock = info_sock; error = -EBADFD; if (info_sock->type != SOCK_STREAM) @@ -746,8 +745,6 @@ out_nls: out_fput2: if (server->info_sock) sockfd_put(server->info_sock); -out_bdi: - bdi_destroy(&server->bdi); out_fput: sockfd_put(sock); out: @@ -788,7 +785,6 @@ static void ncp_put_super(struct super_block *sb) kill_pid(server->m.wdog_pid, SIGTERM, 1); put_pid(server->m.wdog_pid); - bdi_destroy(&server->bdi); kfree(server->priv.data); kfree(server->auth.object_name); vfree(server->rxbuf); diff --git a/fs/ncpfs/ncp_fs_sb.h b/fs/ncpfs/ncp_fs_sb.h index 55e26fd80886..366fd63cc506 100644 --- a/fs/ncpfs/ncp_fs_sb.h +++ b/fs/ncpfs/ncp_fs_sb.h @@ -143,7 +143,6 @@ struct ncp_server { size_t len; __u8 data[128]; } unexpected_packet; - struct backing_dev_info bdi; }; extern void ncp_tcp_rcv_proc(struct work_struct *work); diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 390ada8741bc..04d15a0045e3 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -761,9 +761,6 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, server->rsize = NFS_MAX_FILE_IO_SIZE; server->rpages = (server->rsize + PAGE_SIZE - 1) >> PAGE_SHIFT; - server->backing_dev_info.name = "nfs"; - server->backing_dev_info.ra_pages = server->rpages * NFS_MAX_READAHEAD; - if (server->wsize > max_rpc_payload) server->wsize = max_rpc_payload; if (server->wsize > NFS_MAX_FILE_IO_SIZE) @@ -917,12 +914,6 @@ struct nfs_server *nfs_alloc_server(void) return NULL; } - if (bdi_init(&server->backing_dev_info)) { - nfs_free_iostats(server->io_stats); - kfree(server); - return NULL; - } - ida_init(&server->openowner_id); ida_init(&server->lockowner_id); pnfs_init_server(server); @@ -953,7 +944,6 @@ void nfs_free_server(struct nfs_server *server) ida_destroy(&server->lockowner_id); ida_destroy(&server->openowner_id); nfs_free_iostats(server->io_stats); - bdi_destroy(&server->backing_dev_info); kfree(server); nfs_release_automount_timer(); dprintk("<-- nfs_free_server()\n"); diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index aab32fc3d6a8..c1b5fed7c863 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -537,7 +537,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, if (put_dreq(dreq)) nfs_direct_complete(dreq); - return 0; + return requested_bytes; } /** @@ -566,7 +566,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter) struct inode *inode = mapping->host; struct nfs_direct_req *dreq; struct nfs_lock_context *l_ctx; - ssize_t result = -EINVAL; + ssize_t result = -EINVAL, requested; size_t count = iov_iter_count(iter); nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count); @@ -600,14 +600,19 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter) nfs_start_io_direct(inode); NFS_I(inode)->read_io += count; - result = nfs_direct_read_schedule_iovec(dreq, iter, iocb->ki_pos); + requested = nfs_direct_read_schedule_iovec(dreq, iter, iocb->ki_pos); nfs_end_io_direct(inode); - if (!result) { + if (requested > 0) { result = nfs_direct_wait(dreq); - if (result > 0) + if (result > 0) { + requested -= result; iocb->ki_pos += result; + } + iov_iter_revert(iter, requested); + } else { + result = requested; } out_release: @@ -954,7 +959,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, if (put_dreq(dreq)) nfs_direct_write_complete(dreq); - return 0; + return requested_bytes; } /** @@ -979,7 +984,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, */ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) { - ssize_t result = -EINVAL; + ssize_t result = -EINVAL, requested; size_t count; struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; @@ -1022,7 +1027,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) nfs_start_io_direct(inode); - result = nfs_direct_write_schedule_iovec(dreq, iter, pos); + requested = nfs_direct_write_schedule_iovec(dreq, iter, pos); if (mapping->nrpages) { invalidate_inode_pages2_range(mapping, @@ -1031,13 +1036,17 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) nfs_end_io_direct(inode); - if (!result) { + if (requested > 0) { result = nfs_direct_wait(dreq); if (result > 0) { + requested -= result; iocb->ki_pos = pos + result; /* XXX: should check the generic_write_sync retval */ generic_write_sync(iocb, result); } + iov_iter_revert(iter, requested); + } else { + result = requested; } out_release: nfs_direct_req_release(dreq); diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 54e0f9f2dd94..2f3822a4a7d5 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -2315,8 +2315,6 @@ inline void nfs_initialise_sb(struct super_block *sb) sb->s_blocksize = nfs_block_bits(server->wsize, &sb->s_blocksize_bits); - sb->s_bdi = &server->backing_dev_info; - nfs_super_set_maxbytes(sb, server->maxfilesize); } @@ -2522,11 +2520,6 @@ static void nfs_get_cache_cookie(struct super_block *sb, } #endif -static int nfs_bdi_register(struct nfs_server *server) -{ - return bdi_register_dev(&server->backing_dev_info, server->s_dev); -} - int nfs_set_sb_security(struct super_block *s, struct dentry *mntroot, struct nfs_mount_info *mount_info) { @@ -2594,11 +2587,13 @@ struct dentry *nfs_fs_mount_common(struct nfs_server *server, nfs_free_server(server); server = NULL; } else { - error = nfs_bdi_register(server); + error = super_setup_bdi_name(s, "%u:%u", MAJOR(server->s_dev), + MINOR(server->s_dev)); if (error) { mntroot = ERR_PTR(error); goto error_splat_super; } + s->s_bdi->ra_pages = server->rpages * NFS_MAX_READAHEAD; server->super = s; } diff --git a/fs/nfs/write.c b/fs/nfs/write.c index abb2c8a3be42..cc341fc7fd44 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -263,16 +263,15 @@ int nfs_congestion_kb; static void nfs_set_page_writeback(struct page *page) { - struct nfs_server *nfss = NFS_SERVER(page_file_mapping(page)->host); + struct inode *inode = page_file_mapping(page)->host; + struct nfs_server *nfss = NFS_SERVER(inode); int ret = test_set_page_writeback(page); WARN_ON_ONCE(ret != 0); if (atomic_long_inc_return(&nfss->writeback) > - NFS_CONGESTION_ON_THRESH) { - set_bdi_congested(&nfss->backing_dev_info, - BLK_RW_ASYNC); - } + NFS_CONGESTION_ON_THRESH) + set_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC); } static void nfs_end_page_writeback(struct nfs_page *req) @@ -285,7 +284,7 @@ static void nfs_end_page_writeback(struct nfs_page *req) end_page_writeback(req->wb_page); if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH) - clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC); + clear_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC); } @@ -1808,7 +1807,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data) } nfss = NFS_SERVER(data->inode); if (atomic_long_read(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH) - clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC); + clear_bdi_congested(inode_to_bdi(data->inode), BLK_RW_ASYNC); nfs_init_cinfo(&cinfo, data->inode, data->dreq); nfs_commit_end(cinfo.mds); diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c index 92b4b41d19d2..fb5213afc854 100644 --- a/fs/nfsd/blocklayout.c +++ b/fs/nfsd/blocklayout.c @@ -242,10 +242,11 @@ static int nfsd4_scsi_identify_device(struct block_device *bdev, req->cmd[4] = bufflen & 0xff; req->cmd_len = COMMAND_SIZE(INQUIRY); - error = blk_execute_rq(rq->q, NULL, rq, 1); - if (error) { + blk_execute_rq(rq->q, NULL, rq, 1); + if (req->result) { pr_err("pNFS: INQUIRY 0x83 failed with: %x\n", - rq->errors); + req->result); + error = -EIO; goto out_put_request; } diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index dba2ff8eaa68..452334694a5d 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -358,6 +358,8 @@ nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p, { unsigned int len, v, hdr, dlen; u32 max_blocksize = svc_max_payload(rqstp); + struct kvec *head = rqstp->rq_arg.head; + struct kvec *tail = rqstp->rq_arg.tail; p = decode_fh(p, &args->fh); if (!p) @@ -367,6 +369,8 @@ nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p, args->count = ntohl(*p++); args->stable = ntohl(*p++); len = args->len = ntohl(*p++); + if ((void *)p > head->iov_base + head->iov_len) + return 0; /* * The count must equal the amount of data passed. */ @@ -377,9 +381,8 @@ nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p, * Check to make sure that we got the right number of * bytes. */ - hdr = (void*)p - rqstp->rq_arg.head[0].iov_base; - dlen = rqstp->rq_arg.head[0].iov_len + rqstp->rq_arg.page_len - + rqstp->rq_arg.tail[0].iov_len - hdr; + hdr = (void*)p - head->iov_base; + dlen = head->iov_len + rqstp->rq_arg.page_len + tail->iov_len - hdr; /* * Round the length of the data which was specified up to * the next multiple of XDR units and then compare that @@ -396,7 +399,7 @@ nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p, len = args->len = max_blocksize; } rqstp->rq_vec[0].iov_base = (void*)p; - rqstp->rq_vec[0].iov_len = rqstp->rq_arg.head[0].iov_len - hdr; + rqstp->rq_vec[0].iov_len = head->iov_len - hdr; v = 0; while (len > rqstp->rq_vec[v].iov_len) { len -= rqstp->rq_vec[v].iov_len; @@ -471,6 +474,8 @@ nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p, /* first copy and check from the first page */ old = (char*)p; vec = &rqstp->rq_arg.head[0]; + if ((void *)old > vec->iov_base + vec->iov_len) + return 0; avail = vec->iov_len - (old - (char*)vec->iov_base); while (len && avail && *old) { *new++ = *old++; diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index cbeeda1e94a2..d86031b6ad79 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -2489,7 +2489,7 @@ bool nfsd4_spo_must_allow(struct svc_rqst *rqstp) int nfsd4_max_reply(struct svc_rqst *rqstp, struct nfsd4_op *op) { - if (op->opnum == OP_ILLEGAL) + if (op->opnum == OP_ILLEGAL || op->status == nfserr_notsupp) return op_encode_hdr_size * sizeof(__be32); BUG_ON(OPDESC(op)->op_rsize_bop == NULL); diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 31e1f9593457..59979f0bbd4b 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -747,6 +747,37 @@ static __be32 map_new_errors(u32 vers, __be32 nfserr) return nfserr; } +/* + * A write procedure can have a large argument, and a read procedure can + * have a large reply, but no NFSv2 or NFSv3 procedure has argument and + * reply that can both be larger than a page. The xdr code has taken + * advantage of this assumption to be a sloppy about bounds checking in + * some cases. Pending a rewrite of the NFSv2/v3 xdr code to fix that + * problem, we enforce these assumptions here: + */ +static bool nfs_request_too_big(struct svc_rqst *rqstp, + struct svc_procedure *proc) +{ + /* + * The ACL code has more careful bounds-checking and is not + * susceptible to this problem: + */ + if (rqstp->rq_prog != NFS_PROGRAM) + return false; + /* + * Ditto NFSv4 (which can in theory have argument and reply both + * more than a page): + */ + if (rqstp->rq_vers >= 4) + return false; + /* The reply will be small, we're OK: */ + if (proc->pc_xdrressize > 0 && + proc->pc_xdrressize < XDR_QUADLEN(PAGE_SIZE)) + return false; + + return rqstp->rq_arg.len > PAGE_SIZE; +} + int nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp) { @@ -759,6 +790,11 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp) rqstp->rq_vers, rqstp->rq_proc); proc = rqstp->rq_procinfo; + if (nfs_request_too_big(rqstp, proc)) { + dprintk("nfsd: NFSv%d argument too large\n", rqstp->rq_vers); + *statp = rpc_garbage_args; + return 1; + } /* * Give the xdr decoder a chance to change this if it wants * (necessary in the NFSv4.0 compound case) diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c index 41b468a6a90f..de07ff625777 100644 --- a/fs/nfsd/nfsxdr.c +++ b/fs/nfsd/nfsxdr.c @@ -280,6 +280,7 @@ nfssvc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd_writeargs *args) { unsigned int len, hdr, dlen; + struct kvec *head = rqstp->rq_arg.head; int v; p = decode_fh(p, &args->fh); @@ -300,9 +301,10 @@ nfssvc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p, * Check to make sure that we got the right number of * bytes. */ - hdr = (void*)p - rqstp->rq_arg.head[0].iov_base; - dlen = rqstp->rq_arg.head[0].iov_len + rqstp->rq_arg.page_len - - hdr; + hdr = (void*)p - head->iov_base; + if (hdr > head->iov_len) + return 0; + dlen = head->iov_len + rqstp->rq_arg.page_len - hdr; /* * Round the length of the data which was specified up to @@ -316,7 +318,7 @@ nfssvc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p, return 0; rqstp->rq_vec[0].iov_base = (void*)p; - rqstp->rq_vec[0].iov_len = rqstp->rq_arg.head[0].iov_len - hdr; + rqstp->rq_vec[0].iov_len = head->iov_len - hdr; v = 0; while (len > rqstp->rq_vec[v].iov_len) { len -= rqstp->rq_vec[v].iov_len; diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 19d50f600e8d..9aaf6ca77569 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1004,7 +1004,7 @@ out_nfserr: else err = nfserrno(host_err); if (test_bit(RQ_LOCAL, &rqstp->rq_flags)) - tsk_restore_flags(current, pflags, PF_LESS_THROTTLE); + current_restore_flags(pflags, PF_LESS_THROTTLE); return err; } diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index e1872f36147f..926682981d61 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -1068,7 +1068,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent) sb->s_time_gran = 1; sb->s_max_links = NILFS_LINK_MAX; - sb->s_bdi = bdev_get_queue(sb->s_bdev)->backing_dev_info; + sb->s_bdi = bdi_get(sb->s_bdev->bd_bdi); err = load_nilfs(nilfs, sb); if (err) diff --git a/fs/notify/Makefile b/fs/notify/Makefile index 96d3420d0242..3e969ae91b60 100644 --- a/fs/notify/Makefile +++ b/fs/notify/Makefile @@ -1,5 +1,5 @@ -obj-$(CONFIG_FSNOTIFY) += fsnotify.o notification.o group.o inode_mark.o \ - mark.o vfsmount_mark.o fdinfo.o +obj-$(CONFIG_FSNOTIFY) += fsnotify.o notification.o group.o mark.o \ + fdinfo.o obj-y += dnotify/ obj-y += inotify/ diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index 5a4ec309e283..2430a0415995 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -52,7 +52,7 @@ struct dnotify_mark { */ static void dnotify_recalc_inode_mask(struct fsnotify_mark *fsn_mark) { - __u32 new_mask, old_mask; + __u32 new_mask = 0; struct dnotify_struct *dn; struct dnotify_mark *dn_mark = container_of(fsn_mark, struct dnotify_mark, @@ -60,17 +60,13 @@ static void dnotify_recalc_inode_mask(struct fsnotify_mark *fsn_mark) assert_spin_locked(&fsn_mark->lock); - old_mask = fsn_mark->mask; - new_mask = 0; for (dn = dn_mark->dn; dn != NULL; dn = dn->dn_next) new_mask |= (dn->dn_mask & ~FS_DN_MULTISHOT); - fsnotify_set_mark_mask_locked(fsn_mark, new_mask); - - if (old_mask == new_mask) + if (fsn_mark->mask == new_mask) return; + fsn_mark->mask = new_mask; - if (fsn_mark->inode) - fsnotify_recalc_inode_mask(fsn_mark->inode); + fsnotify_recalc_mask(fsn_mark->connector); } /* @@ -86,7 +82,8 @@ static int dnotify_handle_event(struct fsnotify_group *group, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, u32 mask, const void *data, int data_type, - const unsigned char *file_name, u32 cookie) + const unsigned char *file_name, u32 cookie, + struct fsnotify_iter_info *iter_info) { struct dnotify_mark *dn_mark; struct dnotify_struct *dn; @@ -138,6 +135,7 @@ static void dnotify_free_mark(struct fsnotify_mark *fsn_mark) static struct fsnotify_ops dnotify_fsnotify_ops = { .handle_event = dnotify_handle_event, + .free_mark = dnotify_free_mark, }; /* @@ -160,7 +158,7 @@ void dnotify_flush(struct file *filp, fl_owner_t id) if (!S_ISDIR(inode->i_mode)) return; - fsn_mark = fsnotify_find_inode_mark(dnotify_group, inode); + fsn_mark = fsnotify_find_mark(&inode->i_fsnotify_marks, dnotify_group); if (!fsn_mark) return; dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark); @@ -308,7 +306,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg) /* set up the new_fsn_mark and new_dn_mark */ new_fsn_mark = &new_dn_mark->fsn_mark; - fsnotify_init_mark(new_fsn_mark, dnotify_free_mark); + fsnotify_init_mark(new_fsn_mark, dnotify_group); new_fsn_mark->mask = mask; new_dn_mark->dn = NULL; @@ -316,13 +314,12 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg) mutex_lock(&dnotify_group->mark_mutex); /* add the new_fsn_mark or find an old one. */ - fsn_mark = fsnotify_find_inode_mark(dnotify_group, inode); + fsn_mark = fsnotify_find_mark(&inode->i_fsnotify_marks, dnotify_group); if (fsn_mark) { dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark); spin_lock(&fsn_mark->lock); } else { - fsnotify_add_mark_locked(new_fsn_mark, dnotify_group, inode, - NULL, 0); + fsnotify_add_mark_locked(new_fsn_mark, inode, NULL, 0); spin_lock(&new_fsn_mark->lock); fsn_mark = new_fsn_mark; dn_mark = new_dn_mark; diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index e5f7e47de68e..2fa99aeaa095 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -57,14 +57,26 @@ static int fanotify_merge(struct list_head *list, struct fsnotify_event *event) #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS static int fanotify_get_response(struct fsnotify_group *group, - struct fanotify_perm_event_info *event) + struct fanotify_perm_event_info *event, + struct fsnotify_iter_info *iter_info) { int ret; pr_debug("%s: group=%p event=%p\n", __func__, group, event); + /* + * fsnotify_prepare_user_wait() fails if we race with mark deletion. + * Just let the operation pass in that case. + */ + if (!fsnotify_prepare_user_wait(iter_info)) { + event->response = FAN_ALLOW; + goto out; + } + wait_event(group->fanotify_data.access_waitq, event->response); + fsnotify_finish_user_wait(iter_info); +out: /* userspace responded, convert to something usable */ switch (event->response) { case FAN_ALLOW: @@ -174,7 +186,8 @@ static int fanotify_handle_event(struct fsnotify_group *group, struct fsnotify_mark *inode_mark, struct fsnotify_mark *fanotify_mark, u32 mask, const void *data, int data_type, - const unsigned char *file_name, u32 cookie) + const unsigned char *file_name, u32 cookie, + struct fsnotify_iter_info *iter_info) { int ret = 0; struct fanotify_event_info *event; @@ -215,7 +228,8 @@ static int fanotify_handle_event(struct fsnotify_group *group, #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS if (mask & FAN_ALL_PERM_EVENTS) { - ret = fanotify_get_response(group, FANOTIFY_PE(fsn_event)); + ret = fanotify_get_response(group, FANOTIFY_PE(fsn_event), + iter_info); fsnotify_destroy_event(group, fsn_event); } #endif @@ -248,8 +262,14 @@ static void fanotify_free_event(struct fsnotify_event *fsn_event) kmem_cache_free(fanotify_event_cachep, event); } +static void fanotify_free_mark(struct fsnotify_mark *fsn_mark) +{ + kmem_cache_free(fanotify_mark_cache, fsn_mark); +} + const struct fsnotify_ops fanotify_fsnotify_ops = { .handle_event = fanotify_handle_event, .free_group_priv = fanotify_free_group_priv, .free_event = fanotify_free_event, + .free_mark = fanotify_free_mark, }; diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h index 4500a74f8d38..4eb6f5efa282 100644 --- a/fs/notify/fanotify/fanotify.h +++ b/fs/notify/fanotify/fanotify.h @@ -2,6 +2,7 @@ #include <linux/path.h> #include <linux/slab.h> +extern struct kmem_cache *fanotify_mark_cache; extern struct kmem_cache *fanotify_event_cachep; extern struct kmem_cache *fanotify_perm_event_cachep; diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 2b37f2785834..907a481ac781 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -41,7 +41,7 @@ extern const struct fsnotify_ops fanotify_fsnotify_ops; -static struct kmem_cache *fanotify_mark_cache __read_mostly; +struct kmem_cache *fanotify_mark_cache __read_mostly; struct kmem_cache *fanotify_event_cachep __read_mostly; struct kmem_cache *fanotify_perm_event_cachep __read_mostly; @@ -295,27 +295,37 @@ static ssize_t fanotify_read(struct file *file, char __user *buf, } ret = copy_event_to_user(group, kevent, buf); + if (unlikely(ret == -EOPENSTALE)) { + /* + * We cannot report events with stale fd so drop it. + * Setting ret to 0 will continue the event loop and + * do the right thing if there are no more events to + * read (i.e. return bytes read, -EAGAIN or wait). + */ + ret = 0; + } + /* * Permission events get queued to wait for response. Other * events can be destroyed now. */ if (!(kevent->mask & FAN_ALL_PERM_EVENTS)) { fsnotify_destroy_event(group, kevent); - if (ret < 0) - break; } else { #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS - if (ret < 0) { + if (ret <= 0) { FANOTIFY_PE(kevent)->response = FAN_DENY; wake_up(&group->fanotify_data.access_waitq); - break; + } else { + spin_lock(&group->notification_lock); + list_add_tail(&kevent->list, + &group->fanotify_data.access_list); + spin_unlock(&group->notification_lock); } - spin_lock(&group->notification_lock); - list_add_tail(&kevent->list, - &group->fanotify_data.access_list); - spin_unlock(&group->notification_lock); #endif } + if (ret < 0) + break; buf += ret; count -= ret; } @@ -445,11 +455,6 @@ static const struct file_operations fanotify_fops = { .llseek = noop_llseek, }; -static void fanotify_free_mark(struct fsnotify_mark *fsn_mark) -{ - kmem_cache_free(fanotify_mark_cache, fsn_mark); -} - static int fanotify_find_path(int dfd, const char __user *filename, struct path *path, unsigned int flags) { @@ -511,13 +516,12 @@ static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark, tmask &= ~FAN_ONDIR; oldmask = fsn_mark->mask; - fsnotify_set_mark_mask_locked(fsn_mark, tmask); + fsn_mark->mask = tmask; } else { __u32 tmask = fsn_mark->ignored_mask & ~mask; if (flags & FAN_MARK_ONDIR) tmask &= ~FAN_ONDIR; - - fsnotify_set_mark_ignored_mask_locked(fsn_mark, tmask); + fsn_mark->ignored_mask = tmask; } *destroy = !(fsn_mark->mask | fsn_mark->ignored_mask); spin_unlock(&fsn_mark->lock); @@ -534,7 +538,8 @@ static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group, int destroy_mark; mutex_lock(&group->mark_mutex); - fsn_mark = fsnotify_find_vfsmount_mark(group, mnt); + fsn_mark = fsnotify_find_mark(&real_mount(mnt)->mnt_fsnotify_marks, + group); if (!fsn_mark) { mutex_unlock(&group->mark_mutex); return -ENOENT; @@ -542,6 +547,8 @@ static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group, removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags, &destroy_mark); + if (removed & real_mount(mnt)->mnt_fsnotify_mask) + fsnotify_recalc_mask(real_mount(mnt)->mnt_fsnotify_marks); if (destroy_mark) fsnotify_detach_mark(fsn_mark); mutex_unlock(&group->mark_mutex); @@ -549,9 +556,6 @@ static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group, fsnotify_free_mark(fsn_mark); fsnotify_put_mark(fsn_mark); - if (removed & real_mount(mnt)->mnt_fsnotify_mask) - fsnotify_recalc_vfsmount_mask(mnt); - return 0; } @@ -564,7 +568,7 @@ static int fanotify_remove_inode_mark(struct fsnotify_group *group, int destroy_mark; mutex_lock(&group->mark_mutex); - fsn_mark = fsnotify_find_inode_mark(group, inode); + fsn_mark = fsnotify_find_mark(&inode->i_fsnotify_marks, group); if (!fsn_mark) { mutex_unlock(&group->mark_mutex); return -ENOENT; @@ -572,16 +576,16 @@ static int fanotify_remove_inode_mark(struct fsnotify_group *group, removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags, &destroy_mark); + if (removed & inode->i_fsnotify_mask) + fsnotify_recalc_mask(inode->i_fsnotify_marks); if (destroy_mark) fsnotify_detach_mark(fsn_mark); mutex_unlock(&group->mark_mutex); if (destroy_mark) fsnotify_free_mark(fsn_mark); - /* matches the fsnotify_find_inode_mark() */ + /* matches the fsnotify_find_mark() */ fsnotify_put_mark(fsn_mark); - if (removed & inode->i_fsnotify_mask) - fsnotify_recalc_inode_mask(inode); return 0; } @@ -600,13 +604,13 @@ static __u32 fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark, tmask |= FAN_ONDIR; oldmask = fsn_mark->mask; - fsnotify_set_mark_mask_locked(fsn_mark, tmask); + fsn_mark->mask = tmask; } else { __u32 tmask = fsn_mark->ignored_mask | mask; if (flags & FAN_MARK_ONDIR) tmask |= FAN_ONDIR; - fsnotify_set_mark_ignored_mask_locked(fsn_mark, tmask); + fsn_mark->ignored_mask = tmask; if (flags & FAN_MARK_IGNORED_SURV_MODIFY) fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY; } @@ -629,8 +633,8 @@ static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group, if (!mark) return ERR_PTR(-ENOMEM); - fsnotify_init_mark(mark, fanotify_free_mark); - ret = fsnotify_add_mark_locked(mark, group, inode, mnt, 0); + fsnotify_init_mark(mark, group); + ret = fsnotify_add_mark_locked(mark, inode, mnt, 0); if (ret) { fsnotify_put_mark(mark); return ERR_PTR(ret); @@ -648,7 +652,8 @@ static int fanotify_add_vfsmount_mark(struct fsnotify_group *group, __u32 added; mutex_lock(&group->mark_mutex); - fsn_mark = fsnotify_find_vfsmount_mark(group, mnt); + fsn_mark = fsnotify_find_mark(&real_mount(mnt)->mnt_fsnotify_marks, + group); if (!fsn_mark) { fsn_mark = fanotify_add_new_mark(group, NULL, mnt); if (IS_ERR(fsn_mark)) { @@ -657,10 +662,9 @@ static int fanotify_add_vfsmount_mark(struct fsnotify_group *group, } } added = fanotify_mark_add_to_mask(fsn_mark, mask, flags); - mutex_unlock(&group->mark_mutex); - if (added & ~real_mount(mnt)->mnt_fsnotify_mask) - fsnotify_recalc_vfsmount_mask(mnt); + fsnotify_recalc_mask(real_mount(mnt)->mnt_fsnotify_marks); + mutex_unlock(&group->mark_mutex); fsnotify_put_mark(fsn_mark); return 0; @@ -686,7 +690,7 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group, return 0; mutex_lock(&group->mark_mutex); - fsn_mark = fsnotify_find_inode_mark(group, inode); + fsn_mark = fsnotify_find_mark(&inode->i_fsnotify_marks, group); if (!fsn_mark) { fsn_mark = fanotify_add_new_mark(group, inode, NULL); if (IS_ERR(fsn_mark)) { @@ -695,10 +699,9 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group, } } added = fanotify_mark_add_to_mask(fsn_mark, mask, flags); - mutex_unlock(&group->mark_mutex); - if (added & ~inode->i_fsnotify_mask) - fsnotify_recalc_inode_mask(inode); + fsnotify_recalc_mask(inode->i_fsnotify_marks); + mutex_unlock(&group->mark_mutex); fsnotify_put_mark(fsn_mark); return 0; diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c index fd98e5100cab..dd63aa9a6f9a 100644 --- a/fs/notify/fdinfo.c +++ b/fs/notify/fdinfo.c @@ -76,12 +76,11 @@ static void inotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark) struct inotify_inode_mark *inode_mark; struct inode *inode; - if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE) || - !(mark->flags & FSNOTIFY_MARK_FLAG_INODE)) + if (!(mark->connector->flags & FSNOTIFY_OBJ_TYPE_INODE)) return; inode_mark = container_of(mark, struct inotify_inode_mark, fsn_mark); - inode = igrab(mark->inode); + inode = igrab(mark->connector->inode); if (inode) { /* * IN_ALL_EVENTS represents all of the mask bits @@ -113,14 +112,11 @@ static void fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark) unsigned int mflags = 0; struct inode *inode; - if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) - return; - if (mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY) mflags |= FAN_MARK_IGNORED_SURV_MODIFY; - if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) { - inode = igrab(mark->inode); + if (mark->connector->flags & FSNOTIFY_OBJ_TYPE_INODE) { + inode = igrab(mark->connector->inode); if (!inode) return; seq_printf(m, "fanotify ino:%lx sdev:%x mflags:%x mask:%x ignored_mask:%x ", @@ -129,8 +125,8 @@ static void fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark) show_mark_fhandle(m, inode); seq_putc(m, '\n'); iput(inode); - } else if (mark->flags & FSNOTIFY_MARK_FLAG_VFSMOUNT) { - struct mount *mnt = real_mount(mark->mnt); + } else if (mark->connector->flags & FSNOTIFY_OBJ_TYPE_VFSMOUNT) { + struct mount *mnt = real_mount(mark->connector->mnt); seq_printf(m, "fanotify mnt_id:%x mflags:%x mask:%x ignored_mask:%x\n", mnt->mnt_id, mflags, mark->mask, mark->ignored_mask); diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index b41515d3f081..01a9f0f007d4 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -41,6 +41,63 @@ void __fsnotify_vfsmount_delete(struct vfsmount *mnt) fsnotify_clear_marks_by_mount(mnt); } +/** + * fsnotify_unmount_inodes - an sb is unmounting. handle any watched inodes. + * @sb: superblock being unmounted. + * + * Called during unmount with no locks held, so needs to be safe against + * concurrent modifiers. We temporarily drop sb->s_inode_list_lock and CAN block. + */ +void fsnotify_unmount_inodes(struct super_block *sb) +{ + struct inode *inode, *iput_inode = NULL; + + spin_lock(&sb->s_inode_list_lock); + list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { + /* + * We cannot __iget() an inode in state I_FREEING, + * I_WILL_FREE, or I_NEW which is fine because by that point + * the inode cannot have any associated watches. + */ + spin_lock(&inode->i_lock); + if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) { + spin_unlock(&inode->i_lock); + continue; + } + + /* + * If i_count is zero, the inode cannot have any watches and + * doing an __iget/iput with MS_ACTIVE clear would actually + * evict all inodes with zero i_count from icache which is + * unnecessarily violent and may in fact be illegal to do. + */ + if (!atomic_read(&inode->i_count)) { + spin_unlock(&inode->i_lock); + continue; + } + + __iget(inode); + spin_unlock(&inode->i_lock); + spin_unlock(&sb->s_inode_list_lock); + + if (iput_inode) + iput(iput_inode); + + /* for each watch, send FS_UNMOUNT and then remove it */ + fsnotify(inode, FS_UNMOUNT, inode, FSNOTIFY_EVENT_INODE, NULL, 0); + + fsnotify_inode_delete(inode); + + iput_inode = inode; + + spin_lock(&sb->s_inode_list_lock); + } + spin_unlock(&sb->s_inode_list_lock); + + if (iput_inode) + iput(iput_inode); +} + /* * Given an inode, first check if we care what happens to our children. Inotify * and dnotify both tell their parents about events. If we care about any event @@ -127,7 +184,8 @@ static int send_to_group(struct inode *to_tell, struct fsnotify_mark *vfsmount_mark, __u32 mask, const void *data, int data_is, u32 cookie, - const unsigned char *file_name) + const unsigned char *file_name, + struct fsnotify_iter_info *iter_info) { struct fsnotify_group *group = NULL; __u32 inode_test_mask = 0; @@ -178,7 +236,7 @@ static int send_to_group(struct inode *to_tell, return group->ops->handle_event(group, to_tell, inode_mark, vfsmount_mark, mask, data, data_is, - file_name, cookie); + file_name, cookie, iter_info); } /* @@ -193,8 +251,10 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is, struct hlist_node *inode_node = NULL, *vfsmount_node = NULL; struct fsnotify_mark *inode_mark = NULL, *vfsmount_mark = NULL; struct fsnotify_group *inode_group, *vfsmount_group; + struct fsnotify_mark_connector *inode_conn, *vfsmount_conn; + struct fsnotify_iter_info iter_info; struct mount *mnt; - int idx, ret = 0; + int ret = 0; /* global tests shouldn't care about events on child only the specific event */ __u32 test_mask = (mask & ~FS_EVENT_ON_CHILD); @@ -210,8 +270,8 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is, * SRCU because we have no references to any objects and do not * need SRCU to keep them "alive". */ - if (hlist_empty(&to_tell->i_fsnotify_marks) && - (!mnt || hlist_empty(&mnt->mnt_fsnotify_marks))) + if (!to_tell->i_fsnotify_marks && + (!mnt || !mnt->mnt_fsnotify_marks)) return 0; /* * if this is a modify event we may need to clear the ignored masks @@ -223,19 +283,30 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is, !(mnt && test_mask & mnt->mnt_fsnotify_mask)) return 0; - idx = srcu_read_lock(&fsnotify_mark_srcu); + iter_info.srcu_idx = srcu_read_lock(&fsnotify_mark_srcu); if ((mask & FS_MODIFY) || - (test_mask & to_tell->i_fsnotify_mask)) - inode_node = srcu_dereference(to_tell->i_fsnotify_marks.first, + (test_mask & to_tell->i_fsnotify_mask)) { + inode_conn = srcu_dereference(to_tell->i_fsnotify_marks, &fsnotify_mark_srcu); + if (inode_conn) + inode_node = srcu_dereference(inode_conn->list.first, + &fsnotify_mark_srcu); + } if (mnt && ((mask & FS_MODIFY) || (test_mask & mnt->mnt_fsnotify_mask))) { - vfsmount_node = srcu_dereference(mnt->mnt_fsnotify_marks.first, - &fsnotify_mark_srcu); - inode_node = srcu_dereference(to_tell->i_fsnotify_marks.first, + inode_conn = srcu_dereference(to_tell->i_fsnotify_marks, &fsnotify_mark_srcu); + if (inode_conn) + inode_node = srcu_dereference(inode_conn->list.first, + &fsnotify_mark_srcu); + vfsmount_conn = srcu_dereference(mnt->mnt_fsnotify_marks, + &fsnotify_mark_srcu); + if (vfsmount_conn) + vfsmount_node = srcu_dereference( + vfsmount_conn->list.first, + &fsnotify_mark_srcu); } /* @@ -272,8 +343,13 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is, vfsmount_mark = NULL; } } + + iter_info.inode_mark = inode_mark; + iter_info.vfsmount_mark = vfsmount_mark; + ret = send_to_group(to_tell, inode_mark, vfsmount_mark, mask, - data, data_is, cookie, file_name); + data, data_is, cookie, file_name, + &iter_info); if (ret && (mask & ALL_FSNOTIFY_PERM_EVENTS)) goto out; @@ -287,12 +363,14 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is, } ret = 0; out: - srcu_read_unlock(&fsnotify_mark_srcu, idx); + srcu_read_unlock(&fsnotify_mark_srcu, iter_info.srcu_idx); return ret; } EXPORT_SYMBOL_GPL(fsnotify); +extern struct kmem_cache *fsnotify_mark_connector_cachep; + static __init int fsnotify_init(void) { int ret; @@ -303,6 +381,9 @@ static __init int fsnotify_init(void) if (ret) panic("initializing fsnotify_mark_srcu"); + fsnotify_mark_connector_cachep = KMEM_CACHE(fsnotify_mark_connector, + SLAB_PANIC); + return 0; } core_initcall(fsnotify_init); diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h index 0a3bc2cf192c..bf012e8ecd14 100644 --- a/fs/notify/fsnotify.h +++ b/fs/notify/fsnotify.h @@ -8,60 +8,36 @@ #include "../mount.h" +struct fsnotify_iter_info { + struct fsnotify_mark *inode_mark; + struct fsnotify_mark *vfsmount_mark; + int srcu_idx; +}; + /* destroy all events sitting in this groups notification queue */ extern void fsnotify_flush_notify(struct fsnotify_group *group); /* protects reads of inode and vfsmount marks list */ extern struct srcu_struct fsnotify_mark_srcu; -/* Calculate mask of events for a list of marks */ -extern u32 fsnotify_recalc_mask(struct hlist_head *head); - /* compare two groups for sorting of marks lists */ extern int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b); -extern void fsnotify_set_inode_mark_mask_locked(struct fsnotify_mark *fsn_mark, - __u32 mask); -/* Add mark to a proper place in mark list */ -extern int fsnotify_add_mark_list(struct hlist_head *head, - struct fsnotify_mark *mark, - int allow_dups); -/* add a mark to an inode */ -extern int fsnotify_add_inode_mark(struct fsnotify_mark *mark, - struct fsnotify_group *group, struct inode *inode, - int allow_dups); -/* add a mark to a vfsmount */ -extern int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark, - struct fsnotify_group *group, struct vfsmount *mnt, - int allow_dups); - -/* vfsmount specific destruction of a mark */ -extern void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark); -/* inode specific destruction of a mark */ -extern void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark); -/* Find mark belonging to given group in the list of marks */ -extern struct fsnotify_mark *fsnotify_find_mark(struct hlist_head *head, - struct fsnotify_group *group); -/* Destroy all marks in the given list protected by 'lock' */ -extern void fsnotify_destroy_marks(struct hlist_head *head, spinlock_t *lock); +/* Destroy all marks connected via given connector */ +extern void fsnotify_destroy_marks(struct fsnotify_mark_connector __rcu **connp); /* run the list of all marks associated with inode and destroy them */ static inline void fsnotify_clear_marks_by_inode(struct inode *inode) { - fsnotify_destroy_marks(&inode->i_fsnotify_marks, &inode->i_lock); + fsnotify_destroy_marks(&inode->i_fsnotify_marks); } /* run the list of all marks associated with vfsmount and destroy them */ static inline void fsnotify_clear_marks_by_mount(struct vfsmount *mnt) { - fsnotify_destroy_marks(&real_mount(mnt)->mnt_fsnotify_marks, - &mnt->mnt_root->d_lock); + fsnotify_destroy_marks(&real_mount(mnt)->mnt_fsnotify_marks); } -/* prepare for freeing all marks associated with given group */ -extern void fsnotify_detach_group_marks(struct fsnotify_group *group); -/* - * wait for fsnotify_mark_srcu period to end and free all marks in destroy_list - */ -extern void fsnotify_mark_destroy_list(void); +/* Wait until all marks queued for destruction are destroyed */ +extern void fsnotify_wait_marks_destroyed(void); /* * update the dentry->d_flags of all of inode's children to indicate if inode cares diff --git a/fs/notify/group.c b/fs/notify/group.c index fbe3cbebec16..32357534de18 100644 --- a/fs/notify/group.c +++ b/fs/notify/group.c @@ -66,14 +66,23 @@ void fsnotify_destroy_group(struct fsnotify_group *group) */ fsnotify_group_stop_queueing(group); - /* clear all inode marks for this group, attach them to destroy_list */ - fsnotify_detach_group_marks(group); + /* Clear all marks for this group and queue them for destruction */ + fsnotify_clear_marks_by_group(group, FSNOTIFY_OBJ_ALL_TYPES); /* - * Wait for fsnotify_mark_srcu period to end and free all marks in - * destroy_list + * Some marks can still be pinned when waiting for response from + * userspace. Wait for those now. fsnotify_prepare_user_wait() will + * not succeed now so this wait is race-free. */ - fsnotify_mark_destroy_list(); + wait_event(group->notification_waitq, !atomic_read(&group->user_waits)); + + /* + * Wait until all marks get really destroyed. We could actually destroy + * them ourselves instead of waiting for worker to do it, however that + * would be racy as worker can already be processing some marks before + * we even entered fsnotify_destroy_group(). + */ + fsnotify_wait_marks_destroyed(); /* * Since we have waited for fsnotify_mark_srcu in @@ -124,6 +133,7 @@ struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops) /* set to 0 when there a no external references to this group */ atomic_set(&group->refcnt, 1); atomic_set(&group->num_marks, 0); + atomic_set(&group->user_waits, 0); spin_lock_init(&group->notification_lock); INIT_LIST_HEAD(&group->notification_list); diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c deleted file mode 100644 index a3645249f7ec..000000000000 --- a/fs/notify/inode_mark.c +++ /dev/null @@ -1,199 +0,0 @@ -/* - * Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2, or (at your option) - * any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; see the file COPYING. If not, write to - * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -#include <linux/fs.h> -#include <linux/init.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/mutex.h> -#include <linux/spinlock.h> - -#include <linux/atomic.h> - -#include <linux/fsnotify_backend.h> -#include "fsnotify.h" - -#include "../internal.h" - -/* - * Recalculate the inode->i_fsnotify_mask, or the mask of all FS_* event types - * any notifier is interested in hearing for this inode. - */ -void fsnotify_recalc_inode_mask(struct inode *inode) -{ - spin_lock(&inode->i_lock); - inode->i_fsnotify_mask = fsnotify_recalc_mask(&inode->i_fsnotify_marks); - spin_unlock(&inode->i_lock); - - __fsnotify_update_child_dentry_flags(inode); -} - -void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark) -{ - struct inode *inode = mark->inode; - - BUG_ON(!mutex_is_locked(&mark->group->mark_mutex)); - assert_spin_locked(&mark->lock); - - spin_lock(&inode->i_lock); - - hlist_del_init_rcu(&mark->obj_list); - mark->inode = NULL; - - /* - * this mark is now off the inode->i_fsnotify_marks list and we - * hold the inode->i_lock, so this is the perfect time to update the - * inode->i_fsnotify_mask - */ - inode->i_fsnotify_mask = fsnotify_recalc_mask(&inode->i_fsnotify_marks); - spin_unlock(&inode->i_lock); -} - -/* - * Given a group clear all of the inode marks associated with that group. - */ -void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group) -{ - fsnotify_clear_marks_by_group_flags(group, FSNOTIFY_MARK_FLAG_INODE); -} - -/* - * given a group and inode, find the mark associated with that combination. - * if found take a reference to that mark and return it, else return NULL - */ -struct fsnotify_mark *fsnotify_find_inode_mark(struct fsnotify_group *group, - struct inode *inode) -{ - struct fsnotify_mark *mark; - - spin_lock(&inode->i_lock); - mark = fsnotify_find_mark(&inode->i_fsnotify_marks, group); - spin_unlock(&inode->i_lock); - - return mark; -} - -/* - * If we are setting a mark mask on an inode mark we should pin the inode - * in memory. - */ -void fsnotify_set_inode_mark_mask_locked(struct fsnotify_mark *mark, - __u32 mask) -{ - struct inode *inode; - - assert_spin_locked(&mark->lock); - - if (mask && - mark->inode && - !(mark->flags & FSNOTIFY_MARK_FLAG_OBJECT_PINNED)) { - mark->flags |= FSNOTIFY_MARK_FLAG_OBJECT_PINNED; - inode = igrab(mark->inode); - /* - * we shouldn't be able to get here if the inode wasn't - * already safely held in memory. But bug in case it - * ever is wrong. - */ - BUG_ON(!inode); - } -} - -/* - * Attach an initialized mark to a given inode. - * These marks may be used for the fsnotify backend to determine which - * event types should be delivered to which group and for which inodes. These - * marks are ordered according to priority, highest number first, and then by - * the group's location in memory. - */ -int fsnotify_add_inode_mark(struct fsnotify_mark *mark, - struct fsnotify_group *group, struct inode *inode, - int allow_dups) -{ - int ret; - - mark->flags |= FSNOTIFY_MARK_FLAG_INODE; - - BUG_ON(!mutex_is_locked(&group->mark_mutex)); - assert_spin_locked(&mark->lock); - - spin_lock(&inode->i_lock); - mark->inode = inode; - ret = fsnotify_add_mark_list(&inode->i_fsnotify_marks, mark, - allow_dups); - inode->i_fsnotify_mask = fsnotify_recalc_mask(&inode->i_fsnotify_marks); - spin_unlock(&inode->i_lock); - - return ret; -} - -/** - * fsnotify_unmount_inodes - an sb is unmounting. handle any watched inodes. - * @sb: superblock being unmounted. - * - * Called during unmount with no locks held, so needs to be safe against - * concurrent modifiers. We temporarily drop sb->s_inode_list_lock and CAN block. - */ -void fsnotify_unmount_inodes(struct super_block *sb) -{ - struct inode *inode, *iput_inode = NULL; - - spin_lock(&sb->s_inode_list_lock); - list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { - /* - * We cannot __iget() an inode in state I_FREEING, - * I_WILL_FREE, or I_NEW which is fine because by that point - * the inode cannot have any associated watches. - */ - spin_lock(&inode->i_lock); - if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) { - spin_unlock(&inode->i_lock); - continue; - } - - /* - * If i_count is zero, the inode cannot have any watches and - * doing an __iget/iput with MS_ACTIVE clear would actually - * evict all inodes with zero i_count from icache which is - * unnecessarily violent and may in fact be illegal to do. - */ - if (!atomic_read(&inode->i_count)) { - spin_unlock(&inode->i_lock); - continue; - } - - __iget(inode); - spin_unlock(&inode->i_lock); - spin_unlock(&sb->s_inode_list_lock); - - if (iput_inode) - iput(iput_inode); - - /* for each watch, send FS_UNMOUNT and then remove it */ - fsnotify(inode, FS_UNMOUNT, inode, FSNOTIFY_EVENT_INODE, NULL, 0); - - fsnotify_inode_delete(inode); - - iput_inode = inode; - - spin_lock(&sb->s_inode_list_lock); - } - spin_unlock(&sb->s_inode_list_lock); - - if (iput_inode) - iput(iput_inode); -} diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h index 7c461fd49c4c..9ff67b61da8a 100644 --- a/fs/notify/inotify/inotify.h +++ b/fs/notify/inotify/inotify.h @@ -27,9 +27,11 @@ extern int inotify_handle_event(struct fsnotify_group *group, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, u32 mask, const void *data, int data_type, - const unsigned char *file_name, u32 cookie); + const unsigned char *file_name, u32 cookie, + struct fsnotify_iter_info *iter_info); extern const struct fsnotify_ops inotify_fsnotify_ops; +extern struct kmem_cache *inotify_inode_mark_cachep; #ifdef CONFIG_INOTIFY_USER static inline void dec_inotify_instances(struct ucounts *ucounts) diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index 1aeb837ae414..8b73332735ba 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -68,7 +68,8 @@ int inotify_handle_event(struct fsnotify_group *group, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, u32 mask, const void *data, int data_type, - const unsigned char *file_name, u32 cookie) + const unsigned char *file_name, u32 cookie, + struct fsnotify_iter_info *iter_info) { struct inotify_inode_mark *i_mark; struct inotify_event_info *event; @@ -156,8 +157,8 @@ static int idr_callback(int id, void *p, void *data) * BUG() that was here. */ if (fsn_mark) - printk(KERN_WARNING "fsn_mark->group=%p inode=%p wd=%d\n", - fsn_mark->group, fsn_mark->inode, i_mark->wd); + printk(KERN_WARNING "fsn_mark->group=%p wd=%d\n", + fsn_mark->group, i_mark->wd); return 0; } @@ -175,9 +176,20 @@ static void inotify_free_event(struct fsnotify_event *fsn_event) kfree(INOTIFY_E(fsn_event)); } +/* ding dong the mark is dead */ +static void inotify_free_mark(struct fsnotify_mark *fsn_mark) +{ + struct inotify_inode_mark *i_mark; + + i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark); + + kmem_cache_free(inotify_inode_mark_cachep, i_mark); +} + const struct fsnotify_ops inotify_fsnotify_ops = { .handle_event = inotify_handle_event, .free_group_priv = inotify_free_group_priv, .free_event = inotify_free_event, .freeing_mark = inotify_freeing_mark, + .free_mark = inotify_free_mark, }; diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 498d609b26c7..7cc7d3fb1862 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -47,7 +47,7 @@ /* configurable via /proc/sys/fs/inotify/ */ static int inotify_max_queued_events __read_mostly; -static struct kmem_cache *inotify_inode_mark_cachep __read_mostly; +struct kmem_cache *inotify_inode_mark_cachep __read_mostly; #ifdef CONFIG_SYSCTL @@ -395,21 +395,6 @@ static struct inotify_inode_mark *inotify_idr_find(struct fsnotify_group *group, return i_mark; } -static void do_inotify_remove_from_idr(struct fsnotify_group *group, - struct inotify_inode_mark *i_mark) -{ - struct idr *idr = &group->inotify_data.idr; - spinlock_t *idr_lock = &group->inotify_data.idr_lock; - int wd = i_mark->wd; - - assert_spin_locked(idr_lock); - - idr_remove(idr, wd); - - /* removed from the idr, drop that ref */ - fsnotify_put_mark(&i_mark->fsn_mark); -} - /* * Remove the mark from the idr (if present) and drop the reference * on the mark because it was in the idr. @@ -417,6 +402,7 @@ static void do_inotify_remove_from_idr(struct fsnotify_group *group, static void inotify_remove_from_idr(struct fsnotify_group *group, struct inotify_inode_mark *i_mark) { + struct idr *idr = &group->inotify_data.idr; spinlock_t *idr_lock = &group->inotify_data.idr_lock; struct inotify_inode_mark *found_i_mark = NULL; int wd; @@ -429,18 +415,16 @@ static void inotify_remove_from_idr(struct fsnotify_group *group, * if it wasn't.... */ if (wd == -1) { - WARN_ONCE(1, "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p" - " i_mark->inode=%p\n", __func__, i_mark, i_mark->wd, - i_mark->fsn_mark.group, i_mark->fsn_mark.inode); + WARN_ONCE(1, "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p\n", + __func__, i_mark, i_mark->wd, i_mark->fsn_mark.group); goto out; } /* Lets look in the idr to see if we find it */ found_i_mark = inotify_idr_find_locked(group, wd); if (unlikely(!found_i_mark)) { - WARN_ONCE(1, "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p" - " i_mark->inode=%p\n", __func__, i_mark, i_mark->wd, - i_mark->fsn_mark.group, i_mark->fsn_mark.inode); + WARN_ONCE(1, "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p\n", + __func__, i_mark, i_mark->wd, i_mark->fsn_mark.group); goto out; } @@ -451,35 +435,33 @@ static void inotify_remove_from_idr(struct fsnotify_group *group, */ if (unlikely(found_i_mark != i_mark)) { WARN_ONCE(1, "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p " - "mark->inode=%p found_i_mark=%p found_i_mark->wd=%d " - "found_i_mark->group=%p found_i_mark->inode=%p\n", - __func__, i_mark, i_mark->wd, i_mark->fsn_mark.group, - i_mark->fsn_mark.inode, found_i_mark, found_i_mark->wd, - found_i_mark->fsn_mark.group, - found_i_mark->fsn_mark.inode); + "found_i_mark=%p found_i_mark->wd=%d " + "found_i_mark->group=%p\n", __func__, i_mark, + i_mark->wd, i_mark->fsn_mark.group, found_i_mark, + found_i_mark->wd, found_i_mark->fsn_mark.group); goto out; } /* * One ref for being in the idr - * one ref held by the caller trying to kill us * one ref grabbed by inotify_idr_find */ - if (unlikely(atomic_read(&i_mark->fsn_mark.refcnt) < 3)) { - printk(KERN_ERR "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p" - " i_mark->inode=%p\n", __func__, i_mark, i_mark->wd, - i_mark->fsn_mark.group, i_mark->fsn_mark.inode); + if (unlikely(atomic_read(&i_mark->fsn_mark.refcnt) < 2)) { + printk(KERN_ERR "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p\n", + __func__, i_mark, i_mark->wd, i_mark->fsn_mark.group); /* we can't really recover with bad ref cnting.. */ BUG(); } - do_inotify_remove_from_idr(group, i_mark); + idr_remove(idr, wd); + /* Removed from the idr, drop that ref. */ + fsnotify_put_mark(&i_mark->fsn_mark); out: + i_mark->wd = -1; + spin_unlock(idr_lock); /* match the ref taken by inotify_idr_find_locked() */ if (found_i_mark) fsnotify_put_mark(&found_i_mark->fsn_mark); - i_mark->wd = -1; - spin_unlock(idr_lock); } /* @@ -492,7 +474,7 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark, /* Queue ignore event for the watch */ inotify_handle_event(group, NULL, fsn_mark, NULL, FS_IN_IGNORED, - NULL, FSNOTIFY_EVENT_NONE, NULL, 0); + NULL, FSNOTIFY_EVENT_NONE, NULL, 0, NULL); i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark); /* remove this mark from the idr */ @@ -501,16 +483,6 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark, dec_inotify_watches(group->inotify_data.ucounts); } -/* ding dong the mark is dead */ -static void inotify_free_mark(struct fsnotify_mark *fsn_mark) -{ - struct inotify_inode_mark *i_mark; - - i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark); - - kmem_cache_free(inotify_inode_mark_cachep, i_mark); -} - static int inotify_update_existing_watch(struct fsnotify_group *group, struct inode *inode, u32 arg) @@ -524,21 +496,19 @@ static int inotify_update_existing_watch(struct fsnotify_group *group, mask = inotify_arg_to_mask(arg); - fsn_mark = fsnotify_find_inode_mark(group, inode); + fsn_mark = fsnotify_find_mark(&inode->i_fsnotify_marks, group); if (!fsn_mark) return -ENOENT; i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark); spin_lock(&fsn_mark->lock); - old_mask = fsn_mark->mask; if (add) - fsnotify_set_mark_mask_locked(fsn_mark, (fsn_mark->mask | mask)); + fsn_mark->mask |= mask; else - fsnotify_set_mark_mask_locked(fsn_mark, mask); + fsn_mark->mask = mask; new_mask = fsn_mark->mask; - spin_unlock(&fsn_mark->lock); if (old_mask != new_mask) { @@ -549,7 +519,7 @@ static int inotify_update_existing_watch(struct fsnotify_group *group, /* update the inode with this new fsn_mark */ if (dropped || do_inode) - fsnotify_recalc_inode_mask(inode); + fsnotify_recalc_mask(inode->i_fsnotify_marks); } @@ -578,7 +548,7 @@ static int inotify_new_watch(struct fsnotify_group *group, if (unlikely(!tmp_i_mark)) return -ENOMEM; - fsnotify_init_mark(&tmp_i_mark->fsn_mark, inotify_free_mark); + fsnotify_init_mark(&tmp_i_mark->fsn_mark, group); tmp_i_mark->fsn_mark.mask = mask; tmp_i_mark->wd = -1; @@ -594,8 +564,7 @@ static int inotify_new_watch(struct fsnotify_group *group, } /* we are on the idr, now get on the inode */ - ret = fsnotify_add_mark_locked(&tmp_i_mark->fsn_mark, group, inode, - NULL, 0); + ret = fsnotify_add_mark_locked(&tmp_i_mark->fsn_mark, inode, NULL, 0); if (ret) { /* we failed to get on the inode, get off the idr */ inotify_remove_from_idr(group, tmp_i_mark); diff --git a/fs/notify/mark.c b/fs/notify/mark.c index 6043306e8e21..9991f8826734 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -33,7 +33,7 @@ * * group->mark_mutex * mark->lock - * inode->i_lock + * mark->connector->lock * * group->mark_mutex protects the marks_list anchored inside a given group and * each mark is hooked via the g_list. It also protects the groups private @@ -44,14 +44,22 @@ * is assigned to as well as the access to a reference of the inode/vfsmount * that is being watched by the mark. * - * inode->i_lock protects the i_fsnotify_marks list anchored inside a - * given inode and each mark is hooked via the i_list. (and sorta the - * free_i_list) + * mark->connector->lock protects the list of marks anchored inside an + * inode / vfsmount and each mark is hooked via the i_list. * + * A list of notification marks relating to inode / mnt is contained in + * fsnotify_mark_connector. That structure is alive as long as there are any + * marks in the list and is also protected by fsnotify_mark_srcu. A mark gets + * detached from fsnotify_mark_connector when last reference to the mark is + * dropped. Thus having mark reference is enough to protect mark->connector + * pointer and to make sure fsnotify_mark_connector cannot disappear. Also + * because we remove mark from g_list before dropping mark reference associated + * with that, any mark found through g_list is guaranteed to have + * mark->connector set until we drop group->mark_mutex. * * LIFETIME: * Inode marks survive between when they are added to an inode and when their - * refcnt==0. + * refcnt==0. Marks are also protected by fsnotify_mark_srcu. * * The inode mark can be cleared for a number of different reasons including: * - The inode is unlinked for the last time. (fsnotify_inode_remove) @@ -61,17 +69,6 @@ * - The fsnotify_group associated with the mark is going away and all such marks * need to be cleaned up. (fsnotify_clear_marks_by_group) * - * Worst case we are given an inode and need to clean up all the marks on that - * inode. We take i_lock and walk the i_fsnotify_marks safely. For each - * mark on the list we take a reference (so the mark can't disappear under us). - * We remove that mark form the inode's list of marks and we add this mark to a - * private list anchored on the stack using i_free_list; we walk i_free_list - * and before we destroy the mark we make sure that we dont race with a - * concurrent destroy_group by getting a ref to the marks group and taking the - * groups mutex. - - * Very similarly for freeing by group, except we use free_g_list. - * * This has the very interesting property of being able to run concurrently with * any (or all) other directions. */ @@ -94,94 +91,281 @@ #define FSNOTIFY_REAPER_DELAY (1) /* 1 jiffy */ struct srcu_struct fsnotify_mark_srcu; +struct kmem_cache *fsnotify_mark_connector_cachep; + static DEFINE_SPINLOCK(destroy_lock); static LIST_HEAD(destroy_list); +static struct fsnotify_mark_connector *connector_destroy_list; static void fsnotify_mark_destroy_workfn(struct work_struct *work); static DECLARE_DELAYED_WORK(reaper_work, fsnotify_mark_destroy_workfn); +static void fsnotify_connector_destroy_workfn(struct work_struct *work); +static DECLARE_WORK(connector_reaper_work, fsnotify_connector_destroy_workfn); + void fsnotify_get_mark(struct fsnotify_mark *mark) { + WARN_ON_ONCE(!atomic_read(&mark->refcnt)); atomic_inc(&mark->refcnt); } -void fsnotify_put_mark(struct fsnotify_mark *mark) +/* + * Get mark reference when we found the mark via lockless traversal of object + * list. Mark can be already removed from the list by now and on its way to be + * destroyed once SRCU period ends. + */ +static bool fsnotify_get_mark_safe(struct fsnotify_mark *mark) { - if (atomic_dec_and_test(&mark->refcnt)) { - if (mark->group) - fsnotify_put_group(mark->group); - mark->free_mark(mark); - } + return atomic_inc_not_zero(&mark->refcnt); } -/* Calculate mask of events for a list of marks */ -u32 fsnotify_recalc_mask(struct hlist_head *head) +static void __fsnotify_recalc_mask(struct fsnotify_mark_connector *conn) { u32 new_mask = 0; struct fsnotify_mark *mark; - hlist_for_each_entry(mark, head, obj_list) - new_mask |= mark->mask; - return new_mask; + assert_spin_locked(&conn->lock); + hlist_for_each_entry(mark, &conn->list, obj_list) { + if (mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED) + new_mask |= mark->mask; + } + if (conn->flags & FSNOTIFY_OBJ_TYPE_INODE) + conn->inode->i_fsnotify_mask = new_mask; + else if (conn->flags & FSNOTIFY_OBJ_TYPE_VFSMOUNT) + real_mount(conn->mnt)->mnt_fsnotify_mask = new_mask; } /* - * Remove mark from inode / vfsmount list, group list, drop inode reference - * if we got one. - * - * Must be called with group->mark_mutex held. + * Calculate mask of events for a list of marks. The caller must make sure + * connector and connector->inode cannot disappear under us. Callers achieve + * this by holding a mark->lock or mark->group->mark_mutex for a mark on this + * list. */ -void fsnotify_detach_mark(struct fsnotify_mark *mark) +void fsnotify_recalc_mask(struct fsnotify_mark_connector *conn) +{ + if (!conn) + return; + + spin_lock(&conn->lock); + __fsnotify_recalc_mask(conn); + spin_unlock(&conn->lock); + if (conn->flags & FSNOTIFY_OBJ_TYPE_INODE) + __fsnotify_update_child_dentry_flags(conn->inode); +} + +/* Free all connectors queued for freeing once SRCU period ends */ +static void fsnotify_connector_destroy_workfn(struct work_struct *work) +{ + struct fsnotify_mark_connector *conn, *free; + + spin_lock(&destroy_lock); + conn = connector_destroy_list; + connector_destroy_list = NULL; + spin_unlock(&destroy_lock); + + synchronize_srcu(&fsnotify_mark_srcu); + while (conn) { + free = conn; + conn = conn->destroy_next; + kmem_cache_free(fsnotify_mark_connector_cachep, free); + } +} + +static struct inode *fsnotify_detach_connector_from_object( + struct fsnotify_mark_connector *conn) { struct inode *inode = NULL; + + if (conn->flags & FSNOTIFY_OBJ_TYPE_INODE) { + inode = conn->inode; + rcu_assign_pointer(inode->i_fsnotify_marks, NULL); + inode->i_fsnotify_mask = 0; + conn->inode = NULL; + conn->flags &= ~FSNOTIFY_OBJ_TYPE_INODE; + } else if (conn->flags & FSNOTIFY_OBJ_TYPE_VFSMOUNT) { + rcu_assign_pointer(real_mount(conn->mnt)->mnt_fsnotify_marks, + NULL); + real_mount(conn->mnt)->mnt_fsnotify_mask = 0; + conn->mnt = NULL; + conn->flags &= ~FSNOTIFY_OBJ_TYPE_VFSMOUNT; + } + + return inode; +} + +static void fsnotify_final_mark_destroy(struct fsnotify_mark *mark) +{ struct fsnotify_group *group = mark->group; - BUG_ON(!mutex_is_locked(&group->mark_mutex)); + if (WARN_ON_ONCE(!group)) + return; + group->ops->free_mark(mark); + fsnotify_put_group(group); +} - spin_lock(&mark->lock); +void fsnotify_put_mark(struct fsnotify_mark *mark) +{ + struct fsnotify_mark_connector *conn; + struct inode *inode = NULL; + bool free_conn = false; - /* something else already called this function on this mark */ - if (!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) { - spin_unlock(&mark->lock); + /* Catch marks that were actually never attached to object */ + if (!mark->connector) { + if (atomic_dec_and_test(&mark->refcnt)) + fsnotify_final_mark_destroy(mark); return; } - mark->flags &= ~FSNOTIFY_MARK_FLAG_ATTACHED; + /* + * We have to be careful so that traversals of obj_list under lock can + * safely grab mark reference. + */ + if (!atomic_dec_and_lock(&mark->refcnt, &mark->connector->lock)) + return; - if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) { - inode = mark->inode; - fsnotify_destroy_inode_mark(mark); - } else if (mark->flags & FSNOTIFY_MARK_FLAG_VFSMOUNT) - fsnotify_destroy_vfsmount_mark(mark); - else - BUG(); + conn = mark->connector; + hlist_del_init_rcu(&mark->obj_list); + if (hlist_empty(&conn->list)) { + inode = fsnotify_detach_connector_from_object(conn); + free_conn = true; + } else { + __fsnotify_recalc_mask(conn); + } + mark->connector = NULL; + spin_unlock(&conn->lock); + + iput(inode); + + if (free_conn) { + spin_lock(&destroy_lock); + conn->destroy_next = connector_destroy_list; + connector_destroy_list = conn; + spin_unlock(&destroy_lock); + queue_work(system_unbound_wq, &connector_reaper_work); + } /* * Note that we didn't update flags telling whether inode cares about * what's happening with children. We update these flags from * __fsnotify_parent() lazily when next event happens on one of our * children. */ + spin_lock(&destroy_lock); + list_add(&mark->g_list, &destroy_list); + spin_unlock(&destroy_lock); + queue_delayed_work(system_unbound_wq, &reaper_work, + FSNOTIFY_REAPER_DELAY); +} - list_del_init(&mark->g_list); +bool fsnotify_prepare_user_wait(struct fsnotify_iter_info *iter_info) +{ + struct fsnotify_group *group; - spin_unlock(&mark->lock); + if (WARN_ON_ONCE(!iter_info->inode_mark && !iter_info->vfsmount_mark)) + return false; + + if (iter_info->inode_mark) + group = iter_info->inode_mark->group; + else + group = iter_info->vfsmount_mark->group; + + /* + * Since acquisition of mark reference is an atomic op as well, we can + * be sure this inc is seen before any effect of refcount increment. + */ + atomic_inc(&group->user_waits); + + if (iter_info->inode_mark) { + /* This can fail if mark is being removed */ + if (!fsnotify_get_mark_safe(iter_info->inode_mark)) + goto out_wait; + } + if (iter_info->vfsmount_mark) { + if (!fsnotify_get_mark_safe(iter_info->vfsmount_mark)) + goto out_inode; + } - if (inode && (mark->flags & FSNOTIFY_MARK_FLAG_OBJECT_PINNED)) - iput(inode); + /* + * Now that both marks are pinned by refcount in the inode / vfsmount + * lists, we can drop SRCU lock, and safely resume the list iteration + * once userspace returns. + */ + srcu_read_unlock(&fsnotify_mark_srcu, iter_info->srcu_idx); + + return true; +out_inode: + if (iter_info->inode_mark) + fsnotify_put_mark(iter_info->inode_mark); +out_wait: + if (atomic_dec_and_test(&group->user_waits) && group->shutdown) + wake_up(&group->notification_waitq); + return false; +} + +void fsnotify_finish_user_wait(struct fsnotify_iter_info *iter_info) +{ + struct fsnotify_group *group = NULL; + + iter_info->srcu_idx = srcu_read_lock(&fsnotify_mark_srcu); + if (iter_info->inode_mark) { + group = iter_info->inode_mark->group; + fsnotify_put_mark(iter_info->inode_mark); + } + if (iter_info->vfsmount_mark) { + group = iter_info->vfsmount_mark->group; + fsnotify_put_mark(iter_info->vfsmount_mark); + } + /* + * We abuse notification_waitq on group shutdown for waiting for all + * marks pinned when waiting for userspace. + */ + if (atomic_dec_and_test(&group->user_waits) && group->shutdown) + wake_up(&group->notification_waitq); +} + +/* + * Mark mark as detached, remove it from group list. Mark still stays in object + * list until its last reference is dropped. Note that we rely on mark being + * removed from group list before corresponding reference to it is dropped. In + * particular we rely on mark->connector being valid while we hold + * group->mark_mutex if we found the mark through g_list. + * + * Must be called with group->mark_mutex held. The caller must either hold + * reference to the mark or be protected by fsnotify_mark_srcu. + */ +void fsnotify_detach_mark(struct fsnotify_mark *mark) +{ + struct fsnotify_group *group = mark->group; + + WARN_ON_ONCE(!mutex_is_locked(&group->mark_mutex)); + WARN_ON_ONCE(!srcu_read_lock_held(&fsnotify_mark_srcu) && + atomic_read(&mark->refcnt) < 1 + + !!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)); + + spin_lock(&mark->lock); + /* something else already called this function on this mark */ + if (!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) { + spin_unlock(&mark->lock); + return; + } + mark->flags &= ~FSNOTIFY_MARK_FLAG_ATTACHED; + list_del_init(&mark->g_list); + spin_unlock(&mark->lock); atomic_dec(&group->num_marks); + + /* Drop mark reference acquired in fsnotify_add_mark_locked() */ + fsnotify_put_mark(mark); } /* - * Prepare mark for freeing and add it to the list of marks prepared for - * freeing. The actual freeing must happen after SRCU period ends and the - * caller is responsible for this. + * Free fsnotify mark. The mark is actually only marked as being freed. The + * freeing is actually happening only once last reference to the mark is + * dropped from a workqueue which first waits for srcu period end. * - * The function returns true if the mark was added to the list of marks for - * freeing. The function returns false if someone else has already called - * __fsnotify_free_mark() for the mark. + * Caller must have a reference to the mark or be protected by + * fsnotify_mark_srcu. */ -static bool __fsnotify_free_mark(struct fsnotify_mark *mark) +void fsnotify_free_mark(struct fsnotify_mark *mark) { struct fsnotify_group *group = mark->group; @@ -189,7 +373,7 @@ static bool __fsnotify_free_mark(struct fsnotify_mark *mark) /* something else already called this function on this mark */ if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) { spin_unlock(&mark->lock); - return false; + return; } mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE; spin_unlock(&mark->lock); @@ -201,25 +385,6 @@ static bool __fsnotify_free_mark(struct fsnotify_mark *mark) */ if (group->ops->freeing_mark) group->ops->freeing_mark(mark, group); - - spin_lock(&destroy_lock); - list_add(&mark->g_list, &destroy_list); - spin_unlock(&destroy_lock); - - return true; -} - -/* - * Free fsnotify mark. The freeing is actually happening from a workqueue which - * first waits for srcu period end. Caller must have a reference to the mark - * or be protected by fsnotify_mark_srcu. - */ -void fsnotify_free_mark(struct fsnotify_mark *mark) -{ - if (__fsnotify_free_mark(mark)) { - queue_delayed_work(system_unbound_wq, &reaper_work, - FSNOTIFY_REAPER_DELAY); - } } void fsnotify_destroy_mark(struct fsnotify_mark *mark, @@ -231,54 +396,6 @@ void fsnotify_destroy_mark(struct fsnotify_mark *mark, fsnotify_free_mark(mark); } -void fsnotify_destroy_marks(struct hlist_head *head, spinlock_t *lock) -{ - struct fsnotify_mark *mark; - - while (1) { - /* - * We have to be careful since we can race with e.g. - * fsnotify_clear_marks_by_group() and once we drop 'lock', - * mark can get removed from the obj_list and destroyed. But - * we are holding mark reference so mark cannot be freed and - * calling fsnotify_destroy_mark() more than once is fine. - */ - spin_lock(lock); - if (hlist_empty(head)) { - spin_unlock(lock); - break; - } - mark = hlist_entry(head->first, struct fsnotify_mark, obj_list); - /* - * We don't update i_fsnotify_mask / mnt_fsnotify_mask here - * since inode / mount is going away anyway. So just remove - * mark from the list. - */ - hlist_del_init_rcu(&mark->obj_list); - fsnotify_get_mark(mark); - spin_unlock(lock); - fsnotify_destroy_mark(mark, mark->group); - fsnotify_put_mark(mark); - } -} - -void fsnotify_set_mark_mask_locked(struct fsnotify_mark *mark, __u32 mask) -{ - assert_spin_locked(&mark->lock); - - mark->mask = mask; - - if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) - fsnotify_set_inode_mark_mask_locked(mark, mask); -} - -void fsnotify_set_mark_ignored_mask_locked(struct fsnotify_mark *mark, __u32 mask) -{ - assert_spin_locked(&mark->lock); - - mark->ignored_mask = mask; -} - /* * Sorting function for lists of fsnotify marks. * @@ -315,37 +432,133 @@ int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b) return -1; } -/* Add mark into proper place in given list of marks */ -int fsnotify_add_mark_list(struct hlist_head *head, struct fsnotify_mark *mark, - int allow_dups) +static int fsnotify_attach_connector_to_object( + struct fsnotify_mark_connector __rcu **connp, + struct inode *inode, + struct vfsmount *mnt) +{ + struct fsnotify_mark_connector *conn; + + conn = kmem_cache_alloc(fsnotify_mark_connector_cachep, GFP_KERNEL); + if (!conn) + return -ENOMEM; + spin_lock_init(&conn->lock); + INIT_HLIST_HEAD(&conn->list); + if (inode) { + conn->flags = FSNOTIFY_OBJ_TYPE_INODE; + conn->inode = igrab(inode); + } else { + conn->flags = FSNOTIFY_OBJ_TYPE_VFSMOUNT; + conn->mnt = mnt; + } + /* + * cmpxchg() provides the barrier so that readers of *connp can see + * only initialized structure + */ + if (cmpxchg(connp, NULL, conn)) { + /* Someone else created list structure for us */ + if (inode) + iput(inode); + kmem_cache_free(fsnotify_mark_connector_cachep, conn); + } + + return 0; +} + +/* + * Get mark connector, make sure it is alive and return with its lock held. + * This is for users that get connector pointer from inode or mount. Users that + * hold reference to a mark on the list may directly lock connector->lock as + * they are sure list cannot go away under them. + */ +static struct fsnotify_mark_connector *fsnotify_grab_connector( + struct fsnotify_mark_connector __rcu **connp) +{ + struct fsnotify_mark_connector *conn; + int idx; + + idx = srcu_read_lock(&fsnotify_mark_srcu); + conn = srcu_dereference(*connp, &fsnotify_mark_srcu); + if (!conn) + goto out; + spin_lock(&conn->lock); + if (!(conn->flags & (FSNOTIFY_OBJ_TYPE_INODE | + FSNOTIFY_OBJ_TYPE_VFSMOUNT))) { + spin_unlock(&conn->lock); + srcu_read_unlock(&fsnotify_mark_srcu, idx); + return NULL; + } +out: + srcu_read_unlock(&fsnotify_mark_srcu, idx); + return conn; +} + +/* + * Add mark into proper place in given list of marks. These marks may be used + * for the fsnotify backend to determine which event types should be delivered + * to which group and for which inodes. These marks are ordered according to + * priority, highest number first, and then by the group's location in memory. + */ +static int fsnotify_add_mark_list(struct fsnotify_mark *mark, + struct inode *inode, struct vfsmount *mnt, + int allow_dups) { struct fsnotify_mark *lmark, *last = NULL; + struct fsnotify_mark_connector *conn; + struct fsnotify_mark_connector __rcu **connp; int cmp; + int err = 0; + + if (WARN_ON(!inode && !mnt)) + return -EINVAL; + if (inode) + connp = &inode->i_fsnotify_marks; + else + connp = &real_mount(mnt)->mnt_fsnotify_marks; +restart: + spin_lock(&mark->lock); + conn = fsnotify_grab_connector(connp); + if (!conn) { + spin_unlock(&mark->lock); + err = fsnotify_attach_connector_to_object(connp, inode, mnt); + if (err) + return err; + goto restart; + } /* is mark the first mark? */ - if (hlist_empty(head)) { - hlist_add_head_rcu(&mark->obj_list, head); - return 0; + if (hlist_empty(&conn->list)) { + hlist_add_head_rcu(&mark->obj_list, &conn->list); + goto added; } /* should mark be in the middle of the current list? */ - hlist_for_each_entry(lmark, head, obj_list) { + hlist_for_each_entry(lmark, &conn->list, obj_list) { last = lmark; - if ((lmark->group == mark->group) && !allow_dups) - return -EEXIST; + if ((lmark->group == mark->group) && + (lmark->flags & FSNOTIFY_MARK_FLAG_ATTACHED) && + !allow_dups) { + err = -EEXIST; + goto out_err; + } cmp = fsnotify_compare_groups(lmark->group, mark->group); if (cmp >= 0) { hlist_add_before_rcu(&mark->obj_list, &lmark->obj_list); - return 0; + goto added; } } BUG_ON(last == NULL); /* mark should be the last entry. last is the current last entry */ hlist_add_behind_rcu(&mark->obj_list, &last->obj_list); - return 0; +added: + mark->connector = conn; +out_err: + spin_unlock(&conn->lock); + spin_unlock(&mark->lock); + return err; } /* @@ -353,10 +566,10 @@ int fsnotify_add_mark_list(struct hlist_head *head, struct fsnotify_mark *mark, * These marks may be used for the fsnotify backend to determine which * event types should be delivered to which group. */ -int fsnotify_add_mark_locked(struct fsnotify_mark *mark, - struct fsnotify_group *group, struct inode *inode, +int fsnotify_add_mark_locked(struct fsnotify_mark *mark, struct inode *inode, struct vfsmount *mnt, int allow_dups) { + struct fsnotify_group *group = mark->group; int ret = 0; BUG_ON(inode && mnt); @@ -367,61 +580,42 @@ int fsnotify_add_mark_locked(struct fsnotify_mark *mark, * LOCKING ORDER!!!! * group->mark_mutex * mark->lock - * inode->i_lock + * mark->connector->lock */ spin_lock(&mark->lock); mark->flags |= FSNOTIFY_MARK_FLAG_ALIVE | FSNOTIFY_MARK_FLAG_ATTACHED; - fsnotify_get_group(group); - mark->group = group; list_add(&mark->g_list, &group->marks_list); atomic_inc(&group->num_marks); - fsnotify_get_mark(mark); /* for i_list and g_list */ - - if (inode) { - ret = fsnotify_add_inode_mark(mark, group, inode, allow_dups); - if (ret) - goto err; - } else if (mnt) { - ret = fsnotify_add_vfsmount_mark(mark, group, mnt, allow_dups); - if (ret) - goto err; - } else { - BUG(); - } - - /* this will pin the object if appropriate */ - fsnotify_set_mark_mask_locked(mark, mark->mask); + fsnotify_get_mark(mark); /* for g_list */ spin_unlock(&mark->lock); - if (inode) - __fsnotify_update_child_dentry_flags(inode); + ret = fsnotify_add_mark_list(mark, inode, mnt, allow_dups); + if (ret) + goto err; + + if (mark->mask) + fsnotify_recalc_mask(mark->connector); return ret; err: - mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE; + mark->flags &= ~(FSNOTIFY_MARK_FLAG_ALIVE | + FSNOTIFY_MARK_FLAG_ATTACHED); list_del_init(&mark->g_list); - fsnotify_put_group(group); - mark->group = NULL; atomic_dec(&group->num_marks); - spin_unlock(&mark->lock); - - spin_lock(&destroy_lock); - list_add(&mark->g_list, &destroy_list); - spin_unlock(&destroy_lock); - queue_delayed_work(system_unbound_wq, &reaper_work, - FSNOTIFY_REAPER_DELAY); - + fsnotify_put_mark(mark); return ret; } -int fsnotify_add_mark(struct fsnotify_mark *mark, struct fsnotify_group *group, - struct inode *inode, struct vfsmount *mnt, int allow_dups) +int fsnotify_add_mark(struct fsnotify_mark *mark, struct inode *inode, + struct vfsmount *mnt, int allow_dups) { int ret; + struct fsnotify_group *group = mark->group; + mutex_lock(&group->mark_mutex); - ret = fsnotify_add_mark_locked(mark, group, inode, mnt, allow_dups); + ret = fsnotify_add_mark_locked(mark, inode, mnt, allow_dups); mutex_unlock(&group->mark_mutex); return ret; } @@ -430,29 +624,42 @@ int fsnotify_add_mark(struct fsnotify_mark *mark, struct fsnotify_group *group, * Given a list of marks, find the mark associated with given group. If found * take a reference to that mark and return it, else return NULL. */ -struct fsnotify_mark *fsnotify_find_mark(struct hlist_head *head, - struct fsnotify_group *group) +struct fsnotify_mark *fsnotify_find_mark( + struct fsnotify_mark_connector __rcu **connp, + struct fsnotify_group *group) { + struct fsnotify_mark_connector *conn; struct fsnotify_mark *mark; - hlist_for_each_entry(mark, head, obj_list) { - if (mark->group == group) { + conn = fsnotify_grab_connector(connp); + if (!conn) + return NULL; + + hlist_for_each_entry(mark, &conn->list, obj_list) { + if (mark->group == group && + (mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) { fsnotify_get_mark(mark); + spin_unlock(&conn->lock); return mark; } } + spin_unlock(&conn->lock); return NULL; } -/* - * clear any marks in a group in which mark->flags & flags is true - */ -void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group, - unsigned int flags) +/* Clear any marks in a group with given type */ +void fsnotify_clear_marks_by_group(struct fsnotify_group *group, + unsigned int type) { struct fsnotify_mark *lmark, *mark; LIST_HEAD(to_free); + struct list_head *head = &to_free; + /* Skip selection step if we want to clear all marks. */ + if (type == FSNOTIFY_OBJ_ALL_TYPES) { + head = &group->marks_list; + goto clear; + } /* * We have to be really careful here. Anytime we drop mark_mutex, e.g. * fsnotify_clear_marks_by_inode() can come and free marks. Even in our @@ -464,18 +671,19 @@ void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group, */ mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING); list_for_each_entry_safe(mark, lmark, &group->marks_list, g_list) { - if (mark->flags & flags) + if (mark->connector->flags & type) list_move(&mark->g_list, &to_free); } mutex_unlock(&group->mark_mutex); +clear: while (1) { mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING); - if (list_empty(&to_free)) { + if (list_empty(head)) { mutex_unlock(&group->mark_mutex); break; } - mark = list_first_entry(&to_free, struct fsnotify_mark, g_list); + mark = list_first_entry(head, struct fsnotify_mark, g_list); fsnotify_get_mark(mark); fsnotify_detach_mark(mark); mutex_unlock(&group->mark_mutex); @@ -484,49 +692,62 @@ void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group, } } -/* - * Given a group, prepare for freeing all the marks associated with that group. - * The marks are attached to the list of marks prepared for destruction, the - * caller is responsible for freeing marks in that list after SRCU period has - * ended. - */ -void fsnotify_detach_group_marks(struct fsnotify_group *group) +/* Destroy all marks attached to inode / vfsmount */ +void fsnotify_destroy_marks(struct fsnotify_mark_connector __rcu **connp) { - struct fsnotify_mark *mark; + struct fsnotify_mark_connector *conn; + struct fsnotify_mark *mark, *old_mark = NULL; + struct inode *inode; - while (1) { - mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING); - if (list_empty(&group->marks_list)) { - mutex_unlock(&group->mark_mutex); - break; - } - mark = list_first_entry(&group->marks_list, - struct fsnotify_mark, g_list); + conn = fsnotify_grab_connector(connp); + if (!conn) + return; + /* + * We have to be careful since we can race with e.g. + * fsnotify_clear_marks_by_group() and once we drop the conn->lock, the + * list can get modified. However we are holding mark reference and + * thus our mark cannot be removed from obj_list so we can continue + * iteration after regaining conn->lock. + */ + hlist_for_each_entry(mark, &conn->list, obj_list) { fsnotify_get_mark(mark); - fsnotify_detach_mark(mark); - mutex_unlock(&group->mark_mutex); - __fsnotify_free_mark(mark); - fsnotify_put_mark(mark); + spin_unlock(&conn->lock); + if (old_mark) + fsnotify_put_mark(old_mark); + old_mark = mark; + fsnotify_destroy_mark(mark, mark->group); + spin_lock(&conn->lock); } + /* + * Detach list from object now so that we don't pin inode until all + * mark references get dropped. It would lead to strange results such + * as delaying inode deletion or blocking unmount. + */ + inode = fsnotify_detach_connector_from_object(conn); + spin_unlock(&conn->lock); + if (old_mark) + fsnotify_put_mark(old_mark); + iput(inode); } /* * Nothing fancy, just initialize lists and locks and counters. */ void fsnotify_init_mark(struct fsnotify_mark *mark, - void (*free_mark)(struct fsnotify_mark *mark)) + struct fsnotify_group *group) { memset(mark, 0, sizeof(*mark)); spin_lock_init(&mark->lock); atomic_set(&mark->refcnt, 1); - mark->free_mark = free_mark; + fsnotify_get_group(group); + mark->group = group; } /* * Destroy all marks in destroy_list, waits for SRCU period to finish before * actually freeing marks. */ -void fsnotify_mark_destroy_list(void) +static void fsnotify_mark_destroy_workfn(struct work_struct *work) { struct fsnotify_mark *mark, *next; struct list_head private_destroy_list; @@ -540,11 +761,12 @@ void fsnotify_mark_destroy_list(void) list_for_each_entry_safe(mark, next, &private_destroy_list, g_list) { list_del_init(&mark->g_list); - fsnotify_put_mark(mark); + fsnotify_final_mark_destroy(mark); } } -static void fsnotify_mark_destroy_workfn(struct work_struct *work) +/* Wait for all marks queued for destruction to be actually destroyed */ +void fsnotify_wait_marks_destroyed(void) { - fsnotify_mark_destroy_list(); + flush_delayed_work(&reaper_work); } diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c deleted file mode 100644 index a8fcab68faef..000000000000 --- a/fs/notify/vfsmount_mark.c +++ /dev/null @@ -1,108 +0,0 @@ -/* - * Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2, or (at your option) - * any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; see the file COPYING. If not, write to - * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -#include <linux/fs.h> -#include <linux/init.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/mount.h> -#include <linux/mutex.h> -#include <linux/spinlock.h> - -#include <linux/atomic.h> - -#include <linux/fsnotify_backend.h> -#include "fsnotify.h" - -void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group) -{ - fsnotify_clear_marks_by_group_flags(group, FSNOTIFY_MARK_FLAG_VFSMOUNT); -} - -/* - * Recalculate the mnt->mnt_fsnotify_mask, or the mask of all FS_* event types - * any notifier is interested in hearing for this mount point - */ -void fsnotify_recalc_vfsmount_mask(struct vfsmount *mnt) -{ - struct mount *m = real_mount(mnt); - - spin_lock(&mnt->mnt_root->d_lock); - m->mnt_fsnotify_mask = fsnotify_recalc_mask(&m->mnt_fsnotify_marks); - spin_unlock(&mnt->mnt_root->d_lock); -} - -void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark) -{ - struct vfsmount *mnt = mark->mnt; - struct mount *m = real_mount(mnt); - - BUG_ON(!mutex_is_locked(&mark->group->mark_mutex)); - assert_spin_locked(&mark->lock); - - spin_lock(&mnt->mnt_root->d_lock); - - hlist_del_init_rcu(&mark->obj_list); - mark->mnt = NULL; - - m->mnt_fsnotify_mask = fsnotify_recalc_mask(&m->mnt_fsnotify_marks); - spin_unlock(&mnt->mnt_root->d_lock); -} - -/* - * given a group and vfsmount, find the mark associated with that combination. - * if found take a reference to that mark and return it, else return NULL - */ -struct fsnotify_mark *fsnotify_find_vfsmount_mark(struct fsnotify_group *group, - struct vfsmount *mnt) -{ - struct mount *m = real_mount(mnt); - struct fsnotify_mark *mark; - - spin_lock(&mnt->mnt_root->d_lock); - mark = fsnotify_find_mark(&m->mnt_fsnotify_marks, group); - spin_unlock(&mnt->mnt_root->d_lock); - - return mark; -} - -/* - * Attach an initialized mark to a given group and vfsmount. - * These marks may be used for the fsnotify backend to determine which - * event types should be delivered to which groups. - */ -int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark, - struct fsnotify_group *group, struct vfsmount *mnt, - int allow_dups) -{ - struct mount *m = real_mount(mnt); - int ret; - - mark->flags |= FSNOTIFY_MARK_FLAG_VFSMOUNT; - - BUG_ON(!mutex_is_locked(&group->mark_mutex)); - assert_spin_locked(&mark->lock); - - spin_lock(&mnt->mnt_root->d_lock); - mark->mnt = mnt; - ret = fsnotify_add_mark_list(&m->mnt_fsnotify_marks, mark, allow_dups); - m->mnt_fsnotify_mask = fsnotify_recalc_mask(&m->mnt_fsnotify_marks); - spin_unlock(&mnt->mnt_root->d_lock); - - return ret; -} diff --git a/fs/nsfs.c b/fs/nsfs.c index 1656843e87d2..323f492e0822 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -91,6 +91,7 @@ slow: return ERR_PTR(-ENOMEM); } d_instantiate(dentry, inode); + dentry->d_flags |= DCACHE_RCUACCESS; dentry->d_fsdata = (void *)ns->ops; d = atomic_long_cmpxchg(&ns->stashed, 0, (unsigned long)dentry); if (d) { diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index f6e871760f8d..0da0332725aa 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -2242,13 +2242,13 @@ unlock: spin_unlock(&o2hb_live_lock); } -static ssize_t o2hb_heartbeat_group_threshold_show(struct config_item *item, +static ssize_t o2hb_heartbeat_group_dead_threshold_show(struct config_item *item, char *page) { return sprintf(page, "%u\n", o2hb_dead_threshold); } -static ssize_t o2hb_heartbeat_group_threshold_store(struct config_item *item, +static ssize_t o2hb_heartbeat_group_dead_threshold_store(struct config_item *item, const char *page, size_t count) { unsigned long tmp; @@ -2297,11 +2297,11 @@ static ssize_t o2hb_heartbeat_group_mode_store(struct config_item *item, } -CONFIGFS_ATTR(o2hb_heartbeat_group_, threshold); +CONFIGFS_ATTR(o2hb_heartbeat_group_, dead_threshold); CONFIGFS_ATTR(o2hb_heartbeat_group_, mode); static struct configfs_attribute *o2hb_heartbeat_group_attrs[] = { - &o2hb_heartbeat_group_attr_threshold, + &o2hb_heartbeat_group_attr_dead_threshold, &o2hb_heartbeat_group_attr_mode, NULL, }; diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index d0ab7e56d0b4..8d779227370a 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -450,9 +450,8 @@ static struct o2net_sock_container *sc_alloc(struct o2nm_node *node) INIT_WORK(&sc->sc_shutdown_work, o2net_shutdown_sc); INIT_DELAYED_WORK(&sc->sc_keepalive_work, o2net_sc_send_keep_req); - init_timer(&sc->sc_idle_timeout); - sc->sc_idle_timeout.function = o2net_idle_timer; - sc->sc_idle_timeout.data = (unsigned long)sc; + setup_timer(&sc->sc_idle_timeout, o2net_idle_timer, + (unsigned long)sc); sclog(sc, "alloced\n"); @@ -956,7 +955,7 @@ static void o2net_sendpage(struct o2net_sock_container *sc, mutex_lock(&sc->sc_send_lock); ret = sc->sc_sock->ops->sendpage(sc->sc_sock, virt_to_page(kmalloced_virt), - (long)kmalloced_virt & ~PAGE_MASK, + offset_in_page(kmalloced_virt), size, MSG_DONTWAIT); mutex_unlock(&sc->sc_send_lock); if (ret == size) @@ -1460,27 +1459,10 @@ static void o2net_rx_until_empty(struct work_struct *work) static int o2net_set_nodelay(struct socket *sock) { - int ret, val = 1; - mm_segment_t oldfs; + int val = 1; - oldfs = get_fs(); - set_fs(KERNEL_DS); - - /* - * Dear unsuspecting programmer, - * - * Don't use sock_setsockopt() for SOL_TCP. It doesn't check its level - * argument and assumes SOL_SOCKET so, say, your TCP_NODELAY will - * silently turn into SO_DEBUG. - * - * Yours, - * Keeper of hilariously fragile interfaces. - */ - ret = sock->ops->setsockopt(sock, SOL_TCP, TCP_NODELAY, - (char __user *)&val, sizeof(val)); - - set_fs(oldfs); - return ret; + return kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, + (void *)&val, sizeof(val)); } static int o2net_set_usertimeout(struct socket *sock) @@ -1488,7 +1470,7 @@ static int o2net_set_usertimeout(struct socket *sock) int user_timeout = O2NET_TCP_USER_TIMEOUT; return kernel_setsockopt(sock, SOL_TCP, TCP_USER_TIMEOUT, - (char *)&user_timeout, sizeof(user_timeout)); + (void *)&user_timeout, sizeof(user_timeout)); } static void o2net_initialize_handshake(void) diff --git a/fs/open.c b/fs/open.c index 949cef29c3bb..4d23f729dcc6 100644 --- a/fs/open.c +++ b/fs/open.c @@ -1078,6 +1078,26 @@ SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags, return do_sys_open(dfd, filename, flags, mode); } +#ifdef CONFIG_COMPAT +/* + * Exactly like sys_open(), except that it doesn't set the + * O_LARGEFILE flag. + */ +COMPAT_SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode) +{ + return do_sys_open(AT_FDCWD, filename, flags, mode); +} + +/* + * Exactly like sys_openat(), except that it doesn't set the + * O_LARGEFILE flag. + */ +COMPAT_SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags, umode_t, mode) +{ + return do_sys_open(dfd, filename, flags, mode); +} +#endif + #ifndef __alpha__ /* diff --git a/fs/orangefs/devorangefs-req.c b/fs/orangefs/devorangefs-req.c index c4ab6fdf17a0..c19f0787c9c6 100644 --- a/fs/orangefs/devorangefs-req.c +++ b/fs/orangefs/devorangefs-req.c @@ -180,6 +180,10 @@ static ssize_t orangefs_devreq_read(struct file *file, return -EINVAL; } + /* Check for an empty list before locking. */ + if (list_empty(&orangefs_request_list)) + return -EAGAIN; + restart: /* Get next op (if any) from top of list. */ spin_lock(&orangefs_request_list_lock); @@ -208,14 +212,19 @@ restart: continue; /* * Skip ops whose filesystem we don't know about unless - * it is being mounted. + * it is being mounted or unmounted. It is possible for + * a filesystem we don't know about to be unmounted if + * it fails to mount in the kernel after userspace has + * been sent the mount request. */ /* XXX: is there a better way to detect this? */ } else if (ret == -1 && !(op->upcall.type == ORANGEFS_VFS_OP_FS_MOUNT || op->upcall.type == - ORANGEFS_VFS_OP_GETATTR)) { + ORANGEFS_VFS_OP_GETATTR || + op->upcall.type == + ORANGEFS_VFS_OP_FS_UMOUNT)) { gossip_debug(GOSSIP_DEV_DEBUG, "orangefs: skipping op tag %llu %s\n", llu(op->tag), get_opname_string(op)); diff --git a/fs/orangefs/dir.c b/fs/orangefs/dir.c index 284373a57a08..d327cbd17756 100644 --- a/fs/orangefs/dir.c +++ b/fs/orangefs/dir.c @@ -1,396 +1,404 @@ /* - * (C) 2001 Clemson University and The University of Chicago - * - * See COPYING in top-level directory. + * Copyright 2017 Omnibond Systems, L.L.C. */ #include "protocol.h" #include "orangefs-kernel.h" #include "orangefs-bufmap.h" +struct orangefs_dir_part { + struct orangefs_dir_part *next; + size_t len; +}; + +struct orangefs_dir { + __u64 token; + struct orangefs_dir_part *part; + loff_t end; + int error; +}; + +#define PART_SHIFT (24) +#define PART_SIZE (1<<24) +#define PART_MASK (~(PART_SIZE - 1)) + /* - * decode routine used by kmod to deal with the blob sent from - * userspace for readdirs. The blob contains zero or more of these - * sub-blobs: - * __u32 - represents length of the character string that follows. - * string - between 1 and ORANGEFS_NAME_MAX bytes long. - * padding - (if needed) to cause the __u32 plus the string to be - * eight byte aligned. - * khandle - sizeof(khandle) bytes. + * There can be up to 512 directory entries. Each entry is encoded as + * follows: + * 4 bytes: string size (n) + * n bytes: string + * 1 byte: trailing zero + * padding to 8 bytes + * 16 bytes: khandle + * padding to 8 bytes + * + * The trailer_buf starts with a struct orangefs_readdir_response_s + * which must be skipped to get to the directory data. + * + * The data which is received from the userspace daemon is termed a + * part and is stored in a linked list in case more than one part is + * needed for a large directory. + * + * The position pointer (ctx->pos) encodes the part and offset on which + * to begin reading at. Bits above PART_SHIFT encode the part and bits + * below PART_SHIFT encode the offset. Parts are stored in a linked + * list which grows as data is received from the server. The overhead + * associated with managing the list is presumed to be small compared to + * the overhead of communicating with the server. + * + * As data is received from the server, it is placed at the end of the + * part list. Data is parsed from the current position as it is needed. + * When data is determined to be corrupt, it is either because the + * userspace component has sent back corrupt data or because the file + * pointer has been moved to an invalid location. Since the two cannot + * be differentiated, return EIO. + * + * Part zero is synthesized to contains `.' and `..'. Part one is the + * first part of the part list. */ -static long decode_dirents(char *ptr, size_t size, - struct orangefs_readdir_response_s *readdir) + +static int do_readdir(struct orangefs_inode_s *oi, + struct orangefs_dir *od, struct dentry *dentry, + struct orangefs_kernel_op_s *op) { - int i; - struct orangefs_readdir_response_s *rd = - (struct orangefs_readdir_response_s *) ptr; - char *buf = ptr; - int khandle_size = sizeof(struct orangefs_khandle); - size_t offset = offsetof(struct orangefs_readdir_response_s, - dirent_array); - /* 8 reflects eight byte alignment */ - int smallest_blob = khandle_size + 8; - __u32 len; - int aligned_len; - int sizeof_u32 = sizeof(__u32); - long ret; - - gossip_debug(GOSSIP_DIR_DEBUG, "%s: size:%zu:\n", __func__, size); - - /* size is = offset on empty dirs, > offset on non-empty dirs... */ - if (size < offset) { - gossip_err("%s: size:%zu: offset:%zu:\n", - __func__, - size, - offset); - ret = -EINVAL; - goto out; - } + struct orangefs_readdir_response_s *resp; + int bufi, r; - if ((size == offset) && (readdir->orangefs_dirent_outcount != 0)) { - gossip_err("%s: size:%zu: dirent_outcount:%d:\n", - __func__, - size, - readdir->orangefs_dirent_outcount); - ret = -EINVAL; - goto out; - } + /* + * Despite the badly named field, readdir does not use shared + * memory. However, there are a limited number of readdir + * slots, which must be allocated here. This flag simply tells + * the op scheduler to return the op here for retry. + */ + op->uses_shared_memory = 1; + op->upcall.req.readdir.refn = oi->refn; + op->upcall.req.readdir.token = od->token; + op->upcall.req.readdir.max_dirent_count = + ORANGEFS_MAX_DIRENT_COUNT_READDIR; - readdir->token = rd->token; - readdir->orangefs_dirent_outcount = rd->orangefs_dirent_outcount; - readdir->dirent_array = kcalloc(readdir->orangefs_dirent_outcount, - sizeof(*readdir->dirent_array), - GFP_KERNEL); - if (readdir->dirent_array == NULL) { - gossip_err("%s: kcalloc failed.\n", __func__); - ret = -ENOMEM; - goto out; +again: + bufi = orangefs_readdir_index_get(); + if (bufi < 0) { + od->error = bufi; + return bufi; } - buf += offset; - size -= offset; - - for (i = 0; i < readdir->orangefs_dirent_outcount; i++) { - if (size < smallest_blob) { - gossip_err("%s: size:%zu: smallest_blob:%d:\n", - __func__, - size, - smallest_blob); - ret = -EINVAL; - goto free; - } + op->upcall.req.readdir.buf_index = bufi; - len = *(__u32 *)buf; - if ((len < 1) || (len > ORANGEFS_NAME_MAX)) { - gossip_err("%s: len:%d:\n", __func__, len); - ret = -EINVAL; - goto free; - } + r = service_operation(op, "orangefs_readdir", + get_interruptible_flag(dentry->d_inode)); - gossip_debug(GOSSIP_DIR_DEBUG, - "%s: size:%zu: len:%d:\n", - __func__, - size, - len); + orangefs_readdir_index_put(bufi); - readdir->dirent_array[i].d_name = buf + sizeof_u32; - readdir->dirent_array[i].d_length = len; + if (op_state_purged(op)) { + if (r == -EAGAIN) { + vfree(op->downcall.trailer_buf); + goto again; + } else if (r == -EIO) { + vfree(op->downcall.trailer_buf); + od->error = r; + return r; + } + } - /* - * Calculate "aligned" length of this string and its - * associated __u32 descriptor. - */ - aligned_len = ((sizeof_u32 + len + 1) + 7) & ~7; - gossip_debug(GOSSIP_DIR_DEBUG, - "%s: aligned_len:%d:\n", - __func__, - aligned_len); + if (r < 0) { + vfree(op->downcall.trailer_buf); + od->error = r; + return r; + } else if (op->downcall.status) { + vfree(op->downcall.trailer_buf); + od->error = op->downcall.status; + return op->downcall.status; + } - /* - * The end of the blob should coincide with the end - * of the last sub-blob. - */ - if (size < aligned_len + khandle_size) { - gossip_err("%s: ran off the end of the blob.\n", - __func__); - ret = -EINVAL; - goto free; - } - size -= aligned_len + khandle_size; + /* + * The maximum size is size per entry times the 512 entries plus + * the header. This is well under the limit. + */ + if (op->downcall.trailer_size > PART_SIZE) { + vfree(op->downcall.trailer_buf); + od->error = -EIO; + return -EIO; + } - buf += aligned_len; + resp = (struct orangefs_readdir_response_s *) + op->downcall.trailer_buf; + od->token = resp->token; + return 0; +} - readdir->dirent_array[i].khandle = - *(struct orangefs_khandle *) buf; - buf += khandle_size; +static int parse_readdir(struct orangefs_dir *od, + struct orangefs_kernel_op_s *op) +{ + struct orangefs_dir_part *part, *new; + size_t count; + + count = 1; + part = od->part; + while (part) { + count++; + if (part->next) + part = part->next; + else + break; } - ret = buf - ptr; - gossip_debug(GOSSIP_DIR_DEBUG, "%s: returning:%ld:\n", __func__, ret); - goto out; -free: - kfree(readdir->dirent_array); - readdir->dirent_array = NULL; + new = (void *)op->downcall.trailer_buf; + new->next = NULL; + new->len = op->downcall.trailer_size - + sizeof(struct orangefs_readdir_response_s); + if (!od->part) + od->part = new; + else + part->next = new; + count++; + od->end = count << PART_SHIFT; -out: - return ret; + return 0; } -/* - * Read directory entries from an instance of an open directory. - */ -static int orangefs_readdir(struct file *file, struct dir_context *ctx) +static int orangefs_dir_more(struct orangefs_inode_s *oi, + struct orangefs_dir *od, struct dentry *dentry) { - int ret = 0; - int buffer_index; - /* - * ptoken supports Orangefs' distributed directory logic, added - * in 2.9.2. - */ - __u64 *ptoken = file->private_data; - __u64 pos = 0; - ino_t ino = 0; - struct dentry *dentry = file->f_path.dentry; - struct orangefs_kernel_op_s *new_op = NULL; - struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(dentry->d_inode); - struct orangefs_readdir_response_s readdir_response; - void *dents_buf; - int i = 0; - int len = 0; - ino_t current_ino = 0; - char *current_entry = NULL; - long bytes_decoded; - - gossip_debug(GOSSIP_DIR_DEBUG, - "%s: ctx->pos:%lld, ptoken = %llu\n", - __func__, - lld(ctx->pos), - llu(*ptoken)); - - pos = (__u64) ctx->pos; - - /* are we done? */ - if (pos == ORANGEFS_READDIR_END) { - gossip_debug(GOSSIP_DIR_DEBUG, - "Skipping to termination path\n"); - return 0; + struct orangefs_kernel_op_s *op; + int r; + + op = op_alloc(ORANGEFS_VFS_OP_READDIR); + if (!op) { + od->error = -ENOMEM; + return -ENOMEM; + } + r = do_readdir(oi, od, dentry, op); + if (r) { + od->error = r; + goto out; + } + r = parse_readdir(od, op); + if (r) { + od->error = r; + goto out; } - gossip_debug(GOSSIP_DIR_DEBUG, - "orangefs_readdir called on %pd (pos=%llu)\n", - dentry, llu(pos)); + od->error = 0; +out: + op_release(op); + return od->error; +} - memset(&readdir_response, 0, sizeof(readdir_response)); +static int fill_from_part(struct orangefs_dir_part *part, + struct dir_context *ctx) +{ + const int offset = sizeof(struct orangefs_readdir_response_s); + struct orangefs_khandle *khandle; + __u32 *len, padlen; + loff_t i; + char *s; + i = ctx->pos & ~PART_MASK; - new_op = op_alloc(ORANGEFS_VFS_OP_READDIR); - if (!new_op) - return -ENOMEM; + /* The file offset from userspace is too large. */ + if (i > part->len) + return 1; /* - * Only the indices are shared. No memory is actually shared, but the - * mechanism is used. + * If the seek pointer is positioned just before an entry it + * should find the next entry. */ - new_op->uses_shared_memory = 1; - new_op->upcall.req.readdir.refn = orangefs_inode->refn; - new_op->upcall.req.readdir.max_dirent_count = - ORANGEFS_MAX_DIRENT_COUNT_READDIR; - - gossip_debug(GOSSIP_DIR_DEBUG, - "%s: upcall.req.readdir.refn.khandle: %pU\n", - __func__, - &new_op->upcall.req.readdir.refn.khandle); + if (i % 8) + i = i + (8 - i%8)%8; - new_op->upcall.req.readdir.token = *ptoken; - -get_new_buffer_index: - buffer_index = orangefs_readdir_index_get(); - if (buffer_index < 0) { - ret = buffer_index; - gossip_lerr("orangefs_readdir: orangefs_readdir_index_get() failure (%d)\n", - ret); - goto out_free_op; + while (i < part->len) { + if (part->len < i + sizeof *len) + break; + len = (void *)part + offset + i; + /* + * len is the size of the string itself. padlen is the + * total size of the encoded string. + */ + padlen = (sizeof *len + *len + 1) + + (8 - (sizeof *len + *len + 1)%8)%8; + if (part->len < i + padlen + sizeof *khandle) + goto next; + s = (void *)part + offset + i + sizeof *len; + if (s[*len] != 0) + goto next; + khandle = (void *)part + offset + i + padlen; + if (!dir_emit(ctx, s, *len, + orangefs_khandle_to_ino(khandle), + DT_UNKNOWN)) + return 0; + i += padlen + sizeof *khandle; + i = i + (8 - i%8)%8; + BUG_ON(i > part->len); + ctx->pos = (ctx->pos & PART_MASK) | i; + continue; +next: + i += 8; } - new_op->upcall.req.readdir.buf_index = buffer_index; - - ret = service_operation(new_op, - "orangefs_readdir", - get_interruptible_flag(dentry->d_inode)); + return 1; +} - gossip_debug(GOSSIP_DIR_DEBUG, - "Readdir downcall status is %d. ret:%d\n", - new_op->downcall.status, - ret); +static int orangefs_dir_fill(struct orangefs_inode_s *oi, + struct orangefs_dir *od, struct dentry *dentry, + struct dir_context *ctx) +{ + struct orangefs_dir_part *part; + size_t count; - orangefs_readdir_index_put(buffer_index); + count = ((ctx->pos & PART_MASK) >> PART_SHIFT) - 1; - if (ret == -EAGAIN && op_state_purged(new_op)) { - /* Client-core indices are invalid after it restarted. */ - gossip_debug(GOSSIP_DIR_DEBUG, - "%s: Getting new buffer_index for retry of readdir..\n", - __func__); - goto get_new_buffer_index; + part = od->part; + while (part->next && count) { + count--; + part = part->next; } - - if (ret == -EIO && op_state_purged(new_op)) { - gossip_err("%s: Client is down. Aborting readdir call.\n", - __func__); - goto out_free_op; + /* This means the userspace file offset is invalid. */ + if (count) { + od->error = -EIO; + return -EIO; } - if (ret < 0 || new_op->downcall.status != 0) { - gossip_debug(GOSSIP_DIR_DEBUG, - "Readdir request failed. Status:%d\n", - new_op->downcall.status); - if (ret >= 0) - ret = new_op->downcall.status; - goto out_free_op; + while (part && part->len) { + int r; + r = fill_from_part(part, ctx); + if (r < 0) { + od->error = r; + return r; + } else if (r == 0) { + /* Userspace buffer is full. */ + break; + } else { + /* + * The part ran out of data. Move to the next + * part. */ + ctx->pos = (ctx->pos & PART_MASK) + + (1 << PART_SHIFT); + part = part->next; + } } + return 0; +} - dents_buf = new_op->downcall.trailer_buf; - if (dents_buf == NULL) { - gossip_err("Invalid NULL buffer in readdir response\n"); - ret = -ENOMEM; - goto out_free_op; +static loff_t orangefs_dir_llseek(struct file *file, loff_t offset, + int whence) +{ + struct orangefs_dir *od = file->private_data; + /* + * Delete the stored data so userspace sees new directory + * entries. + */ + if (!whence && offset < od->end) { + struct orangefs_dir_part *part = od->part; + while (part) { + struct orangefs_dir_part *next = part->next; + vfree(part); + part = next; + } + od->token = ORANGEFS_ITERATE_START; + od->part = NULL; + od->end = 1 << PART_SHIFT; } + return default_llseek(file, offset, whence); +} - bytes_decoded = decode_dirents(dents_buf, new_op->downcall.trailer_size, - &readdir_response); - if (bytes_decoded < 0) { - ret = bytes_decoded; - gossip_err("Could not decode readdir from buffer %d\n", ret); - goto out_vfree; - } +static int orangefs_dir_iterate(struct file *file, + struct dir_context *ctx) +{ + struct orangefs_inode_s *oi; + struct orangefs_dir *od; + struct dentry *dentry; + int r; - if (bytes_decoded != new_op->downcall.trailer_size) { - gossip_err("orangefs_readdir: # bytes decoded (%ld) " - "!= trailer size (%ld)\n", - bytes_decoded, - (long)new_op->downcall.trailer_size); - ret = -EINVAL; - goto out_destroy_handle; - } + dentry = file->f_path.dentry; + oi = ORANGEFS_I(dentry->d_inode); + od = file->private_data; - /* - * orangefs doesn't actually store dot and dot-dot, but - * we need to have them represented. - */ - if (pos == 0) { - ino = get_ino_from_khandle(dentry->d_inode); - gossip_debug(GOSSIP_DIR_DEBUG, - "%s: calling dir_emit of \".\" with pos = %llu\n", - __func__, - llu(pos)); - ret = dir_emit(ctx, ".", 1, ino, DT_DIR); - pos += 1; - } + if (od->error) + return od->error; - if (pos == 1) { - ino = get_parent_ino_from_dentry(dentry); - gossip_debug(GOSSIP_DIR_DEBUG, - "%s: calling dir_emit of \"..\" with pos = %llu\n", - __func__, - llu(pos)); - ret = dir_emit(ctx, "..", 2, ino, DT_DIR); - pos += 1; + if (ctx->pos == 0) { + if (!dir_emit_dot(file, ctx)) + return 0; + ctx->pos++; + } + if (ctx->pos == 1) { + if (!dir_emit_dotdot(file, ctx)) + return 0; + ctx->pos = 1 << PART_SHIFT; } /* - * we stored ORANGEFS_ITERATE_NEXT in ctx->pos last time around - * to prevent "finding" dot and dot-dot on any iteration - * other than the first. + * The seek position is in the first synthesized part but is not + * valid. */ - if (ctx->pos == ORANGEFS_ITERATE_NEXT) - ctx->pos = 0; - - gossip_debug(GOSSIP_DIR_DEBUG, - "%s: dirent_outcount:%d:\n", - __func__, - readdir_response.orangefs_dirent_outcount); - for (i = ctx->pos; - i < readdir_response.orangefs_dirent_outcount; - i++) { - len = readdir_response.dirent_array[i].d_length; - current_entry = readdir_response.dirent_array[i].d_name; - current_ino = orangefs_khandle_to_ino( - &readdir_response.dirent_array[i].khandle); - - gossip_debug(GOSSIP_DIR_DEBUG, - "calling dir_emit for %s with len %d" - ", ctx->pos %ld\n", - current_entry, - len, - (unsigned long)ctx->pos); - /* - * type is unknown. We don't return object type - * in the dirent_array. This leaves getdents - * clueless about type. - */ - ret = - dir_emit(ctx, current_entry, len, current_ino, DT_UNKNOWN); - if (!ret) - break; - ctx->pos++; - gossip_debug(GOSSIP_DIR_DEBUG, - "%s: ctx->pos:%lld\n", - __func__, - lld(ctx->pos)); + if ((ctx->pos & PART_MASK) == 0) + return -EIO; - } + r = 0; /* - * we ran all the way through the last batch, set up for - * getting another batch... + * Must read more if the user has sought past what has been read + * so far. Stop a user who has sought past the end. */ - if (ret) { - *ptoken = readdir_response.token; - ctx->pos = ORANGEFS_ITERATE_NEXT; + while (od->token != ORANGEFS_ITERATE_END && + ctx->pos > od->end) { + r = orangefs_dir_more(oi, od, dentry); + if (r) + return r; + } + if (od->token == ORANGEFS_ITERATE_END && ctx->pos > od->end) + return -EIO; + + /* Then try to fill if there's any left in the buffer. */ + if (ctx->pos < od->end) { + r = orangefs_dir_fill(oi, od, dentry, ctx); + if (r) + return r; } - /* - * Did we hit the end of the directory? - */ - if (readdir_response.token == ORANGEFS_READDIR_END) { - gossip_debug(GOSSIP_DIR_DEBUG, - "End of dir detected; setting ctx->pos to ORANGEFS_READDIR_END.\n"); - ctx->pos = ORANGEFS_READDIR_END; + /* Finally get some more and try to fill. */ + if (od->token != ORANGEFS_ITERATE_END) { + r = orangefs_dir_more(oi, od, dentry); + if (r) + return r; + r = orangefs_dir_fill(oi, od, dentry, ctx); } -out_destroy_handle: - /* kfree(NULL) is safe */ - kfree(readdir_response.dirent_array); -out_vfree: - gossip_debug(GOSSIP_DIR_DEBUG, "vfree %p\n", dents_buf); - vfree(dents_buf); -out_free_op: - op_release(new_op); - gossip_debug(GOSSIP_DIR_DEBUG, "orangefs_readdir returning %d\n", ret); - return ret; + return r; } static int orangefs_dir_open(struct inode *inode, struct file *file) { - __u64 *ptoken; - - file->private_data = kmalloc(sizeof(__u64), GFP_KERNEL); + struct orangefs_dir *od; + file->private_data = kmalloc(sizeof(struct orangefs_dir), + GFP_KERNEL); if (!file->private_data) return -ENOMEM; - - ptoken = file->private_data; - *ptoken = ORANGEFS_READDIR_START; + od = file->private_data; + od->token = ORANGEFS_ITERATE_START; + od->part = NULL; + od->end = 1 << PART_SHIFT; + od->error = 0; return 0; } static int orangefs_dir_release(struct inode *inode, struct file *file) { + struct orangefs_dir *od = file->private_data; + struct orangefs_dir_part *part = od->part; orangefs_flush_inode(inode); - kfree(file->private_data); + while (part) { + struct orangefs_dir_part *next = part->next; + vfree(part); + part = next; + } + kfree(od); return 0; } -/** ORANGEFS implementation of VFS directory operations */ const struct file_operations orangefs_dir_operations = { + .llseek = orangefs_dir_llseek, .read = generic_read_dir, - .iterate = orangefs_readdir, + .iterate = orangefs_dir_iterate, .open = orangefs_dir_open, - .release = orangefs_dir_release, + .release = orangefs_dir_release }; diff --git a/fs/orangefs/downcall.h b/fs/orangefs/downcall.h index 3b8923f8bf21..163001c95501 100644 --- a/fs/orangefs/downcall.h +++ b/fs/orangefs/downcall.h @@ -40,16 +40,6 @@ struct orangefs_mkdir_response { struct orangefs_object_kref refn; }; -/* - * duplication of some system interface structures so that I don't have - * to allocate extra memory - */ -struct orangefs_dirent { - char *d_name; - int d_length; - struct orangefs_khandle khandle; -}; - struct orangefs_statfs_response { __s64 block_size; __s64 blocks_total; @@ -131,12 +121,16 @@ struct orangefs_downcall_s { } resp; }; +/* + * The readdir response comes in the trailer. It is followed by the + * directory entries as described in dir.c. + */ + struct orangefs_readdir_response_s { __u64 token; __u64 directory_version; __u32 __pad2; __u32 orangefs_dirent_outcount; - struct orangefs_dirent *dirent_array; }; #endif /* __DOWNCALL_H */ diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c index e6bbc8083d77..28f38d813ad2 100644 --- a/fs/orangefs/file.c +++ b/fs/orangefs/file.c @@ -114,7 +114,6 @@ static ssize_t wait_for_direct_io(enum ORANGEFS_io_type type, struct inode *inod struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); struct orangefs_khandle *handle = &orangefs_inode->refn.khandle; struct orangefs_kernel_op_s *new_op = NULL; - struct iov_iter saved = *iter; int buffer_index = -1; ssize_t ret; @@ -193,7 +192,7 @@ populate_shared_memory: orangefs_bufmap_put(buffer_index); buffer_index = -1; if (type == ORANGEFS_IO_WRITE) - *iter = saved; + iov_iter_revert(iter, total_size); gossip_debug(GOSSIP_FILE_DEBUG, "%s:going to repopulate_shared_memory.\n", __func__); @@ -475,7 +474,8 @@ static ssize_t orangefs_file_write_iter(struct kiocb *iocb, struct iov_iter *ite /* Make sure generic_write_checks sees an up to date inode size. */ if (file->f_flags & O_APPEND) { - rc = orangefs_inode_getattr(file->f_mapping->host, 0, 1); + rc = orangefs_inode_getattr(file->f_mapping->host, 0, 1, + STATX_SIZE); if (rc == -ESTALE) rc = -EIO; if (rc) { @@ -693,7 +693,8 @@ static loff_t orangefs_file_llseek(struct file *file, loff_t offset, int origin) * NOTE: We are only interested in file size here, * so we set mask accordingly. */ - ret = orangefs_inode_getattr(file->f_mapping->host, 0, 1); + ret = orangefs_inode_getattr(file->f_mapping->host, 0, 1, + STATX_SIZE); if (ret == -ESTALE) ret = -EIO; if (ret) { diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index a304bf34b212..9428ea0aac16 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -161,7 +161,7 @@ static int orangefs_setattr_size(struct inode *inode, struct iattr *iattr) iattr->ia_size); /* Ensure that we have a up to date size, so we know if it changed. */ - ret = orangefs_inode_getattr(inode, 0, 1); + ret = orangefs_inode_getattr(inode, 0, 1, STATX_SIZE); if (ret == -ESTALE) ret = -EIO; if (ret) { @@ -218,8 +218,7 @@ int orangefs_setattr(struct dentry *dentry, struct iattr *iattr) if (ret) goto out; - if ((iattr->ia_valid & ATTR_SIZE) && - iattr->ia_size != i_size_read(inode)) { + if (iattr->ia_valid & ATTR_SIZE) { ret = orangefs_setattr_size(inode, iattr); if (ret) goto out; @@ -256,13 +255,19 @@ int orangefs_getattr(const struct path *path, struct kstat *stat, "orangefs_getattr: called on %pd\n", path->dentry); - ret = orangefs_inode_getattr(inode, 0, 0); + ret = orangefs_inode_getattr(inode, 0, 0, request_mask); if (ret == 0) { generic_fillattr(inode, stat); /* override block size reported to stat */ orangefs_inode = ORANGEFS_I(inode); stat->blksize = orangefs_inode->blksize; + + if (request_mask & STATX_SIZE) + stat->result_mask = STATX_BASIC_STATS; + else + stat->result_mask = STATX_BASIC_STATS & + ~STATX_SIZE; } return ret; } @@ -277,7 +282,7 @@ int orangefs_permission(struct inode *inode, int mask) gossip_debug(GOSSIP_INODE_DEBUG, "%s: refreshing\n", __func__); /* Make sure the permission (and other common attrs) are up to date. */ - ret = orangefs_inode_getattr(inode, 0, 0); + ret = orangefs_inode_getattr(inode, 0, 0, STATX_MODE); if (ret < 0) return ret; @@ -375,7 +380,7 @@ struct inode *orangefs_iget(struct super_block *sb, struct orangefs_object_kref if (!inode || !(inode->i_state & I_NEW)) return inode; - error = orangefs_inode_getattr(inode, 1, 1); + error = orangefs_inode_getattr(inode, 1, 1, STATX_ALL); if (error) { iget_failed(inode); return ERR_PTR(error); @@ -420,7 +425,7 @@ struct inode *orangefs_new_inode(struct super_block *sb, struct inode *dir, orangefs_set_inode(inode, ref); inode->i_ino = hash; /* needed for stat etc */ - error = orangefs_inode_getattr(inode, 1, 1); + error = orangefs_inode_getattr(inode, 1, 1, STATX_ALL); if (error) goto out_iput; diff --git a/fs/orangefs/namei.c b/fs/orangefs/namei.c index a290ff6ec756..478e88bd7f9d 100644 --- a/fs/orangefs/namei.c +++ b/fs/orangefs/namei.c @@ -74,6 +74,7 @@ static int orangefs_create(struct inode *dir, unlock_new_inode(inode); orangefs_set_timeout(dentry); ORANGEFS_I(inode)->getattr_time = jiffies - 1; + ORANGEFS_I(inode)->getattr_mask = STATX_BASIC_STATS; gossip_debug(GOSSIP_NAME_DEBUG, "%s: dentry instantiated for %pd\n", @@ -193,8 +194,6 @@ static struct dentry *orangefs_lookup(struct inode *dir, struct dentry *dentry, goto out; } - ORANGEFS_I(inode)->getattr_time = jiffies - 1; - gossip_debug(GOSSIP_NAME_DEBUG, "%s:%s:%d " "Found good inode [%lu] with count [%d]\n", @@ -324,6 +323,7 @@ static int orangefs_symlink(struct inode *dir, unlock_new_inode(inode); orangefs_set_timeout(dentry); ORANGEFS_I(inode)->getattr_time = jiffies - 1; + ORANGEFS_I(inode)->getattr_mask = STATX_BASIC_STATS; gossip_debug(GOSSIP_NAME_DEBUG, "Inode (Symlink) %pU -> %pd\n", @@ -388,6 +388,7 @@ static int orangefs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode unlock_new_inode(inode); orangefs_set_timeout(dentry); ORANGEFS_I(inode)->getattr_time = jiffies - 1; + ORANGEFS_I(inode)->getattr_mask = STATX_BASIC_STATS; gossip_debug(GOSSIP_NAME_DEBUG, "Inode (Directory) %pU -> %pd\n", diff --git a/fs/orangefs/orangefs-bufmap.c b/fs/orangefs/orangefs-bufmap.c index 6333cbbdfef7..83b506020718 100644 --- a/fs/orangefs/orangefs-bufmap.c +++ b/fs/orangefs/orangefs-bufmap.c @@ -521,13 +521,11 @@ int orangefs_bufmap_copy_from_iovec(struct iov_iter *iter, size_t n = size; if (n > PAGE_SIZE) n = PAGE_SIZE; - n = copy_page_from_iter(page, 0, n, iter); - if (!n) + if (copy_page_from_iter(page, 0, n, iter) != n) return -EFAULT; size -= n; } return 0; - } /* diff --git a/fs/orangefs/orangefs-debugfs.c b/fs/orangefs/orangefs-debugfs.c index 791912da97d7..716ed337f166 100644 --- a/fs/orangefs/orangefs-debugfs.c +++ b/fs/orangefs/orangefs-debugfs.c @@ -440,6 +440,9 @@ static ssize_t orangefs_debug_write(struct file *file, "orangefs_debug_write: %pD\n", file); + if (count == 0) + return 0; + /* * Thwart users who try to jamb a ridiculous number * of bytes into the debug file... diff --git a/fs/orangefs/orangefs-dev-proto.h b/fs/orangefs/orangefs-dev-proto.h index f380f9ed1b28..efe08c763e56 100644 --- a/fs/orangefs/orangefs-dev-proto.h +++ b/fs/orangefs/orangefs-dev-proto.h @@ -52,12 +52,7 @@ */ #define ORANGEFS_MAX_DEBUG_STRING_LEN 0x00000800 -/* - * The maximum number of directory entries in a single request is 96. - * XXX: Why can this not be higher. The client-side code can handle up to 512. - * XXX: What happens if we expect more than the client can return? - */ -#define ORANGEFS_MAX_DIRENT_COUNT_READDIR 96 +#define ORANGEFS_MAX_DIRENT_COUNT_READDIR 512 #include "upcall.h" #include "downcall.h" diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h index 5e48a0be9761..ea0ce507a6ab 100644 --- a/fs/orangefs/orangefs-kernel.h +++ b/fs/orangefs/orangefs-kernel.h @@ -215,6 +215,7 @@ struct orangefs_inode_s { unsigned long pinode_flags; unsigned long getattr_time; + u32 getattr_mask; }; #define P_ATIME_FLAG 0 @@ -249,6 +250,7 @@ struct orangefs_sb_info_s { char devname[ORANGEFS_MAX_SERVER_ADDR_LEN]; struct super_block *sb; int mount_pending; + int no_list; struct list_head list; }; @@ -339,11 +341,6 @@ static inline struct orangefs_khandle *get_khandle_from_ino(struct inode *inode) return &(ORANGEFS_I(inode)->refn.khandle); } -static inline __s32 get_fsid_from_ino(struct inode *inode) -{ - return ORANGEFS_I(inode)->refn.fs_id; -} - static inline ino_t get_ino_from_khandle(struct inode *inode) { struct orangefs_khandle *khandle; @@ -499,7 +496,8 @@ int orangefs_inode_setxattr(struct inode *inode, size_t size, int flags); -int orangefs_inode_getattr(struct inode *inode, int new, int bypass); +int orangefs_inode_getattr(struct inode *inode, int new, int bypass, + u32 request_mask); int orangefs_inode_check_changed(struct inode *inode); diff --git a/fs/orangefs/orangefs-utils.c b/fs/orangefs/orangefs-utils.c index 9b96b99539d6..aab6f1842963 100644 --- a/fs/orangefs/orangefs-utils.c +++ b/fs/orangefs/orangefs-utils.c @@ -251,7 +251,8 @@ static int orangefs_inode_is_stale(struct inode *inode, int new, return 0; } -int orangefs_inode_getattr(struct inode *inode, int new, int bypass) +int orangefs_inode_getattr(struct inode *inode, int new, int bypass, + u32 request_mask) { struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); struct orangefs_kernel_op_s *new_op; @@ -262,7 +263,13 @@ int orangefs_inode_getattr(struct inode *inode, int new, int bypass) get_khandle_from_ino(inode)); if (!new && !bypass) { - if (time_before(jiffies, orangefs_inode->getattr_time)) + /* + * Must have all the attributes in the mask and be within cache + * time. + */ + if ((request_mask & orangefs_inode->getattr_mask) == + request_mask && + time_before(jiffies, orangefs_inode->getattr_time)) return 0; } @@ -270,7 +277,15 @@ int orangefs_inode_getattr(struct inode *inode, int new, int bypass) if (!new_op) return -ENOMEM; new_op->upcall.req.getattr.refn = orangefs_inode->refn; - new_op->upcall.req.getattr.mask = ORANGEFS_ATTR_SYS_ALL_NOHINT; + /* + * Size is the hardest attribute to get. The incremental cost of any + * other attribute is essentially zero. + */ + if (request_mask & STATX_SIZE || new) + new_op->upcall.req.getattr.mask = ORANGEFS_ATTR_SYS_ALL_NOHINT; + else + new_op->upcall.req.getattr.mask = + ORANGEFS_ATTR_SYS_ALL_NOHINT & ~ORANGEFS_ATTR_SYS_SIZE; ret = service_operation(new_op, __func__, get_interruptible_flag(inode)); @@ -291,25 +306,29 @@ int orangefs_inode_getattr(struct inode *inode, int new, int bypass) case S_IFREG: inode->i_flags = orangefs_inode_flags(&new_op-> downcall.resp.getattr.attributes); - inode_size = (loff_t)new_op-> - downcall.resp.getattr.attributes.size; - rounded_up_size = - (inode_size + (4096 - (inode_size % 4096))); - inode->i_size = inode_size; - orangefs_inode->blksize = - new_op->downcall.resp.getattr.attributes.blksize; - spin_lock(&inode->i_lock); - inode->i_bytes = inode_size; - inode->i_blocks = - (unsigned long)(rounded_up_size / 512); - spin_unlock(&inode->i_lock); + if (request_mask & STATX_SIZE || new) { + inode_size = (loff_t)new_op-> + downcall.resp.getattr.attributes.size; + rounded_up_size = + (inode_size + (4096 - (inode_size % 4096))); + inode->i_size = inode_size; + orangefs_inode->blksize = + new_op->downcall.resp.getattr.attributes.blksize; + spin_lock(&inode->i_lock); + inode->i_bytes = inode_size; + inode->i_blocks = + (unsigned long)(rounded_up_size / 512); + spin_unlock(&inode->i_lock); + } break; case S_IFDIR: - inode->i_size = PAGE_SIZE; - orangefs_inode->blksize = i_blocksize(inode); - spin_lock(&inode->i_lock); - inode_set_bytes(inode, inode->i_size); - spin_unlock(&inode->i_lock); + if (request_mask & STATX_SIZE || new) { + inode->i_size = PAGE_SIZE; + orangefs_inode->blksize = i_blocksize(inode); + spin_lock(&inode->i_lock); + inode_set_bytes(inode, inode->i_size); + spin_unlock(&inode->i_lock); + } set_nlink(inode, 1); break; case S_IFLNK: @@ -349,6 +368,10 @@ int orangefs_inode_getattr(struct inode *inode, int new, int bypass) orangefs_inode->getattr_time = jiffies + orangefs_getattr_timeout_msecs*HZ/1000; + if (request_mask & STATX_SIZE || new) + orangefs_inode->getattr_mask = STATX_BASIC_STATS; + else + orangefs_inode->getattr_mask = STATX_BASIC_STATS & ~STATX_SIZE; ret = 0; out: op_release(new_op); @@ -500,41 +523,6 @@ int orangefs_flush_inode(struct inode *inode) return ret; } -int orangefs_unmount_sb(struct super_block *sb) -{ - int ret = -EINVAL; - struct orangefs_kernel_op_s *new_op = NULL; - - gossip_debug(GOSSIP_UTILS_DEBUG, - "orangefs_unmount_sb called on sb %p\n", - sb); - - new_op = op_alloc(ORANGEFS_VFS_OP_FS_UMOUNT); - if (!new_op) - return -ENOMEM; - new_op->upcall.req.fs_umount.id = ORANGEFS_SB(sb)->id; - new_op->upcall.req.fs_umount.fs_id = ORANGEFS_SB(sb)->fs_id; - strncpy(new_op->upcall.req.fs_umount.orangefs_config_server, - ORANGEFS_SB(sb)->devname, - ORANGEFS_MAX_SERVER_ADDR_LEN); - - gossip_debug(GOSSIP_UTILS_DEBUG, - "Attempting ORANGEFS Unmount via host %s\n", - new_op->upcall.req.fs_umount.orangefs_config_server); - - ret = service_operation(new_op, "orangefs_fs_umount", 0); - - gossip_debug(GOSSIP_UTILS_DEBUG, - "orangefs_unmount: got return value of %d\n", ret); - if (ret) - sb = ERR_PTR(ret); - else - ORANGEFS_SB(sb)->mount_pending = 1; - - op_release(new_op); - return ret; -} - void orangefs_make_bad_inode(struct inode *inode) { if (is_root_handle(inode)) { diff --git a/fs/orangefs/protocol.h b/fs/orangefs/protocol.h index 971307ad69be..48bcc1bbe415 100644 --- a/fs/orangefs/protocol.h +++ b/fs/orangefs/protocol.h @@ -138,13 +138,8 @@ typedef __s64 ORANGEFS_offset; #define ORANGEFS_G_SGID (1 << 10) #define ORANGEFS_U_SUID (1 << 11) -/* definition taken from stdint.h */ -#define INT32_MAX (2147483647) -#define ORANGEFS_ITERATE_START (INT32_MAX - 1) -#define ORANGEFS_ITERATE_END (INT32_MAX - 2) -#define ORANGEFS_ITERATE_NEXT (INT32_MAX - 3) -#define ORANGEFS_READDIR_START ORANGEFS_ITERATE_START -#define ORANGEFS_READDIR_END ORANGEFS_ITERATE_END +#define ORANGEFS_ITERATE_START 2147483646 +#define ORANGEFS_ITERATE_END 2147483645 #define ORANGEFS_IMMUTABLE_FL FS_IMMUTABLE_FL #define ORANGEFS_APPEND_FL FS_APPEND_FL #define ORANGEFS_NOATIME_FL FS_NOATIME_FL diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c index cd261c8de53a..5c7c273e17ec 100644 --- a/fs/orangefs/super.c +++ b/fs/orangefs/super.c @@ -376,6 +376,25 @@ static const struct export_operations orangefs_export_ops = { .fh_to_dentry = orangefs_fh_to_dentry, }; +static int orangefs_unmount(int id, __s32 fs_id, const char *devname) +{ + struct orangefs_kernel_op_s *op; + int r; + op = op_alloc(ORANGEFS_VFS_OP_FS_UMOUNT); + if (!op) + return -ENOMEM; + op->upcall.req.fs_umount.id = id; + op->upcall.req.fs_umount.fs_id = fs_id; + strncpy(op->upcall.req.fs_umount.orangefs_config_server, + devname, ORANGEFS_MAX_SERVER_ADDR_LEN); + r = service_operation(op, "orangefs_fs_umount", 0); + /* Not much to do about an error here. */ + if (r) + gossip_err("orangefs_unmount: service_operation %d\n", r); + op_release(op); + return r; +} + static int orangefs_fill_sb(struct super_block *sb, struct orangefs_fs_mount_response *fs_mount, void *data, int silent) @@ -484,6 +503,8 @@ struct dentry *orangefs_mount(struct file_system_type *fst, if (IS_ERR(sb)) { d = ERR_CAST(sb); + orangefs_unmount(new_op->downcall.resp.fs_mount.id, + new_op->downcall.resp.fs_mount.fs_id, devname); goto free_op; } @@ -493,7 +514,7 @@ struct dentry *orangefs_mount(struct file_system_type *fst, if (ret) { d = ERR_PTR(ret); - goto free_op; + goto free_sb_and_op; } /* @@ -519,6 +540,9 @@ struct dentry *orangefs_mount(struct file_system_type *fst, spin_unlock(&orangefs_superblocks_lock); op_release(new_op); + /* Must be removed from the list now. */ + ORANGEFS_SB(sb)->no_list = 0; + if (orangefs_userspace_version >= 20906) { new_op = op_alloc(ORANGEFS_VFS_OP_FEATURES); if (!new_op) @@ -533,6 +557,11 @@ struct dentry *orangefs_mount(struct file_system_type *fst, return dget(sb->s_root); +free_sb_and_op: + /* Will call orangefs_kill_sb with sb not in list. */ + ORANGEFS_SB(sb)->no_list = 1; + /* ORANGEFS_VFS_OP_FS_UMOUNT is done by orangefs_kill_sb. */ + deactivate_locked_super(sb); free_op: gossip_err("orangefs_mount: mount request failed with %d\n", ret); if (ret == -EINVAL) { @@ -547,6 +576,7 @@ free_op: void orangefs_kill_sb(struct super_block *sb) { + int r; gossip_debug(GOSSIP_SUPER_DEBUG, "orangefs_kill_sb: called\n"); /* provided sb cleanup */ @@ -556,14 +586,19 @@ void orangefs_kill_sb(struct super_block *sb) * issue the unmount to userspace to tell it to remove the * dynamic mount info it has for this superblock */ - orangefs_unmount_sb(sb); - - /* remove the sb from our list of orangefs specific sb's */ - - spin_lock(&orangefs_superblocks_lock); - __list_del_entry(&ORANGEFS_SB(sb)->list); /* not list_del_init */ - ORANGEFS_SB(sb)->list.prev = NULL; - spin_unlock(&orangefs_superblocks_lock); + r = orangefs_unmount(ORANGEFS_SB(sb)->id, ORANGEFS_SB(sb)->fs_id, + ORANGEFS_SB(sb)->devname); + if (!r) + ORANGEFS_SB(sb)->mount_pending = 1; + + if (!ORANGEFS_SB(sb)->no_list) { + /* remove the sb from our list of orangefs specific sb's */ + spin_lock(&orangefs_superblocks_lock); + /* not list_del_init */ + __list_del_entry(&ORANGEFS_SB(sb)->list); + ORANGEFS_SB(sb)->list.prev = NULL; + spin_unlock(&orangefs_superblocks_lock); + } /* * make sure that ORANGEFS_DEV_REMOUNT_ALL loop that might've seen us diff --git a/fs/orangefs/waitqueue.c b/fs/orangefs/waitqueue.c index abcfa3fa9992..61e2ca7fec55 100644 --- a/fs/orangefs/waitqueue.c +++ b/fs/orangefs/waitqueue.c @@ -124,7 +124,14 @@ retry_servicing: gossip_debug(GOSSIP_WAIT_DEBUG, "%s:client core is NOT in service.\n", __func__); - timeout = op_timeout_secs * HZ; + /* + * Don't wait for the userspace component to return if + * the filesystem is being umounted anyway. + */ + if (op->upcall.type == ORANGEFS_VFS_OP_FS_UMOUNT) + timeout = 0; + else + timeout = op_timeout_secs * HZ; } spin_unlock(&orangefs_request_list_lock); diff --git a/fs/orangefs/xattr.c b/fs/orangefs/xattr.c index 74a81b1daaac..237c9c04dc3b 100644 --- a/fs/orangefs/xattr.c +++ b/fs/orangefs/xattr.c @@ -76,11 +76,8 @@ ssize_t orangefs_inode_getxattr(struct inode *inode, const char *name, if (S_ISLNK(inode->i_mode)) return -EOPNOTSUPP; - if (strlen(name) >= ORANGEFS_MAX_XATTR_NAMELEN) { - gossip_err("Invalid key length (%d)\n", - (int)strlen(name)); + if (strlen(name) > ORANGEFS_MAX_XATTR_NAMELEN) return -EINVAL; - } fsuid = from_kuid(&init_user_ns, current_fsuid()); fsgid = from_kgid(&init_user_ns, current_fsgid()); @@ -172,6 +169,9 @@ static int orangefs_inode_removexattr(struct inode *inode, const char *name, struct orangefs_kernel_op_s *new_op = NULL; int ret = -ENOMEM; + if (strlen(name) > ORANGEFS_MAX_XATTR_NAMELEN) + return -EINVAL; + down_write(&orangefs_inode->xattr_sem); new_op = op_alloc(ORANGEFS_VFS_OP_REMOVEXATTR); if (!new_op) @@ -231,23 +231,13 @@ int orangefs_inode_setxattr(struct inode *inode, const char *name, "%s: name %s, buffer_size %zd\n", __func__, name, size); - if (size >= ORANGEFS_MAX_XATTR_VALUELEN || - flags < 0) { - gossip_err("orangefs_inode_setxattr: bogus values of size(%d), flags(%d)\n", - (int)size, - flags); + if (size > ORANGEFS_MAX_XATTR_VALUELEN) + return -EINVAL; + if (strlen(name) > ORANGEFS_MAX_XATTR_NAMELEN) return -EINVAL; - } internal_flag = convert_to_internal_xattr_flags(flags); - if (strlen(name) >= ORANGEFS_MAX_XATTR_NAMELEN) { - gossip_err - ("orangefs_inode_setxattr: bogus key size (%d)\n", - (int)(strlen(name))); - return -EINVAL; - } - /* This is equivalent to a removexattr */ if (size == 0 && value == NULL) { gossip_debug(GOSSIP_XATTR_DEBUG, @@ -358,7 +348,7 @@ try_again: returned_count = new_op->downcall.resp.listxattr.returned_count; if (returned_count < 0 || - returned_count >= ORANGEFS_MAX_XATTR_LISTLEN) { + returned_count > ORANGEFS_MAX_XATTR_LISTLEN) { gossip_err("%s: impossible value for returned_count:%d:\n", __func__, returned_count); diff --git a/fs/proc/base.c b/fs/proc/base.c index c87b6b9a8a76..9e3ac5c11780 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2834,6 +2834,15 @@ static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns, return err; } +#ifdef CONFIG_LIVEPATCH +static int proc_pid_patch_state(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + seq_printf(m, "%d\n", task->patch_state); + return 0; +} +#endif /* CONFIG_LIVEPATCH */ + /* * Thread groups */ @@ -2933,6 +2942,9 @@ static const struct pid_entry tgid_base_stuff[] = { REG("timers", S_IRUGO, proc_timers_operations), #endif REG("timerslack_ns", S_IRUGO|S_IWUGO, proc_pid_set_timerslack_ns_operations), +#ifdef CONFIG_LIVEPATCH + ONE("patch_state", S_IRUSR, proc_pid_patch_state), +#endif }; static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx) @@ -3315,6 +3327,9 @@ static const struct pid_entry tid_base_stuff[] = { REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations), REG("setgroups", S_IRUGO|S_IWUSR, proc_setgroups_operations), #endif +#ifdef CONFIG_LIVEPATCH + ONE("patch_state", S_IRUSR, proc_pid_patch_state), +#endif }; static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx) diff --git a/fs/proc/generic.c b/fs/proc/generic.c index ee27feb34cf4..9425c0d97262 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -472,6 +472,7 @@ struct proc_dir_entry *proc_create_mount_point(const char *name) ent->data = NULL; ent->proc_fops = NULL; ent->proc_iops = NULL; + parent->nlink++; if (proc_register(parent, ent) < 0) { kfree(ent); parent->nlink--; diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index d04ea4349909..67985a7233c2 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -408,10 +408,6 @@ static void next_entry(struct ctl_table_header **phead, struct ctl_table **pentr *pentry = entry; } -void register_sysctl_root(struct ctl_table_root *root) -{ -} - /* * sysctl_perm does NOT grant the superuser all rights automatically, because * some sysctl variables are readonly even to root. diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index f08bd31c1081..f0c8b33d99b1 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -441,6 +441,7 @@ struct mem_size_stats { unsigned long private_dirty; unsigned long referenced; unsigned long anonymous; + unsigned long lazyfree; unsigned long anonymous_thp; unsigned long shmem_thp; unsigned long swap; @@ -457,8 +458,11 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page, int i, nr = compound ? 1 << compound_order(page) : 1; unsigned long size = nr * PAGE_SIZE; - if (PageAnon(page)) + if (PageAnon(page)) { mss->anonymous += size; + if (!PageSwapBacked(page) && !dirty && !PageDirty(page)) + mss->lazyfree += size; + } mss->resident += size; /* Accumulate the size in pages that have been accessed. */ @@ -771,6 +775,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid) "Private_Dirty: %8lu kB\n" "Referenced: %8lu kB\n" "Anonymous: %8lu kB\n" + "LazyFree: %8lu kB\n" "AnonHugePages: %8lu kB\n" "ShmemPmdMapped: %8lu kB\n" "Shared_Hugetlb: %8lu kB\n" @@ -789,6 +794,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid) mss.private_dirty >> 10, mss.referenced >> 10, mss.anonymous >> 10, + mss.lazyfree >> 10, mss.anonymous_thp >> 10, mss.shmem_thp >> 10, mss.shared_hugetlb >> 10, @@ -900,7 +906,14 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma, static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp) { - pmd_t pmd = pmdp_huge_get_and_clear(vma->vm_mm, addr, pmdp); + pmd_t pmd = *pmdp; + + /* See comment in change_huge_pmd() */ + pmdp_invalidate(vma, addr, pmdp); + if (pmd_dirty(*pmdp)) + pmd = pmd_mkdirty(pmd); + if (pmd_young(*pmdp)) + pmd = pmd_mkyoung(pmd); pmd = pmd_wrprotect(pmd); pmd = pmd_clear_soft_dirty(pmd); diff --git a/fs/pstore/ftrace.c b/fs/pstore/ftrace.c index 899d0ba0bd6c..06aab07b6bb7 100644 --- a/fs/pstore/ftrace.c +++ b/fs/pstore/ftrace.c @@ -37,6 +37,12 @@ static void notrace pstore_ftrace_call(unsigned long ip, { unsigned long flags; struct pstore_ftrace_record rec = {}; + struct pstore_record record = { + .type = PSTORE_TYPE_FTRACE, + .buf = (char *)&rec, + .size = sizeof(rec), + .psi = psinfo, + }; if (unlikely(oops_in_progress)) return; @@ -47,8 +53,7 @@ static void notrace pstore_ftrace_call(unsigned long ip, rec.parent_ip = parent_ip; pstore_ftrace_write_timestamp(&rec, pstore_ftrace_stamp++); pstore_ftrace_encode_cpu(&rec, raw_smp_processor_id()); - psinfo->write_buf(PSTORE_TYPE_FTRACE, 0, NULL, 0, (void *)&rec, - 0, sizeof(rec), psinfo); + psinfo->write(&record); local_irq_restore(flags); } @@ -117,7 +122,7 @@ void pstore_register_ftrace(void) { struct dentry *file; - if (!psinfo->write_buf) + if (!psinfo->write) return; pstore_ftrace_dir = debugfs_create_dir("pstore", NULL); diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index 57c0646479f5..792a4e5f9226 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -47,12 +47,8 @@ static LIST_HEAD(allpstore); struct pstore_private { struct list_head list; - struct pstore_info *psi; - enum pstore_type_id type; - u64 id; - int count; - ssize_t size; - char data[]; + struct pstore_record *record; + size_t total_size; }; struct pstore_ftrace_seq_data { @@ -63,6 +59,17 @@ struct pstore_ftrace_seq_data { #define REC_SIZE sizeof(struct pstore_ftrace_record) +static void free_pstore_private(struct pstore_private *private) +{ + if (!private) + return; + if (private->record) { + kfree(private->record->buf); + kfree(private->record); + } + kfree(private); +} + static void *pstore_ftrace_seq_start(struct seq_file *s, loff_t *pos) { struct pstore_private *ps = s->private; @@ -72,9 +79,9 @@ static void *pstore_ftrace_seq_start(struct seq_file *s, loff_t *pos) if (!data) return NULL; - data->off = ps->size % REC_SIZE; + data->off = ps->total_size % REC_SIZE; data->off += *pos * REC_SIZE; - if (data->off + REC_SIZE > ps->size) { + if (data->off + REC_SIZE > ps->total_size) { kfree(data); return NULL; } @@ -94,7 +101,7 @@ static void *pstore_ftrace_seq_next(struct seq_file *s, void *v, loff_t *pos) struct pstore_ftrace_seq_data *data = v; data->off += REC_SIZE; - if (data->off + REC_SIZE > ps->size) + if (data->off + REC_SIZE > ps->total_size) return NULL; (*pos)++; @@ -105,7 +112,9 @@ static int pstore_ftrace_seq_show(struct seq_file *s, void *v) { struct pstore_private *ps = s->private; struct pstore_ftrace_seq_data *data = v; - struct pstore_ftrace_record *rec = (void *)(ps->data + data->off); + struct pstore_ftrace_record *rec; + + rec = (struct pstore_ftrace_record *)(ps->record->buf + data->off); seq_printf(s, "CPU:%d ts:%llu %08lx %08lx %pf <- %pF\n", pstore_ftrace_decode_cpu(rec), @@ -125,7 +134,7 @@ static const struct seq_operations pstore_ftrace_seq_ops = { static int pstore_check_syslog_permissions(struct pstore_private *ps) { - switch (ps->type) { + switch (ps->record->type) { case PSTORE_TYPE_DMESG: case PSTORE_TYPE_CONSOLE: return check_syslog_permissions(SYSLOG_ACTION_READ_ALL, @@ -141,9 +150,10 @@ static ssize_t pstore_file_read(struct file *file, char __user *userbuf, struct seq_file *sf = file->private_data; struct pstore_private *ps = sf->private; - if (ps->type == PSTORE_TYPE_FTRACE) + if (ps->record->type == PSTORE_TYPE_FTRACE) return seq_read(file, userbuf, count, ppos); - return simple_read_from_buffer(userbuf, count, ppos, ps->data, ps->size); + return simple_read_from_buffer(userbuf, count, ppos, + ps->record->buf, ps->total_size); } static int pstore_file_open(struct inode *inode, struct file *file) @@ -157,7 +167,7 @@ static int pstore_file_open(struct inode *inode, struct file *file) if (err) return err; - if (ps->type == PSTORE_TYPE_FTRACE) + if (ps->record->type == PSTORE_TYPE_FTRACE) sops = &pstore_ftrace_seq_ops; err = seq_open(file, sops); @@ -193,20 +203,19 @@ static const struct file_operations pstore_file_operations = { static int pstore_unlink(struct inode *dir, struct dentry *dentry) { struct pstore_private *p = d_inode(dentry)->i_private; + struct pstore_record *record = p->record; int err; err = pstore_check_syslog_permissions(p); if (err) return err; - if (p->psi->erase) { - mutex_lock(&p->psi->read_mutex); - p->psi->erase(p->type, p->id, p->count, - d_inode(dentry)->i_ctime, p->psi); - mutex_unlock(&p->psi->read_mutex); - } else { + if (!record->psi->erase) return -EPERM; - } + + mutex_lock(&record->psi->read_mutex); + record->psi->erase(record); + mutex_unlock(&record->psi->read_mutex); return simple_unlink(dir, dentry); } @@ -221,7 +230,7 @@ static void pstore_evict_inode(struct inode *inode) spin_lock_irqsave(&allpstore_lock, flags); list_del(&p->list); spin_unlock_irqrestore(&allpstore_lock, flags); - kfree(p); + free_pstore_private(p); } } @@ -302,23 +311,23 @@ bool pstore_is_mounted(void) * Load it up with "size" bytes of data from "buf". * Set the mtime & ctime to the date that this record was originally stored. */ -int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count, - char *data, bool compressed, size_t size, - struct timespec time, struct pstore_info *psi) +int pstore_mkfile(struct dentry *root, struct pstore_record *record) { - struct dentry *root = pstore_sb->s_root; struct dentry *dentry; struct inode *inode; int rc = 0; char name[PSTORE_NAMELEN]; struct pstore_private *private, *pos; unsigned long flags; + size_t size = record->size + record->ecc_notice_size; + + WARN_ON(!inode_is_locked(d_inode(root))); spin_lock_irqsave(&allpstore_lock, flags); list_for_each_entry(pos, &allpstore, list) { - if (pos->type == type && - pos->id == id && - pos->psi == psi) { + if (pos->record->type == record->type && + pos->record->id == record->id && + pos->record->psi == record->psi) { rc = -EEXIST; break; } @@ -328,72 +337,74 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count, return rc; rc = -ENOMEM; - inode = pstore_get_inode(pstore_sb); + inode = pstore_get_inode(root->d_sb); if (!inode) goto fail; inode->i_mode = S_IFREG | 0444; inode->i_fop = &pstore_file_operations; - private = kmalloc(sizeof *private + size, GFP_KERNEL); + private = kzalloc(sizeof(*private), GFP_KERNEL); if (!private) goto fail_alloc; - private->type = type; - private->id = id; - private->count = count; - private->psi = psi; + private->record = record; - switch (type) { + switch (record->type) { case PSTORE_TYPE_DMESG: scnprintf(name, sizeof(name), "dmesg-%s-%lld%s", - psname, id, compressed ? ".enc.z" : ""); + record->psi->name, record->id, + record->compressed ? ".enc.z" : ""); break; case PSTORE_TYPE_CONSOLE: - scnprintf(name, sizeof(name), "console-%s-%lld", psname, id); + scnprintf(name, sizeof(name), "console-%s-%lld", + record->psi->name, record->id); break; case PSTORE_TYPE_FTRACE: - scnprintf(name, sizeof(name), "ftrace-%s-%lld", psname, id); + scnprintf(name, sizeof(name), "ftrace-%s-%lld", + record->psi->name, record->id); break; case PSTORE_TYPE_MCE: - scnprintf(name, sizeof(name), "mce-%s-%lld", psname, id); + scnprintf(name, sizeof(name), "mce-%s-%lld", + record->psi->name, record->id); break; case PSTORE_TYPE_PPC_RTAS: - scnprintf(name, sizeof(name), "rtas-%s-%lld", psname, id); + scnprintf(name, sizeof(name), "rtas-%s-%lld", + record->psi->name, record->id); break; case PSTORE_TYPE_PPC_OF: scnprintf(name, sizeof(name), "powerpc-ofw-%s-%lld", - psname, id); + record->psi->name, record->id); break; case PSTORE_TYPE_PPC_COMMON: scnprintf(name, sizeof(name), "powerpc-common-%s-%lld", - psname, id); + record->psi->name, record->id); break; case PSTORE_TYPE_PMSG: - scnprintf(name, sizeof(name), "pmsg-%s-%lld", psname, id); + scnprintf(name, sizeof(name), "pmsg-%s-%lld", + record->psi->name, record->id); break; case PSTORE_TYPE_PPC_OPAL: - sprintf(name, "powerpc-opal-%s-%lld", psname, id); + scnprintf(name, sizeof(name), "powerpc-opal-%s-%lld", + record->psi->name, record->id); break; case PSTORE_TYPE_UNKNOWN: - scnprintf(name, sizeof(name), "unknown-%s-%lld", psname, id); + scnprintf(name, sizeof(name), "unknown-%s-%lld", + record->psi->name, record->id); break; default: scnprintf(name, sizeof(name), "type%d-%s-%lld", - type, psname, id); + record->type, record->psi->name, record->id); break; } - inode_lock(d_inode(root)); - dentry = d_alloc_name(root, name); if (!dentry) - goto fail_lockedalloc; + goto fail_private; - memcpy(private->data, data, size); - inode->i_size = private->size = size; + inode->i_size = private->total_size = size; inode->i_private = private; - if (time.tv_sec) - inode->i_mtime = inode->i_ctime = time; + if (record->time.tv_sec) + inode->i_mtime = inode->i_ctime = record->time; d_add(dentry, inode); @@ -401,13 +412,10 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count, list_add(&private->list, &allpstore); spin_unlock_irqrestore(&allpstore_lock, flags); - inode_unlock(d_inode(root)); - return 0; -fail_lockedalloc: - inode_unlock(d_inode(root)); - kfree(private); +fail_private: + free_pstore_private(private); fail_alloc: iput(inode); @@ -415,6 +423,27 @@ fail: return rc; } +/* + * Read all the records from the persistent store. Create + * files in our filesystem. Don't warn about -EEXIST errors + * when we are re-scanning the backing store looking to add new + * error records. + */ +void pstore_get_records(int quiet) +{ + struct pstore_info *psi = psinfo; + struct dentry *root; + + if (!psi || !pstore_sb) + return; + + root = pstore_sb->s_root; + + inode_lock(d_inode(root)); + pstore_get_backend_records(psi, root, quiet); + inode_unlock(d_inode(root)); +} + static int pstore_fill_super(struct super_block *sb, void *data, int silent) { struct inode *inode; diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h index da416e6591c9..c416e653dc4f 100644 --- a/fs/pstore/internal.h +++ b/fs/pstore/internal.h @@ -25,10 +25,10 @@ extern struct pstore_info *psinfo; extern void pstore_set_kmsg_bytes(int); extern void pstore_get_records(int); -extern int pstore_mkfile(enum pstore_type_id, char *psname, u64 id, - int count, char *data, bool compressed, - size_t size, struct timespec time, - struct pstore_info *psi); +extern void pstore_get_backend_records(struct pstore_info *psi, + struct dentry *root, int quiet); +extern int pstore_mkfile(struct dentry *root, + struct pstore_record *record); extern bool pstore_is_mounted(void); #endif diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index efab7b64925b..d468eec9b8a6 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -267,7 +267,7 @@ static void free_zlib(void) big_oops_buf_sz = 0; } -static struct pstore_zbackend backend_zlib = { +static const struct pstore_zbackend backend_zlib = { .compress = compress_zlib, .decompress = decompress_zlib, .allocate = allocate_zlib, @@ -328,7 +328,7 @@ static void free_lzo(void) big_oops_buf_sz = 0; } -static struct pstore_zbackend backend_lzo = { +static const struct pstore_zbackend backend_lzo = { .compress = compress_lzo, .decompress = decompress_lzo, .allocate = allocate_lzo, @@ -393,7 +393,7 @@ static void free_lz4(void) big_oops_buf_sz = 0; } -static struct pstore_zbackend backend_lz4 = { +static const struct pstore_zbackend backend_lz4 = { .compress = compress_lz4, .decompress = decompress_lz4, .allocate = allocate_lz4, @@ -402,7 +402,7 @@ static struct pstore_zbackend backend_lz4 = { }; #endif -static struct pstore_zbackend *zbackend = +static const struct pstore_zbackend *zbackend = #if defined(CONFIG_PSTORE_ZLIB_COMPRESS) &backend_zlib; #elif defined(CONFIG_PSTORE_LZO_COMPRESS) @@ -484,7 +484,6 @@ static void pstore_dump(struct kmsg_dumper *dumper, { unsigned long total = 0; const char *why; - u64 id; unsigned int part = 1; unsigned long flags = 0; int is_locked; @@ -506,48 +505,59 @@ static void pstore_dump(struct kmsg_dumper *dumper, oopscount++; while (total < kmsg_bytes) { char *dst; - unsigned long size; - int hsize; + size_t dst_size; + int header_size; int zipped_len = -1; - size_t len; - bool compressed = false; - size_t total_len; + size_t dump_size; + struct pstore_record record = { + .type = PSTORE_TYPE_DMESG, + .count = oopscount, + .reason = reason, + .part = part, + .compressed = false, + .buf = psinfo->buf, + .psi = psinfo, + }; if (big_oops_buf && is_locked) { dst = big_oops_buf; - size = big_oops_buf_sz; + dst_size = big_oops_buf_sz; } else { dst = psinfo->buf; - size = psinfo->bufsize; + dst_size = psinfo->bufsize; } - hsize = sprintf(dst, "%s#%d Part%u\n", why, oopscount, part); - size -= hsize; + /* Write dump header. */ + header_size = snprintf(dst, dst_size, "%s#%d Part%u\n", why, + oopscount, part); + dst_size -= header_size; - if (!kmsg_dump_get_buffer(dumper, true, dst + hsize, - size, &len)) + /* Write dump contents. */ + if (!kmsg_dump_get_buffer(dumper, true, dst + header_size, + dst_size, &dump_size)) break; if (big_oops_buf && is_locked) { zipped_len = pstore_compress(dst, psinfo->buf, - hsize + len, psinfo->bufsize); + header_size + dump_size, + psinfo->bufsize); if (zipped_len > 0) { - compressed = true; - total_len = zipped_len; + record.compressed = true; + record.size = zipped_len; } else { - total_len = copy_kmsg_to_buffer(hsize, len); + record.size = copy_kmsg_to_buffer(header_size, + dump_size); } } else { - total_len = hsize + len; + record.size = header_size + dump_size; } - ret = psinfo->write(PSTORE_TYPE_DMESG, reason, &id, part, - oopscount, compressed, total_len, psinfo); + ret = psinfo->write(&record); if (ret == 0 && reason == KMSG_DUMP_OOPS && pstore_is_mounted()) pstore_new_entry = 1; - total += total_len; + total += record.size; part++; } if (is_locked) @@ -577,8 +587,11 @@ static void pstore_console_write(struct console *con, const char *s, unsigned c) const char *e = s + c; while (s < e) { + struct pstore_record record = { + .type = PSTORE_TYPE_CONSOLE, + .psi = psinfo, + }; unsigned long flags; - u64 id; if (c > psinfo->bufsize) c = psinfo->bufsize; @@ -589,8 +602,9 @@ static void pstore_console_write(struct console *con, const char *s, unsigned c) } else { spin_lock_irqsave(&psinfo->buf_lock, flags); } - psinfo->write_buf(PSTORE_TYPE_CONSOLE, 0, &id, 0, - s, 0, c, psinfo); + record.buf = (char *)s; + record.size = c; + psinfo->write(&record); spin_unlock_irqrestore(&psinfo->buf_lock, flags); s += c; c = e - s; @@ -618,48 +632,30 @@ static void pstore_register_console(void) {} static void pstore_unregister_console(void) {} #endif -static int pstore_write_compat(enum pstore_type_id type, - enum kmsg_dump_reason reason, - u64 *id, unsigned int part, int count, - bool compressed, size_t size, - struct pstore_info *psi) -{ - return psi->write_buf(type, reason, id, part, psinfo->buf, compressed, - size, psi); -} - -static int pstore_write_buf_user_compat(enum pstore_type_id type, - enum kmsg_dump_reason reason, - u64 *id, unsigned int part, - const char __user *buf, - bool compressed, size_t size, - struct pstore_info *psi) -{ - unsigned long flags = 0; - size_t i, bufsize = size; - long ret = 0; - - if (unlikely(!access_ok(VERIFY_READ, buf, size))) - return -EFAULT; - if (bufsize > psinfo->bufsize) - bufsize = psinfo->bufsize; - spin_lock_irqsave(&psinfo->buf_lock, flags); - for (i = 0; i < size; ) { - size_t c = min(size - i, bufsize); - - ret = __copy_from_user(psinfo->buf, buf + i, c); - if (unlikely(ret != 0)) { - ret = -EFAULT; - break; - } - ret = psi->write_buf(type, reason, id, part, psinfo->buf, - compressed, c, psi); - if (unlikely(ret < 0)) - break; - i += c; +static int pstore_write_user_compat(struct pstore_record *record, + const char __user *buf) +{ + int ret = 0; + + if (record->buf) + return -EINVAL; + + record->buf = kmalloc(record->size, GFP_KERNEL); + if (!record->buf) + return -ENOMEM; + + if (unlikely(copy_from_user(record->buf, buf, record->size))) { + ret = -EFAULT; + goto out; } - spin_unlock_irqrestore(&psinfo->buf_lock, flags); - return unlikely(ret < 0) ? ret : size; + + ret = record->psi->write(record); + +out: + kfree(record->buf); + record->buf = NULL; + + return unlikely(ret < 0) ? ret : record->size; } /* @@ -673,19 +669,35 @@ int pstore_register(struct pstore_info *psi) { struct module *owner = psi->owner; - if (backend && strcmp(backend, psi->name)) + if (backend && strcmp(backend, psi->name)) { + pr_warn("ignoring unexpected backend '%s'\n", psi->name); return -EPERM; + } + + /* Sanity check flags. */ + if (!psi->flags) { + pr_warn("backend '%s' must support at least one frontend\n", + psi->name); + return -EINVAL; + } + + /* Check for required functions. */ + if (!psi->read || !psi->write) { + pr_warn("backend '%s' must implement read() and write()\n", + psi->name); + return -EINVAL; + } spin_lock(&pstore_lock); if (psinfo) { + pr_warn("backend '%s' already loaded: ignoring '%s'\n", + psinfo->name, psi->name); spin_unlock(&pstore_lock); return -EBUSY; } - if (!psi->write) - psi->write = pstore_write_compat; - if (!psi->write_buf_user) - psi->write_buf_user = pstore_write_buf_user_compat; + if (!psi->write_user) + psi->write_user = pstore_write_user_compat; psinfo = psi; mutex_init(&psinfo->read_mutex); spin_unlock(&pstore_lock); @@ -709,6 +721,7 @@ int pstore_register(struct pstore_info *psi) if (psi->flags & PSTORE_FLAGS_PMSG) pstore_register_pmsg(); + /* Start watching for new records, if desired. */ if (pstore_update_ms >= 0) { pstore_timer.expires = jiffies + msecs_to_jiffies(pstore_update_ms); @@ -721,16 +734,21 @@ int pstore_register(struct pstore_info *psi) */ backend = psi->name; - module_put(owner); - pr_info("Registered %s as persistent store backend\n", psi->name); + module_put(owner); + return 0; } EXPORT_SYMBOL_GPL(pstore_register); void pstore_unregister(struct pstore_info *psi) { + /* Stop timer and make sure all work has finished. */ + pstore_update_ms = -1; + del_timer_sync(&pstore_timer); + flush_work(&pstore_work); + if (psi->flags & PSTORE_FLAGS_PMSG) pstore_unregister_pmsg(); if (psi->flags & PSTORE_FLAGS_FTRACE) @@ -747,66 +765,99 @@ void pstore_unregister(struct pstore_info *psi) } EXPORT_SYMBOL_GPL(pstore_unregister); +static void decompress_record(struct pstore_record *record) +{ + int unzipped_len; + char *decompressed; + + /* Only PSTORE_TYPE_DMESG support compression. */ + if (!record->compressed || record->type != PSTORE_TYPE_DMESG) { + pr_warn("ignored compressed record type %d\n", record->type); + return; + } + + /* No compression method has created the common buffer. */ + if (!big_oops_buf) { + pr_warn("no decompression buffer allocated\n"); + return; + } + + unzipped_len = pstore_decompress(record->buf, big_oops_buf, + record->size, big_oops_buf_sz); + if (unzipped_len <= 0) { + pr_err("decompression failed: %d\n", unzipped_len); + return; + } + + /* Build new buffer for decompressed contents. */ + decompressed = kmalloc(unzipped_len + record->ecc_notice_size, + GFP_KERNEL); + if (!decompressed) { + pr_err("decompression ran out of memory\n"); + return; + } + memcpy(decompressed, big_oops_buf, unzipped_len); + + /* Append ECC notice to decompressed buffer. */ + memcpy(decompressed + unzipped_len, record->buf + record->size, + record->ecc_notice_size); + + /* Swap out compresed contents with decompressed contents. */ + kfree(record->buf); + record->buf = decompressed; + record->size = unzipped_len; + record->compressed = false; +} + /* - * Read all the records from the persistent store. Create + * Read all the records from one persistent store backend. Create * files in our filesystem. Don't warn about -EEXIST errors * when we are re-scanning the backing store looking to add new * error records. */ -void pstore_get_records(int quiet) -{ - struct pstore_info *psi = psinfo; - char *buf = NULL; - ssize_t size; - u64 id; - int count; - enum pstore_type_id type; - struct timespec time; - int failed = 0, rc; - bool compressed; - int unzipped_len = -1; - ssize_t ecc_notice_size = 0; - - if (!psi) +void pstore_get_backend_records(struct pstore_info *psi, + struct dentry *root, int quiet) +{ + int failed = 0; + + if (!psi || !root) return; mutex_lock(&psi->read_mutex); if (psi->open && psi->open(psi)) goto out; - while ((size = psi->read(&id, &type, &count, &time, &buf, &compressed, - &ecc_notice_size, psi)) > 0) { - if (compressed && (type == PSTORE_TYPE_DMESG)) { - if (big_oops_buf) - unzipped_len = pstore_decompress(buf, - big_oops_buf, size, - big_oops_buf_sz); - - if (unzipped_len > 0) { - if (ecc_notice_size) - memcpy(big_oops_buf + unzipped_len, - buf + size, ecc_notice_size); - kfree(buf); - buf = big_oops_buf; - size = unzipped_len; - compressed = false; - } else { - pr_err("decompression failed;returned %d\n", - unzipped_len); - compressed = true; - } + /* + * Backend callback read() allocates record.buf. decompress_record() + * may reallocate record.buf. On success, pstore_mkfile() will keep + * the record.buf, so free it only on failure. + */ + for (;;) { + struct pstore_record *record; + int rc; + + record = kzalloc(sizeof(*record), GFP_KERNEL); + if (!record) { + pr_err("out of memory creating record\n"); + break; + } + record->psi = psi; + + record->size = psi->read(record); + + /* No more records left in backend? */ + if (record->size <= 0) + break; + + decompress_record(record); + rc = pstore_mkfile(root, record); + if (rc) { + /* pstore_mkfile() did not take record, so free it. */ + kfree(record->buf); + kfree(record); + if (rc != -EEXIST || !quiet) + failed++; } - rc = pstore_mkfile(type, psi->name, id, count, buf, - compressed, size + ecc_notice_size, - time, psi); - if (unzipped_len < 0) { - /* Free buffer other than big oops */ - kfree(buf); - buf = NULL; - } else - unzipped_len = -1; - if (rc && (rc != -EEXIST || !quiet)) - failed++; } if (psi->close) psi->close(psi); @@ -830,7 +881,9 @@ static void pstore_timefunc(unsigned long dummy) schedule_work(&pstore_work); } - mod_timer(&pstore_timer, jiffies + msecs_to_jiffies(pstore_update_ms)); + if (pstore_update_ms >= 0) + mod_timer(&pstore_timer, + jiffies + msecs_to_jiffies(pstore_update_ms)); } module_param(backend, charp, 0444); diff --git a/fs/pstore/pmsg.c b/fs/pstore/pmsg.c index 78f6176c020f..209755e0d7c8 100644 --- a/fs/pstore/pmsg.c +++ b/fs/pstore/pmsg.c @@ -15,7 +15,6 @@ #include <linux/device.h> #include <linux/fs.h> #include <linux/uaccess.h> -#include <linux/vmalloc.h> #include "internal.h" static DEFINE_MUTEX(pmsg_lock); @@ -23,19 +22,22 @@ static DEFINE_MUTEX(pmsg_lock); static ssize_t write_pmsg(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - u64 id; + struct pstore_record record = { + .type = PSTORE_TYPE_PMSG, + .size = count, + .psi = psinfo, + }; int ret; if (!count) return 0; - /* check outside lock, page in any data. write_buf_user also checks */ + /* check outside lock, page in any data. write_user also checks */ if (!access_ok(VERIFY_READ, buf, count)) return -EFAULT; mutex_lock(&pmsg_lock); - ret = psinfo->write_buf_user(PSTORE_TYPE_PMSG, 0, &id, 0, buf, 0, count, - psinfo); + ret = psinfo->write_user(&record, buf); mutex_unlock(&pmsg_lock); return ret ? ret : count; } diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 11f918d34b1e..5523df7f17ef 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -235,35 +235,34 @@ static ssize_t ftrace_log_combine(struct persistent_ram_zone *dest, return 0; } -static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type, - int *count, struct timespec *time, - char **buf, bool *compressed, - ssize_t *ecc_notice_size, - struct pstore_info *psi) +static ssize_t ramoops_pstore_read(struct pstore_record *record) { ssize_t size = 0; - struct ramoops_context *cxt = psi->data; + struct ramoops_context *cxt = record->psi->data; struct persistent_ram_zone *prz = NULL; int header_length = 0; bool free_prz = false; - /* Ramoops headers provide time stamps for PSTORE_TYPE_DMESG, but + /* + * Ramoops headers provide time stamps for PSTORE_TYPE_DMESG, but * PSTORE_TYPE_CONSOLE and PSTORE_TYPE_FTRACE don't currently have * valid time stamps, so it is initialized to zero. */ - time->tv_sec = 0; - time->tv_nsec = 0; - *compressed = false; + record->time.tv_sec = 0; + record->time.tv_nsec = 0; + record->compressed = false; /* Find the next valid persistent_ram_zone for DMESG */ while (cxt->dump_read_cnt < cxt->max_dump_cnt && !prz) { prz = ramoops_get_next_prz(cxt->dprzs, &cxt->dump_read_cnt, - cxt->max_dump_cnt, id, type, + cxt->max_dump_cnt, &record->id, + &record->type, PSTORE_TYPE_DMESG, 1); if (!prz_ok(prz)) continue; header_length = ramoops_read_kmsg_hdr(persistent_ram_old(prz), - time, compressed); + &record->time, + &record->compressed); /* Clear and skip this DMESG record if it has no valid header */ if (!header_length) { persistent_ram_free_old(prz); @@ -274,18 +273,20 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type, if (!prz_ok(prz)) prz = ramoops_get_next_prz(&cxt->cprz, &cxt->console_read_cnt, - 1, id, type, PSTORE_TYPE_CONSOLE, 0); + 1, &record->id, &record->type, + PSTORE_TYPE_CONSOLE, 0); if (!prz_ok(prz)) prz = ramoops_get_next_prz(&cxt->mprz, &cxt->pmsg_read_cnt, - 1, id, type, PSTORE_TYPE_PMSG, 0); + 1, &record->id, &record->type, + PSTORE_TYPE_PMSG, 0); /* ftrace is last since it may want to dynamically allocate memory. */ if (!prz_ok(prz)) { if (!(cxt->flags & RAMOOPS_FLAG_FTRACE_PER_CPU)) { prz = ramoops_get_next_prz(cxt->fprzs, - &cxt->ftrace_read_cnt, 1, id, type, - PSTORE_TYPE_FTRACE, 0); + &cxt->ftrace_read_cnt, 1, &record->id, + &record->type, PSTORE_TYPE_FTRACE, 0); } else { /* * Build a new dummy record which combines all the @@ -302,8 +303,10 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type, while (cxt->ftrace_read_cnt < cxt->max_ftrace_cnt) { prz_next = ramoops_get_next_prz(cxt->fprzs, &cxt->ftrace_read_cnt, - cxt->max_ftrace_cnt, id, - type, PSTORE_TYPE_FTRACE, 0); + cxt->max_ftrace_cnt, + &record->id, + &record->type, + PSTORE_TYPE_FTRACE, 0); if (!prz_ok(prz_next)) continue; @@ -316,7 +319,7 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type, if (size) goto out; } - *id = 0; + record->id = 0; prz = tmp_prz; } } @@ -329,17 +332,19 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type, size = persistent_ram_old_size(prz) - header_length; /* ECC correction notice */ - *ecc_notice_size = persistent_ram_ecc_string(prz, NULL, 0); + record->ecc_notice_size = persistent_ram_ecc_string(prz, NULL, 0); - *buf = kmalloc(size + *ecc_notice_size + 1, GFP_KERNEL); - if (*buf == NULL) { + record->buf = kmalloc(size + record->ecc_notice_size + 1, GFP_KERNEL); + if (record->buf == NULL) { size = -ENOMEM; goto out; } - memcpy(*buf, (char *)persistent_ram_old(prz) + header_length, size); + memcpy(record->buf, (char *)persistent_ram_old(prz) + header_length, + size); - persistent_ram_ecc_string(prz, *buf + size, *ecc_notice_size + 1); + persistent_ram_ecc_string(prz, record->buf + size, + record->ecc_notice_size + 1); out: if (free_prz) { @@ -373,23 +378,18 @@ static size_t ramoops_write_kmsg_hdr(struct persistent_ram_zone *prz, return len; } -static int notrace ramoops_pstore_write_buf(enum pstore_type_id type, - enum kmsg_dump_reason reason, - u64 *id, unsigned int part, - const char *buf, - bool compressed, size_t size, - struct pstore_info *psi) +static int notrace ramoops_pstore_write(struct pstore_record *record) { - struct ramoops_context *cxt = psi->data; + struct ramoops_context *cxt = record->psi->data; struct persistent_ram_zone *prz; - size_t hlen; + size_t size, hlen; - if (type == PSTORE_TYPE_CONSOLE) { + if (record->type == PSTORE_TYPE_CONSOLE) { if (!cxt->cprz) return -ENOMEM; - persistent_ram_write(cxt->cprz, buf, size); + persistent_ram_write(cxt->cprz, record->buf, record->size); return 0; - } else if (type == PSTORE_TYPE_FTRACE) { + } else if (record->type == PSTORE_TYPE_FTRACE) { int zonenum; if (!cxt->fprzs) @@ -402,33 +402,36 @@ static int notrace ramoops_pstore_write_buf(enum pstore_type_id type, else zonenum = 0; - persistent_ram_write(cxt->fprzs[zonenum], buf, size); + persistent_ram_write(cxt->fprzs[zonenum], record->buf, + record->size); return 0; - } else if (type == PSTORE_TYPE_PMSG) { + } else if (record->type == PSTORE_TYPE_PMSG) { pr_warn_ratelimited("PMSG shouldn't call %s\n", __func__); return -EINVAL; } - if (type != PSTORE_TYPE_DMESG) + if (record->type != PSTORE_TYPE_DMESG) return -EINVAL; - /* Out of the various dmesg dump types, ramoops is currently designed + /* + * Out of the various dmesg dump types, ramoops is currently designed * to only store crash logs, rather than storing general kernel logs. */ - if (reason != KMSG_DUMP_OOPS && - reason != KMSG_DUMP_PANIC) + if (record->reason != KMSG_DUMP_OOPS && + record->reason != KMSG_DUMP_PANIC) return -EINVAL; /* Skip Oopes when configured to do so. */ - if (reason == KMSG_DUMP_OOPS && !cxt->dump_oops) + if (record->reason == KMSG_DUMP_OOPS && !cxt->dump_oops) return -EINVAL; - /* Explicitly only take the first part of any new crash. + /* + * Explicitly only take the first part of any new crash. * If our buffer is larger than kmsg_bytes, this can never happen, * and if our buffer is smaller than kmsg_bytes, we don't want the * report split across multiple records. */ - if (part != 1) + if (record->part != 1) return -ENOSPC; if (!cxt->dprzs) @@ -436,53 +439,50 @@ static int notrace ramoops_pstore_write_buf(enum pstore_type_id type, prz = cxt->dprzs[cxt->dump_write_cnt]; - hlen = ramoops_write_kmsg_hdr(prz, compressed); + /* Build header and append record contents. */ + hlen = ramoops_write_kmsg_hdr(prz, record->compressed); + size = record->size; if (size + hlen > prz->buffer_size) size = prz->buffer_size - hlen; - persistent_ram_write(prz, buf, size); + persistent_ram_write(prz, record->buf, size); cxt->dump_write_cnt = (cxt->dump_write_cnt + 1) % cxt->max_dump_cnt; return 0; } -static int notrace ramoops_pstore_write_buf_user(enum pstore_type_id type, - enum kmsg_dump_reason reason, - u64 *id, unsigned int part, - const char __user *buf, - bool compressed, size_t size, - struct pstore_info *psi) +static int notrace ramoops_pstore_write_user(struct pstore_record *record, + const char __user *buf) { - if (type == PSTORE_TYPE_PMSG) { - struct ramoops_context *cxt = psi->data; + if (record->type == PSTORE_TYPE_PMSG) { + struct ramoops_context *cxt = record->psi->data; if (!cxt->mprz) return -ENOMEM; - return persistent_ram_write_user(cxt->mprz, buf, size); + return persistent_ram_write_user(cxt->mprz, buf, record->size); } return -EINVAL; } -static int ramoops_pstore_erase(enum pstore_type_id type, u64 id, int count, - struct timespec time, struct pstore_info *psi) +static int ramoops_pstore_erase(struct pstore_record *record) { - struct ramoops_context *cxt = psi->data; + struct ramoops_context *cxt = record->psi->data; struct persistent_ram_zone *prz; - switch (type) { + switch (record->type) { case PSTORE_TYPE_DMESG: - if (id >= cxt->max_dump_cnt) + if (record->id >= cxt->max_dump_cnt) return -EINVAL; - prz = cxt->dprzs[id]; + prz = cxt->dprzs[record->id]; break; case PSTORE_TYPE_CONSOLE: prz = cxt->cprz; break; case PSTORE_TYPE_FTRACE: - if (id >= cxt->max_ftrace_cnt) + if (record->id >= cxt->max_ftrace_cnt) return -EINVAL; - prz = cxt->fprzs[id]; + prz = cxt->fprzs[record->id]; break; case PSTORE_TYPE_PMSG: prz = cxt->mprz; @@ -503,8 +503,8 @@ static struct ramoops_context oops_cxt = { .name = "ramoops", .open = ramoops_pstore_open, .read = ramoops_pstore_read, - .write_buf = ramoops_pstore_write_buf, - .write_buf_user = ramoops_pstore_write_buf_user, + .write = ramoops_pstore_write, + .write_user = ramoops_pstore_write_user, .erase = ramoops_pstore_erase, }, }; diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c index bc927e30bdcc..e11672aa4575 100644 --- a/fs/pstore/ram_core.c +++ b/fs/pstore/ram_core.c @@ -532,7 +532,7 @@ struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size, } /* Initialize general buffer state. */ - prz->buffer_lock = __RAW_SPIN_LOCK_UNLOCKED(buffer_lock); + raw_spin_lock_init(&prz->buffer_lock); prz->flags = flags; ret = persistent_ram_buffer_map(start, size, prz, memtype); diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 74b489e3714d..ebf80c7739e1 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -2188,8 +2188,7 @@ int dquot_disable(struct super_block *sb, int type, unsigned int flags) /* This can happen when suspending quotas on remount-ro... */ if (toputinode[cnt] && !sb_has_quota_loaded(sb, cnt)) { inode_lock(toputinode[cnt]); - toputinode[cnt]->i_flags &= ~(S_IMMUTABLE | - S_NOATIME | S_NOQUOTA); + toputinode[cnt]->i_flags &= ~S_NOQUOTA; truncate_inode_pages(&toputinode[cnt]->i_data, 0); inode_unlock(toputinode[cnt]); mark_inode_dirty_sync(toputinode[cnt]); @@ -2237,7 +2236,6 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id, struct super_block *sb = inode->i_sb; struct quota_info *dqopt = sb_dqopt(sb); int error; - int oldflags = -1; if (!fmt) return -ESRCH; @@ -2285,9 +2283,7 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id, * possible) Also nobody should write to the file - we use * special IO operations which ignore the immutable bit. */ inode_lock(inode); - oldflags = inode->i_flags & (S_NOATIME | S_IMMUTABLE | - S_NOQUOTA); - inode->i_flags |= S_NOQUOTA | S_NOATIME | S_IMMUTABLE; + inode->i_flags |= S_NOQUOTA; inode_unlock(inode); /* * When S_NOQUOTA is set, remove dquot references as no more @@ -2329,14 +2325,9 @@ out_file_init: dqopt->files[type] = NULL; iput(inode); out_file_flags: - if (oldflags != -1) { - inode_lock(inode); - /* Set the flags back (in the case of accidental quotaon() - * on a wrong file we don't want to mess up the flags) */ - inode->i_flags &= ~(S_NOATIME | S_NOQUOTA | S_IMMUTABLE); - inode->i_flags |= oldflags; - inode_unlock(inode); - } + inode_lock(inode); + inode->i_flags &= ~S_NOQUOTA; + inode_unlock(inode); out_fmt: put_quota_format(fmt); @@ -2780,18 +2771,6 @@ int dquot_set_dqinfo(struct super_block *sb, int type, struct qc_info *ii) } EXPORT_SYMBOL(dquot_set_dqinfo); -const struct quotactl_ops dquot_quotactl_ops = { - .quota_on = dquot_quota_on, - .quota_off = dquot_quota_off, - .quota_sync = dquot_quota_sync, - .get_state = dquot_get_state, - .set_info = dquot_set_dqinfo, - .get_dqblk = dquot_get_dqblk, - .get_nextdqblk = dquot_get_next_dqblk, - .set_dqblk = dquot_set_dqblk -}; -EXPORT_SYMBOL(dquot_quotactl_ops); - const struct quotactl_ops dquot_quotactl_sysfile_ops = { .quota_enable = dquot_quota_enable, .quota_disable = dquot_quota_disable, diff --git a/fs/read_write.c b/fs/read_write.c index c4f88afbc67f..47c1d4484df9 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -841,6 +841,81 @@ out: return ret; } +#ifdef CONFIG_COMPAT +ssize_t compat_rw_copy_check_uvector(int type, + const struct compat_iovec __user *uvector, unsigned long nr_segs, + unsigned long fast_segs, struct iovec *fast_pointer, + struct iovec **ret_pointer) +{ + compat_ssize_t tot_len; + struct iovec *iov = *ret_pointer = fast_pointer; + ssize_t ret = 0; + int seg; + + /* + * SuS says "The readv() function *may* fail if the iovcnt argument + * was less than or equal to 0, or greater than {IOV_MAX}. Linux has + * traditionally returned zero for zero segments, so... + */ + if (nr_segs == 0) + goto out; + + ret = -EINVAL; + if (nr_segs > UIO_MAXIOV) + goto out; + if (nr_segs > fast_segs) { + ret = -ENOMEM; + iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL); + if (iov == NULL) + goto out; + } + *ret_pointer = iov; + + ret = -EFAULT; + if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector))) + goto out; + + /* + * Single unix specification: + * We should -EINVAL if an element length is not >= 0 and fitting an + * ssize_t. + * + * In Linux, the total length is limited to MAX_RW_COUNT, there is + * no overflow possibility. + */ + tot_len = 0; + ret = -EINVAL; + for (seg = 0; seg < nr_segs; seg++) { + compat_uptr_t buf; + compat_ssize_t len; + + if (__get_user(len, &uvector->iov_len) || + __get_user(buf, &uvector->iov_base)) { + ret = -EFAULT; + goto out; + } + if (len < 0) /* size_t not fitting in compat_ssize_t .. */ + goto out; + if (type >= 0 && + !access_ok(vrfy_dir(type), compat_ptr(buf), len)) { + ret = -EFAULT; + goto out; + } + if (len > MAX_RW_COUNT - tot_len) + len = MAX_RW_COUNT - tot_len; + tot_len += len; + iov->iov_base = compat_ptr(buf); + iov->iov_len = (compat_size_t) len; + uvector++; + iov++; + } + ret = tot_len; + +out: + return ret; +} +#endif + static ssize_t __do_readv_writev(int type, struct file *file, struct iov_iter *iter, loff_t *pos, int flags) { diff --git a/fs/readdir.c b/fs/readdir.c index 0e8a7f355f7a..89659549c09d 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -18,6 +18,7 @@ #include <linux/security.h> #include <linux/syscalls.h> #include <linux/unistd.h> +#include <linux/compat.h> #include <linux/uaccess.h> @@ -324,3 +325,167 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd, fdput_pos(f); return error; } + +#ifdef CONFIG_COMPAT +struct compat_old_linux_dirent { + compat_ulong_t d_ino; + compat_ulong_t d_offset; + unsigned short d_namlen; + char d_name[1]; +}; + +struct compat_readdir_callback { + struct dir_context ctx; + struct compat_old_linux_dirent __user *dirent; + int result; +}; + +static int compat_fillonedir(struct dir_context *ctx, const char *name, + int namlen, loff_t offset, u64 ino, + unsigned int d_type) +{ + struct compat_readdir_callback *buf = + container_of(ctx, struct compat_readdir_callback, ctx); + struct compat_old_linux_dirent __user *dirent; + compat_ulong_t d_ino; + + if (buf->result) + return -EINVAL; + d_ino = ino; + if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) { + buf->result = -EOVERFLOW; + return -EOVERFLOW; + } + buf->result++; + dirent = buf->dirent; + if (!access_ok(VERIFY_WRITE, dirent, + (unsigned long)(dirent->d_name + namlen + 1) - + (unsigned long)dirent)) + goto efault; + if ( __put_user(d_ino, &dirent->d_ino) || + __put_user(offset, &dirent->d_offset) || + __put_user(namlen, &dirent->d_namlen) || + __copy_to_user(dirent->d_name, name, namlen) || + __put_user(0, dirent->d_name + namlen)) + goto efault; + return 0; +efault: + buf->result = -EFAULT; + return -EFAULT; +} + +COMPAT_SYSCALL_DEFINE3(old_readdir, unsigned int, fd, + struct compat_old_linux_dirent __user *, dirent, unsigned int, count) +{ + int error; + struct fd f = fdget_pos(fd); + struct compat_readdir_callback buf = { + .ctx.actor = compat_fillonedir, + .dirent = dirent + }; + + if (!f.file) + return -EBADF; + + error = iterate_dir(f.file, &buf.ctx); + if (buf.result) + error = buf.result; + + fdput_pos(f); + return error; +} + +struct compat_linux_dirent { + compat_ulong_t d_ino; + compat_ulong_t d_off; + unsigned short d_reclen; + char d_name[1]; +}; + +struct compat_getdents_callback { + struct dir_context ctx; + struct compat_linux_dirent __user *current_dir; + struct compat_linux_dirent __user *previous; + int count; + int error; +}; + +static int compat_filldir(struct dir_context *ctx, const char *name, int namlen, + loff_t offset, u64 ino, unsigned int d_type) +{ + struct compat_linux_dirent __user * dirent; + struct compat_getdents_callback *buf = + container_of(ctx, struct compat_getdents_callback, ctx); + compat_ulong_t d_ino; + int reclen = ALIGN(offsetof(struct compat_linux_dirent, d_name) + + namlen + 2, sizeof(compat_long_t)); + + buf->error = -EINVAL; /* only used if we fail.. */ + if (reclen > buf->count) + return -EINVAL; + d_ino = ino; + if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) { + buf->error = -EOVERFLOW; + return -EOVERFLOW; + } + dirent = buf->previous; + if (dirent) { + if (signal_pending(current)) + return -EINTR; + if (__put_user(offset, &dirent->d_off)) + goto efault; + } + dirent = buf->current_dir; + if (__put_user(d_ino, &dirent->d_ino)) + goto efault; + if (__put_user(reclen, &dirent->d_reclen)) + goto efault; + if (copy_to_user(dirent->d_name, name, namlen)) + goto efault; + if (__put_user(0, dirent->d_name + namlen)) + goto efault; + if (__put_user(d_type, (char __user *) dirent + reclen - 1)) + goto efault; + buf->previous = dirent; + dirent = (void __user *)dirent + reclen; + buf->current_dir = dirent; + buf->count -= reclen; + return 0; +efault: + buf->error = -EFAULT; + return -EFAULT; +} + +COMPAT_SYSCALL_DEFINE3(getdents, unsigned int, fd, + struct compat_linux_dirent __user *, dirent, unsigned int, count) +{ + struct fd f; + struct compat_linux_dirent __user * lastdirent; + struct compat_getdents_callback buf = { + .ctx.actor = compat_filldir, + .current_dir = dirent, + .count = count + }; + int error; + + if (!access_ok(VERIFY_WRITE, dirent, count)) + return -EFAULT; + + f = fdget_pos(fd); + if (!f.file) + return -EBADF; + + error = iterate_dir(f.file, &buf.ctx); + if (error >= 0) + error = buf.error; + lastdirent = buf.previous; + if (lastdirent) { + if (put_user(buf.ctx.pos, &lastdirent->d_off)) + error = -EFAULT; + else + error = count - buf.count; + } + fdput_pos(f); + return error; +} +#endif diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index a6ab9d64ea1b..873fc04e9403 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -1375,7 +1375,6 @@ static void init_inode(struct inode *inode, struct treepath *path) static void inode2sd(void *sd, struct inode *inode, loff_t size) { struct stat_data *sd_v2 = (struct stat_data *)sd; - __u16 flags; set_sd_v2_mode(sd_v2, inode->i_mode); set_sd_v2_nlink(sd_v2, inode->i_nlink); @@ -1390,9 +1389,7 @@ static void inode2sd(void *sd, struct inode *inode, loff_t size) set_sd_v2_rdev(sd_v2, new_encode_dev(inode->i_rdev)); else set_sd_v2_generation(sd_v2, inode->i_generation); - flags = REISERFS_I(inode)->i_attrs; - i_attrs_to_sd_attrs(inode, &flags); - set_sd_v2_attrs(sd_v2, flags); + set_sd_v2_attrs(sd_v2, REISERFS_I(inode)->i_attrs); } /* used to copy inode's fields to old stat data */ @@ -2002,10 +1999,6 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, /* uid and gid must already be set by the caller for quota init */ - /* symlink cannot be immutable or append only, right? */ - if (S_ISLNK(inode->i_mode)) - inode->i_flags &= ~(S_IMMUTABLE | S_APPEND); - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); inode->i_size = i_size; inode->i_blocks = 0; @@ -3095,28 +3088,6 @@ void sd_attrs_to_i_attrs(__u16 sd_attrs, struct inode *inode) } } -void i_attrs_to_sd_attrs(struct inode *inode, __u16 * sd_attrs) -{ - if (reiserfs_attrs(inode->i_sb)) { - if (inode->i_flags & S_IMMUTABLE) - *sd_attrs |= REISERFS_IMMUTABLE_FL; - else - *sd_attrs &= ~REISERFS_IMMUTABLE_FL; - if (inode->i_flags & S_SYNC) - *sd_attrs |= REISERFS_SYNC_FL; - else - *sd_attrs &= ~REISERFS_SYNC_FL; - if (inode->i_flags & S_NOATIME) - *sd_attrs |= REISERFS_NOATIME_FL; - else - *sd_attrs &= ~REISERFS_NOATIME_FL; - if (REISERFS_I(inode)->i_flags & i_nopack_mask) - *sd_attrs |= REISERFS_NOTAIL_FL; - else - *sd_attrs &= ~REISERFS_NOTAIL_FL; - } -} - /* * decide if this buffer needs to stay around for data logging or ordered * write purposes diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c index 1f4692a505a0..acbbaf7a0bb2 100644 --- a/fs/reiserfs/ioctl.c +++ b/fs/reiserfs/ioctl.c @@ -47,7 +47,6 @@ long reiserfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) } flags = REISERFS_I(inode)->i_attrs; - i_attrs_to_sd_attrs(inode, (__u16 *) & flags); err = put_user(flags, (int __user *)arg); break; case REISERFS_IOC_SETFLAGS:{ diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index aa40c242f1db..da01f497180a 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -1961,7 +1961,7 @@ static int do_journal_release(struct reiserfs_transaction_handle *th, * will be requeued because superblock is being shutdown and doesn't * have MS_ACTIVE set. */ - cancel_delayed_work_sync(&REISERFS_SB(sb)->old_work); + reiserfs_cancel_old_flush(sb); /* wait for all commits to finish */ cancel_delayed_work_sync(&SB_JOURNAL(sb)->j_work); diff --git a/fs/reiserfs/lbalance.c b/fs/reiserfs/lbalance.c index 249594a821e0..f5cebd70d903 100644 --- a/fs/reiserfs/lbalance.c +++ b/fs/reiserfs/lbalance.c @@ -475,7 +475,7 @@ static void leaf_item_bottle(struct buffer_info *dest_bi, * 'cpy_bytes'; create new item header; * n_ih = new item_header; */ - memcpy(&n_ih, ih, SHORT_KEY_SIZE); + memcpy(&n_ih.ih_key, &ih->ih_key, KEY_SIZE); /* Endian safe, both le */ n_ih.ih_version = ih->ih_version; diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h index 2adcde137c3f..1d34377fef97 100644 --- a/fs/reiserfs/reiserfs.h +++ b/fs/reiserfs/reiserfs.h @@ -1326,7 +1326,6 @@ struct cpu_key { #define KEY_NOT_FOUND 0 #define KEY_SIZE (sizeof(struct reiserfs_key)) -#define SHORT_KEY_SIZE (sizeof (__u32) + sizeof (__u32)) /* return values for search_by_key and clones */ #define ITEM_FOUND 1 @@ -2949,6 +2948,7 @@ int reiserfs_allocate_list_bitmaps(struct super_block *s, struct reiserfs_list_bitmap *, unsigned int); void reiserfs_schedule_old_flush(struct super_block *s); +void reiserfs_cancel_old_flush(struct super_block *s); void add_save_link(struct reiserfs_transaction_handle *th, struct inode *inode, int truncate); int remove_save_link(struct inode *inode, int truncate); @@ -3099,7 +3099,6 @@ static inline void reiserfs_update_sd(struct reiserfs_transaction_handle *th, } void sd_attrs_to_i_attrs(__u16 sd_attrs, struct inode *inode); -void i_attrs_to_sd_attrs(struct inode *inode, __u16 * sd_attrs); int reiserfs_setattr(struct dentry *dentry, struct iattr *attr); int __reiserfs_write_begin(struct page *page, unsigned from, unsigned len); diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index feabcde0290d..685f1e056998 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -89,11 +89,27 @@ static void flush_old_commits(struct work_struct *work) sbi = container_of(work, struct reiserfs_sb_info, old_work.work); s = sbi->s_journal->j_work_sb; + /* + * We need s_umount for protecting quota writeback. We have to use + * trylock as reiserfs_cancel_old_flush() may be waiting for this work + * to complete with s_umount held. + */ + if (!down_read_trylock(&s->s_umount)) { + /* Requeue work if we are not cancelling it */ + spin_lock(&sbi->old_work_lock); + if (sbi->work_queued == 1) + queue_delayed_work(system_long_wq, &sbi->old_work, HZ); + spin_unlock(&sbi->old_work_lock); + return; + } spin_lock(&sbi->old_work_lock); - sbi->work_queued = 0; + /* Avoid clobbering the cancel state... */ + if (sbi->work_queued == 1) + sbi->work_queued = 0; spin_unlock(&sbi->old_work_lock); reiserfs_sync_fs(s, 1); + up_read(&s->s_umount); } void reiserfs_schedule_old_flush(struct super_block *s) @@ -117,21 +133,22 @@ void reiserfs_schedule_old_flush(struct super_block *s) spin_unlock(&sbi->old_work_lock); } -static void cancel_old_flush(struct super_block *s) +void reiserfs_cancel_old_flush(struct super_block *s) { struct reiserfs_sb_info *sbi = REISERFS_SB(s); - cancel_delayed_work_sync(&REISERFS_SB(s)->old_work); spin_lock(&sbi->old_work_lock); - sbi->work_queued = 0; + /* Make sure no new flushes will be queued */ + sbi->work_queued = 2; spin_unlock(&sbi->old_work_lock); + cancel_delayed_work_sync(&REISERFS_SB(s)->old_work); } static int reiserfs_freeze(struct super_block *s) { struct reiserfs_transaction_handle th; - cancel_old_flush(s); + reiserfs_cancel_old_flush(s); reiserfs_write_lock(s); if (!(s->s_flags & MS_RDONLY)) { @@ -152,7 +169,13 @@ static int reiserfs_freeze(struct super_block *s) static int reiserfs_unfreeze(struct super_block *s) { + struct reiserfs_sb_info *sbi = REISERFS_SB(s); + reiserfs_allow_writes(s); + spin_lock(&sbi->old_work_lock); + /* Allow old_work to run again */ + sbi->work_queued = 0; + spin_unlock(&sbi->old_work_lock); return 0; } @@ -547,12 +570,28 @@ static void reiserfs_kill_sb(struct super_block *s) kill_block_super(s); } +#ifdef CONFIG_QUOTA +static int reiserfs_quota_off(struct super_block *sb, int type); + +static void reiserfs_quota_off_umount(struct super_block *s) +{ + int type; + + for (type = 0; type < REISERFS_MAXQUOTAS; type++) + reiserfs_quota_off(s, type); +} +#else +static inline void reiserfs_quota_off_umount(struct super_block *s) +{ +} +#endif + static void reiserfs_put_super(struct super_block *s) { struct reiserfs_transaction_handle th; th.t_trans_id = 0; - dquot_disable(s, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); + reiserfs_quota_off_umount(s); reiserfs_write_lock(s); @@ -817,7 +856,7 @@ static const struct dquot_operations reiserfs_quota_operations = { static const struct quotactl_ops reiserfs_qctl_operations = { .quota_on = reiserfs_quota_on, - .quota_off = dquot_quota_off, + .quota_off = reiserfs_quota_off, .quota_sync = dquot_quota_sync, .get_state = dquot_get_state, .set_info = dquot_set_dqinfo, @@ -2194,7 +2233,7 @@ error_unlocked: if (sbi->commit_wq) destroy_workqueue(sbi->commit_wq); - cancel_delayed_work_sync(&REISERFS_SB(s)->old_work); + reiserfs_cancel_old_flush(s); reiserfs_free_bitmap_cache(s); if (SB_BUFFER_WITH_SB(s)) @@ -2405,12 +2444,47 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id, goto out; } reiserfs_write_unlock(sb); - return dquot_quota_on(sb, type, format_id, path); + err = dquot_quota_on(sb, type, format_id, path); + if (!err) { + inode_lock(inode); + REISERFS_I(inode)->i_attrs |= REISERFS_IMMUTABLE_FL | + REISERFS_NOATIME_FL; + inode_set_flags(inode, S_IMMUTABLE | S_NOATIME, + S_IMMUTABLE | S_NOATIME); + inode_unlock(inode); + mark_inode_dirty(inode); + } + return err; out: reiserfs_write_unlock(sb); return err; } +static int reiserfs_quota_off(struct super_block *sb, int type) +{ + int err; + struct inode *inode = sb_dqopt(sb)->files[type]; + + if (!inode || !igrab(inode)) + goto out; + + err = dquot_quota_off(sb, type); + if (err) + goto out_put; + + inode_lock(inode); + REISERFS_I(inode)->i_attrs &= ~(REISERFS_IMMUTABLE_FL | + REISERFS_NOATIME_FL); + inode_set_flags(inode, 0, S_IMMUTABLE | S_NOATIME); + inode_unlock(inode); + mark_inode_dirty(inode); +out_put: + iput(inode); + return err; +out: + return dquot_quota_off(sb, type); +} + /* * Read data from quotafile - avoid pagecache and such because we cannot afford * acquiring the locks... As quota files are never truncated and quota code diff --git a/fs/select.c b/fs/select.c index e2112270d75a..bd4b2ccfd346 100644 --- a/fs/select.c +++ b/fs/select.c @@ -338,6 +338,53 @@ sticky: return ret; } +/* + * Scalable version of the fd_set. + */ + +typedef struct { + unsigned long *in, *out, *ex; + unsigned long *res_in, *res_out, *res_ex; +} fd_set_bits; + +/* + * How many longwords for "nr" bits? + */ +#define FDS_BITPERLONG (8*sizeof(long)) +#define FDS_LONGS(nr) (((nr)+FDS_BITPERLONG-1)/FDS_BITPERLONG) +#define FDS_BYTES(nr) (FDS_LONGS(nr)*sizeof(long)) + +/* + * We do a VERIFY_WRITE here even though we are only reading this time: + * we'll write to it eventually.. + * + * Use "unsigned long" accesses to let user-mode fd_set's be long-aligned. + */ +static inline +int get_fd_set(unsigned long nr, void __user *ufdset, unsigned long *fdset) +{ + nr = FDS_BYTES(nr); + if (ufdset) + return copy_from_user(fdset, ufdset, nr) ? -EFAULT : 0; + + memset(fdset, 0, nr); + return 0; +} + +static inline unsigned long __must_check +set_fd_set(unsigned long nr, void __user *ufdset, unsigned long *fdset) +{ + if (ufdset) + return __copy_to_user(ufdset, fdset, FDS_BYTES(nr)); + return 0; +} + +static inline +void zero_fd_set(unsigned long nr, unsigned long *fdset) +{ + memset(fdset, 0, FDS_BYTES(nr)); +} + #define FDS_IN(fds, n) (fds->in + n) #define FDS_OUT(fds, n) (fds->out + n) #define FDS_EX(fds, n) (fds->ex + n) @@ -401,7 +448,7 @@ static inline void wait_key_set(poll_table *wait, unsigned long in, wait->_key |= POLLOUT_SET; } -int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time) +static int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time) { ktime_t expire, *to = NULL; struct poll_wqueues table; @@ -409,7 +456,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time) int retval, i, timed_out = 0; u64 slack = 0; unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0; - unsigned long busy_end = 0; + unsigned long busy_start = 0; rcu_read_lock(); retval = max_select_fd(n, fds); @@ -512,11 +559,11 @@ int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time) /* only if found POLL_BUSY_LOOP sockets && not out of time */ if (can_busy_loop && !need_resched()) { - if (!busy_end) { - busy_end = busy_loop_end_time(); + if (!busy_start) { + busy_start = busy_loop_current_time(); continue; } - if (!busy_loop_timeout(busy_end)) + if (!busy_loop_timeout(busy_start)) continue; } busy_flag = 0; @@ -800,7 +847,7 @@ static int do_poll(struct poll_list *list, struct poll_wqueues *wait, int timed_out = 0, count = 0; u64 slack = 0; unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0; - unsigned long busy_end = 0; + unsigned long busy_start = 0; /* Optimise the no-wait case */ if (end_time && !end_time->tv_sec && !end_time->tv_nsec) { @@ -853,11 +900,11 @@ static int do_poll(struct poll_list *list, struct poll_wqueues *wait, /* only if found POLL_BUSY_LOOP sockets && not out of time */ if (can_busy_loop && !need_resched()) { - if (!busy_end) { - busy_end = busy_loop_end_time(); + if (!busy_start) { + busy_start = busy_loop_current_time(); continue; } - if (!busy_loop_timeout(busy_end)) + if (!busy_loop_timeout(busy_start)) continue; } busy_flag = 0; @@ -881,7 +928,7 @@ static int do_poll(struct poll_list *list, struct poll_wqueues *wait, #define N_STACK_PPS ((sizeof(stack_pps) - sizeof(struct poll_list)) / \ sizeof(struct pollfd)) -int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, +static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, struct timespec64 *end_time) { struct poll_wqueues table; @@ -1053,3 +1100,373 @@ SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds, return ret; } + +#ifdef CONFIG_COMPAT +#define __COMPAT_NFDBITS (8 * sizeof(compat_ulong_t)) + +static +int compat_poll_select_copy_remaining(struct timespec *end_time, void __user *p, + int timeval, int ret) +{ + struct timespec ts; + + if (!p) + return ret; + + if (current->personality & STICKY_TIMEOUTS) + goto sticky; + + /* No update for zero timeout */ + if (!end_time->tv_sec && !end_time->tv_nsec) + return ret; + + ktime_get_ts(&ts); + ts = timespec_sub(*end_time, ts); + if (ts.tv_sec < 0) + ts.tv_sec = ts.tv_nsec = 0; + + if (timeval) { + struct compat_timeval rtv; + + rtv.tv_sec = ts.tv_sec; + rtv.tv_usec = ts.tv_nsec / NSEC_PER_USEC; + + if (!copy_to_user(p, &rtv, sizeof(rtv))) + return ret; + } else { + struct compat_timespec rts; + + rts.tv_sec = ts.tv_sec; + rts.tv_nsec = ts.tv_nsec; + + if (!copy_to_user(p, &rts, sizeof(rts))) + return ret; + } + /* + * If an application puts its timeval in read-only memory, we + * don't want the Linux-specific update to the timeval to + * cause a fault after the select has completed + * successfully. However, because we're not updating the + * timeval, we can't restart the system call. + */ + +sticky: + if (ret == -ERESTARTNOHAND) + ret = -EINTR; + return ret; +} + +/* + * Ooo, nasty. We need here to frob 32-bit unsigned longs to + * 64-bit unsigned longs. + */ +static +int compat_get_fd_set(unsigned long nr, compat_ulong_t __user *ufdset, + unsigned long *fdset) +{ + nr = DIV_ROUND_UP(nr, __COMPAT_NFDBITS); + if (ufdset) { + unsigned long odd; + + if (!access_ok(VERIFY_WRITE, ufdset, nr*sizeof(compat_ulong_t))) + return -EFAULT; + + odd = nr & 1UL; + nr &= ~1UL; + while (nr) { + unsigned long h, l; + if (__get_user(l, ufdset) || __get_user(h, ufdset+1)) + return -EFAULT; + ufdset += 2; + *fdset++ = h << 32 | l; + nr -= 2; + } + if (odd && __get_user(*fdset, ufdset)) + return -EFAULT; + } else { + /* Tricky, must clear full unsigned long in the + * kernel fdset at the end, this makes sure that + * actually happens. + */ + memset(fdset, 0, ((nr + 1) & ~1)*sizeof(compat_ulong_t)); + } + return 0; +} + +static +int compat_set_fd_set(unsigned long nr, compat_ulong_t __user *ufdset, + unsigned long *fdset) +{ + unsigned long odd; + nr = DIV_ROUND_UP(nr, __COMPAT_NFDBITS); + + if (!ufdset) + return 0; + + odd = nr & 1UL; + nr &= ~1UL; + while (nr) { + unsigned long h, l; + l = *fdset++; + h = l >> 32; + if (__put_user(l, ufdset) || __put_user(h, ufdset+1)) + return -EFAULT; + ufdset += 2; + nr -= 2; + } + if (odd && __put_user(*fdset, ufdset)) + return -EFAULT; + return 0; +} + + +/* + * This is a virtual copy of sys_select from fs/select.c and probably + * should be compared to it from time to time + */ + +/* + * We can actually return ERESTARTSYS instead of EINTR, but I'd + * like to be certain this leads to no problems. So I return + * EINTR just for safety. + * + * Update: ERESTARTSYS breaks at least the xview clock binary, so + * I'm trying ERESTARTNOHAND which restart only when you want to. + */ +static int compat_core_sys_select(int n, compat_ulong_t __user *inp, + compat_ulong_t __user *outp, compat_ulong_t __user *exp, + struct timespec *end_time) +{ + fd_set_bits fds; + void *bits; + int size, max_fds, ret = -EINVAL; + struct fdtable *fdt; + long stack_fds[SELECT_STACK_ALLOC/sizeof(long)]; + + if (n < 0) + goto out_nofds; + + /* max_fds can increase, so grab it once to avoid race */ + rcu_read_lock(); + fdt = files_fdtable(current->files); + max_fds = fdt->max_fds; + rcu_read_unlock(); + if (n > max_fds) + n = max_fds; + + /* + * We need 6 bitmaps (in/out/ex for both incoming and outgoing), + * since we used fdset we need to allocate memory in units of + * long-words. + */ + size = FDS_BYTES(n); + bits = stack_fds; + if (size > sizeof(stack_fds) / 6) { + bits = kmalloc(6 * size, GFP_KERNEL); + ret = -ENOMEM; + if (!bits) + goto out_nofds; + } + fds.in = (unsigned long *) bits; + fds.out = (unsigned long *) (bits + size); + fds.ex = (unsigned long *) (bits + 2*size); + fds.res_in = (unsigned long *) (bits + 3*size); + fds.res_out = (unsigned long *) (bits + 4*size); + fds.res_ex = (unsigned long *) (bits + 5*size); + + if ((ret = compat_get_fd_set(n, inp, fds.in)) || + (ret = compat_get_fd_set(n, outp, fds.out)) || + (ret = compat_get_fd_set(n, exp, fds.ex))) + goto out; + zero_fd_set(n, fds.res_in); + zero_fd_set(n, fds.res_out); + zero_fd_set(n, fds.res_ex); + + ret = do_select(n, &fds, end_time); + + if (ret < 0) + goto out; + if (!ret) { + ret = -ERESTARTNOHAND; + if (signal_pending(current)) + goto out; + ret = 0; + } + + if (compat_set_fd_set(n, inp, fds.res_in) || + compat_set_fd_set(n, outp, fds.res_out) || + compat_set_fd_set(n, exp, fds.res_ex)) + ret = -EFAULT; +out: + if (bits != stack_fds) + kfree(bits); +out_nofds: + return ret; +} + +COMPAT_SYSCALL_DEFINE5(select, int, n, compat_ulong_t __user *, inp, + compat_ulong_t __user *, outp, compat_ulong_t __user *, exp, + struct compat_timeval __user *, tvp) +{ + struct timespec end_time, *to = NULL; + struct compat_timeval tv; + int ret; + + if (tvp) { + if (copy_from_user(&tv, tvp, sizeof(tv))) + return -EFAULT; + + to = &end_time; + if (poll_select_set_timeout(to, + tv.tv_sec + (tv.tv_usec / USEC_PER_SEC), + (tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC)) + return -EINVAL; + } + + ret = compat_core_sys_select(n, inp, outp, exp, to); + ret = compat_poll_select_copy_remaining(&end_time, tvp, 1, ret); + + return ret; +} + +struct compat_sel_arg_struct { + compat_ulong_t n; + compat_uptr_t inp; + compat_uptr_t outp; + compat_uptr_t exp; + compat_uptr_t tvp; +}; + +COMPAT_SYSCALL_DEFINE1(old_select, struct compat_sel_arg_struct __user *, arg) +{ + struct compat_sel_arg_struct a; + + if (copy_from_user(&a, arg, sizeof(a))) + return -EFAULT; + return compat_sys_select(a.n, compat_ptr(a.inp), compat_ptr(a.outp), + compat_ptr(a.exp), compat_ptr(a.tvp)); +} + +static long do_compat_pselect(int n, compat_ulong_t __user *inp, + compat_ulong_t __user *outp, compat_ulong_t __user *exp, + struct compat_timespec __user *tsp, compat_sigset_t __user *sigmask, + compat_size_t sigsetsize) +{ + compat_sigset_t ss32; + sigset_t ksigmask, sigsaved; + struct compat_timespec ts; + struct timespec end_time, *to = NULL; + int ret; + + if (tsp) { + if (copy_from_user(&ts, tsp, sizeof(ts))) + return -EFAULT; + + to = &end_time; + if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) + return -EINVAL; + } + + if (sigmask) { + if (sigsetsize != sizeof(compat_sigset_t)) + return -EINVAL; + if (copy_from_user(&ss32, sigmask, sizeof(ss32))) + return -EFAULT; + sigset_from_compat(&ksigmask, &ss32); + + sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP)); + sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); + } + + ret = compat_core_sys_select(n, inp, outp, exp, to); + ret = compat_poll_select_copy_remaining(&end_time, tsp, 0, ret); + + if (ret == -ERESTARTNOHAND) { + /* + * Don't restore the signal mask yet. Let do_signal() deliver + * the signal on the way back to userspace, before the signal + * mask is restored. + */ + if (sigmask) { + memcpy(¤t->saved_sigmask, &sigsaved, + sizeof(sigsaved)); + set_restore_sigmask(); + } + } else if (sigmask) + sigprocmask(SIG_SETMASK, &sigsaved, NULL); + + return ret; +} + +COMPAT_SYSCALL_DEFINE6(pselect6, int, n, compat_ulong_t __user *, inp, + compat_ulong_t __user *, outp, compat_ulong_t __user *, exp, + struct compat_timespec __user *, tsp, void __user *, sig) +{ + compat_size_t sigsetsize = 0; + compat_uptr_t up = 0; + + if (sig) { + if (!access_ok(VERIFY_READ, sig, + sizeof(compat_uptr_t)+sizeof(compat_size_t)) || + __get_user(up, (compat_uptr_t __user *)sig) || + __get_user(sigsetsize, + (compat_size_t __user *)(sig+sizeof(up)))) + return -EFAULT; + } + return do_compat_pselect(n, inp, outp, exp, tsp, compat_ptr(up), + sigsetsize); +} + +COMPAT_SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, + unsigned int, nfds, struct compat_timespec __user *, tsp, + const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize) +{ + compat_sigset_t ss32; + sigset_t ksigmask, sigsaved; + struct compat_timespec ts; + struct timespec end_time, *to = NULL; + int ret; + + if (tsp) { + if (copy_from_user(&ts, tsp, sizeof(ts))) + return -EFAULT; + + to = &end_time; + if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) + return -EINVAL; + } + + if (sigmask) { + if (sigsetsize != sizeof(compat_sigset_t)) + return -EINVAL; + if (copy_from_user(&ss32, sigmask, sizeof(ss32))) + return -EFAULT; + sigset_from_compat(&ksigmask, &ss32); + + sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP)); + sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); + } + + ret = do_sys_poll(ufds, nfds, to); + + /* We can restart this syscall, usually */ + if (ret == -EINTR) { + /* + * Don't restore the signal mask yet. Let do_signal() deliver + * the signal on the way back to userspace, before the signal + * mask is restored. + */ + if (sigmask) { + memcpy(¤t->saved_sigmask, &sigsaved, + sizeof(sigsaved)); + set_restore_sigmask(); + } + ret = -ERESTARTNOHAND; + } else if (sigmask) + sigprocmask(SIG_SETMASK, &sigsaved, NULL); + + ret = compat_poll_select_copy_remaining(&end_time, tsp, 0, ret); + + return ret; +} +#endif diff --git a/fs/splice.c b/fs/splice.c index 006ba50f4ece..540c4a44756c 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -247,11 +247,6 @@ ssize_t add_to_pipe(struct pipe_inode_info *pipe, struct pipe_buffer *buf) } EXPORT_SYMBOL(add_to_pipe); -void spd_release_page(struct splice_pipe_desc *spd, unsigned int i) -{ - put_page(spd->pages[i]); -} - /* * Check if we need to grow the arrays holding pages and partial page * descriptions. @@ -393,7 +388,7 @@ static ssize_t default_file_splice_read(struct file *in, loff_t *ppos, struct iov_iter to; struct page **pages; unsigned int nr_pages; - size_t offset, dummy, copied = 0; + size_t offset, base, copied = 0; ssize_t res; int i; @@ -408,12 +403,11 @@ static ssize_t default_file_splice_read(struct file *in, loff_t *ppos, iov_iter_pipe(&to, ITER_PIPE | READ, pipe, len + offset); - res = iov_iter_get_pages_alloc(&to, &pages, len + offset, &dummy); + res = iov_iter_get_pages_alloc(&to, &pages, len + offset, &base); if (res <= 0) return -ENOMEM; - BUG_ON(dummy); - nr_pages = DIV_ROUND_UP(res, PAGE_SIZE); + nr_pages = DIV_ROUND_UP(res + base, PAGE_SIZE); vec = __vec; if (nr_pages > PIPE_DEF_BUFFERS) { @@ -1359,6 +1353,8 @@ SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, iov, struct fd f; long error; + if (unlikely(flags & ~SPLICE_F_ALL)) + return -EINVAL; if (unlikely(nr_segs > UIO_MAXIOV)) return -EINVAL; else if (unlikely(!nr_segs)) @@ -1409,6 +1405,9 @@ SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in, if (unlikely(!len)) return 0; + if (unlikely(flags & ~SPLICE_F_ALL)) + return -EINVAL; + error = -EBADF; in = fdget(fd_in); if (in.file) { @@ -1737,6 +1736,9 @@ SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags) struct fd in; int error; + if (unlikely(flags & ~SPLICE_F_ALL)) + return -EINVAL; + if (unlikely(!len)) return 0; diff --git a/fs/stat.c b/fs/stat.c index c6c963b2546b..f494b182c7c7 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -15,6 +15,7 @@ #include <linux/cred.h> #include <linux/syscalls.h> #include <linux/pagemap.h> +#include <linux/compat.h> #include <linux/uaccess.h> #include <asm/unistd.h> @@ -547,13 +548,13 @@ cp_statx(const struct kstat *stat, struct statx __user *buffer) /** * sys_statx - System call to get enhanced stats * @dfd: Base directory to pathwalk from *or* fd to stat. - * @filename: File to stat *or* NULL. + * @filename: File to stat or "" with AT_EMPTY_PATH * @flags: AT_* flags to control pathwalk. * @mask: Parts of statx struct actually required. * @buffer: Result buffer. * - * Note that if filename is NULL, then it does the equivalent of fstat() using - * dfd to indicate the file of interest. + * Note that fstat() can be emulated by setting dfd to the fd of interest, + * supplying "" as the filename and setting AT_EMPTY_PATH in the flags. */ SYSCALL_DEFINE5(statx, int, dfd, const char __user *, filename, unsigned, flags, @@ -568,16 +569,98 @@ SYSCALL_DEFINE5(statx, if ((flags & AT_STATX_SYNC_TYPE) == AT_STATX_SYNC_TYPE) return -EINVAL; - if (filename) - error = vfs_statx(dfd, filename, flags, &stat, mask); - else - error = vfs_statx_fd(dfd, &stat, mask, flags); + error = vfs_statx(dfd, filename, flags, &stat, mask); if (error) return error; return cp_statx(&stat, buffer); } +#ifdef CONFIG_COMPAT +static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf) +{ + struct compat_stat tmp; + + if (!old_valid_dev(stat->dev) || !old_valid_dev(stat->rdev)) + return -EOVERFLOW; + + memset(&tmp, 0, sizeof(tmp)); + tmp.st_dev = old_encode_dev(stat->dev); + tmp.st_ino = stat->ino; + if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino) + return -EOVERFLOW; + tmp.st_mode = stat->mode; + tmp.st_nlink = stat->nlink; + if (tmp.st_nlink != stat->nlink) + return -EOVERFLOW; + SET_UID(tmp.st_uid, from_kuid_munged(current_user_ns(), stat->uid)); + SET_GID(tmp.st_gid, from_kgid_munged(current_user_ns(), stat->gid)); + tmp.st_rdev = old_encode_dev(stat->rdev); + if ((u64) stat->size > MAX_NON_LFS) + return -EOVERFLOW; + tmp.st_size = stat->size; + tmp.st_atime = stat->atime.tv_sec; + tmp.st_atime_nsec = stat->atime.tv_nsec; + tmp.st_mtime = stat->mtime.tv_sec; + tmp.st_mtime_nsec = stat->mtime.tv_nsec; + tmp.st_ctime = stat->ctime.tv_sec; + tmp.st_ctime_nsec = stat->ctime.tv_nsec; + tmp.st_blocks = stat->blocks; + tmp.st_blksize = stat->blksize; + return copy_to_user(ubuf, &tmp, sizeof(tmp)) ? -EFAULT : 0; +} + +COMPAT_SYSCALL_DEFINE2(newstat, const char __user *, filename, + struct compat_stat __user *, statbuf) +{ + struct kstat stat; + int error; + + error = vfs_stat(filename, &stat); + if (error) + return error; + return cp_compat_stat(&stat, statbuf); +} + +COMPAT_SYSCALL_DEFINE2(newlstat, const char __user *, filename, + struct compat_stat __user *, statbuf) +{ + struct kstat stat; + int error; + + error = vfs_lstat(filename, &stat); + if (error) + return error; + return cp_compat_stat(&stat, statbuf); +} + +#ifndef __ARCH_WANT_STAT64 +COMPAT_SYSCALL_DEFINE4(newfstatat, unsigned int, dfd, + const char __user *, filename, + struct compat_stat __user *, statbuf, int, flag) +{ + struct kstat stat; + int error; + + error = vfs_fstatat(dfd, filename, &stat, flag); + if (error) + return error; + return cp_compat_stat(&stat, statbuf); +} +#endif + +COMPAT_SYSCALL_DEFINE2(newfstat, unsigned int, fd, + struct compat_stat __user *, statbuf) +{ + struct kstat stat; + int error = vfs_fstat(fd, &stat); + + if (!error) + error = cp_compat_stat(&stat, statbuf); + return error; +} +#endif + /* Caller is here responsible for sufficient locking (ie. inode->i_lock) */ void __inode_add_bytes(struct inode *inode, loff_t bytes) { diff --git a/fs/statfs.c b/fs/statfs.c index 13ae259d4879..4e4623c7a126 100644 --- a/fs/statfs.c +++ b/fs/statfs.c @@ -7,6 +7,7 @@ #include <linux/statfs.h> #include <linux/security.h> #include <linux/uaccess.h> +#include <linux/compat.h> #include "internal.h" static int flags_by_mnt(int mnt_flags) @@ -239,3 +240,142 @@ SYSCALL_DEFINE2(ustat, unsigned, dev, struct ustat __user *, ubuf) return copy_to_user(ubuf, &tmp, sizeof(struct ustat)) ? -EFAULT : 0; } + +#ifdef CONFIG_COMPAT +static int put_compat_statfs(struct compat_statfs __user *ubuf, struct kstatfs *kbuf) +{ + if (sizeof ubuf->f_blocks == 4) { + if ((kbuf->f_blocks | kbuf->f_bfree | kbuf->f_bavail | + kbuf->f_bsize | kbuf->f_frsize) & 0xffffffff00000000ULL) + return -EOVERFLOW; + /* f_files and f_ffree may be -1; it's okay + * to stuff that into 32 bits */ + if (kbuf->f_files != 0xffffffffffffffffULL + && (kbuf->f_files & 0xffffffff00000000ULL)) + return -EOVERFLOW; + if (kbuf->f_ffree != 0xffffffffffffffffULL + && (kbuf->f_ffree & 0xffffffff00000000ULL)) + return -EOVERFLOW; + } + if (!access_ok(VERIFY_WRITE, ubuf, sizeof(*ubuf)) || + __put_user(kbuf->f_type, &ubuf->f_type) || + __put_user(kbuf->f_bsize, &ubuf->f_bsize) || + __put_user(kbuf->f_blocks, &ubuf->f_blocks) || + __put_user(kbuf->f_bfree, &ubuf->f_bfree) || + __put_user(kbuf->f_bavail, &ubuf->f_bavail) || + __put_user(kbuf->f_files, &ubuf->f_files) || + __put_user(kbuf->f_ffree, &ubuf->f_ffree) || + __put_user(kbuf->f_namelen, &ubuf->f_namelen) || + __put_user(kbuf->f_fsid.val[0], &ubuf->f_fsid.val[0]) || + __put_user(kbuf->f_fsid.val[1], &ubuf->f_fsid.val[1]) || + __put_user(kbuf->f_frsize, &ubuf->f_frsize) || + __put_user(kbuf->f_flags, &ubuf->f_flags) || + __clear_user(ubuf->f_spare, sizeof(ubuf->f_spare))) + return -EFAULT; + return 0; +} + +/* + * The following statfs calls are copies of code from fs/statfs.c and + * should be checked against those from time to time + */ +COMPAT_SYSCALL_DEFINE2(statfs, const char __user *, pathname, struct compat_statfs __user *, buf) +{ + struct kstatfs tmp; + int error = user_statfs(pathname, &tmp); + if (!error) + error = put_compat_statfs(buf, &tmp); + return error; +} + +COMPAT_SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct compat_statfs __user *, buf) +{ + struct kstatfs tmp; + int error = fd_statfs(fd, &tmp); + if (!error) + error = put_compat_statfs(buf, &tmp); + return error; +} + +static int put_compat_statfs64(struct compat_statfs64 __user *ubuf, struct kstatfs *kbuf) +{ + if (sizeof(ubuf->f_bsize) == 4) { + if ((kbuf->f_type | kbuf->f_bsize | kbuf->f_namelen | + kbuf->f_frsize | kbuf->f_flags) & 0xffffffff00000000ULL) + return -EOVERFLOW; + /* f_files and f_ffree may be -1; it's okay + * to stuff that into 32 bits */ + if (kbuf->f_files != 0xffffffffffffffffULL + && (kbuf->f_files & 0xffffffff00000000ULL)) + return -EOVERFLOW; + if (kbuf->f_ffree != 0xffffffffffffffffULL + && (kbuf->f_ffree & 0xffffffff00000000ULL)) + return -EOVERFLOW; + } + if (!access_ok(VERIFY_WRITE, ubuf, sizeof(*ubuf)) || + __put_user(kbuf->f_type, &ubuf->f_type) || + __put_user(kbuf->f_bsize, &ubuf->f_bsize) || + __put_user(kbuf->f_blocks, &ubuf->f_blocks) || + __put_user(kbuf->f_bfree, &ubuf->f_bfree) || + __put_user(kbuf->f_bavail, &ubuf->f_bavail) || + __put_user(kbuf->f_files, &ubuf->f_files) || + __put_user(kbuf->f_ffree, &ubuf->f_ffree) || + __put_user(kbuf->f_namelen, &ubuf->f_namelen) || + __put_user(kbuf->f_fsid.val[0], &ubuf->f_fsid.val[0]) || + __put_user(kbuf->f_fsid.val[1], &ubuf->f_fsid.val[1]) || + __put_user(kbuf->f_frsize, &ubuf->f_frsize) || + __put_user(kbuf->f_flags, &ubuf->f_flags) || + __clear_user(ubuf->f_spare, sizeof(ubuf->f_spare))) + return -EFAULT; + return 0; +} + +COMPAT_SYSCALL_DEFINE3(statfs64, const char __user *, pathname, compat_size_t, sz, struct compat_statfs64 __user *, buf) +{ + struct kstatfs tmp; + int error; + + if (sz != sizeof(*buf)) + return -EINVAL; + + error = user_statfs(pathname, &tmp); + if (!error) + error = put_compat_statfs64(buf, &tmp); + return error; +} + +COMPAT_SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, compat_size_t, sz, struct compat_statfs64 __user *, buf) +{ + struct kstatfs tmp; + int error; + + if (sz != sizeof(*buf)) + return -EINVAL; + + error = fd_statfs(fd, &tmp); + if (!error) + error = put_compat_statfs64(buf, &tmp); + return error; +} + +/* + * This is a copy of sys_ustat, just dealing with a structure layout. + * Given how simple this syscall is that apporach is more maintainable + * than the various conversion hacks. + */ +COMPAT_SYSCALL_DEFINE2(ustat, unsigned, dev, struct compat_ustat __user *, u) +{ + struct compat_ustat tmp; + struct kstatfs sbuf; + int err = vfs_ustat(new_decode_dev(dev), &sbuf); + if (err) + return err; + + memset(&tmp, 0, sizeof(struct compat_ustat)); + tmp.f_tfree = sbuf.f_bfree; + tmp.f_tinode = sbuf.f_ffree; + if (copy_to_user(u, &tmp, sizeof(struct compat_ustat))) + return -EFAULT; + return 0; +} +#endif diff --git a/fs/super.c b/fs/super.c index b8b6a086c03b..adb0c0de428c 100644 --- a/fs/super.c +++ b/fs/super.c @@ -446,6 +446,10 @@ void generic_shutdown_super(struct super_block *sb) hlist_del_init(&sb->s_instances); spin_unlock(&sb_lock); up_write(&sb->s_umount); + if (sb->s_bdi != &noop_backing_dev_info) { + bdi_put(sb->s_bdi); + sb->s_bdi = &noop_backing_dev_info; + } } EXPORT_SYMBOL(generic_shutdown_super); @@ -1049,12 +1053,8 @@ static int set_bdev_super(struct super_block *s, void *data) { s->s_bdev = data; s->s_dev = s->s_bdev->bd_dev; + s->s_bdi = bdi_get(s->s_bdev->bd_bdi); - /* - * We set the bdi here to the queue backing, file systems can - * overwrite this in ->fill_super() - */ - s->s_bdi = bdev_get_queue(s->s_bdev)->backing_dev_info; return 0; } @@ -1256,6 +1256,49 @@ out: } /* + * Setup private BDI for given superblock. It gets automatically cleaned up + * in generic_shutdown_super(). + */ +int super_setup_bdi_name(struct super_block *sb, char *fmt, ...) +{ + struct backing_dev_info *bdi; + int err; + va_list args; + + bdi = bdi_alloc(GFP_KERNEL); + if (!bdi) + return -ENOMEM; + + bdi->name = sb->s_type->name; + + va_start(args, fmt); + err = bdi_register_va(bdi, fmt, args); + va_end(args); + if (err) { + bdi_put(bdi); + return err; + } + WARN_ON(sb->s_bdi != &noop_backing_dev_info); + sb->s_bdi = bdi; + + return 0; +} +EXPORT_SYMBOL(super_setup_bdi_name); + +/* + * Setup private BDI for given superblock. I gets automatically cleaned up + * in generic_shutdown_super(). + */ +int super_setup_bdi(struct super_block *sb) +{ + static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); + + return super_setup_bdi_name(sb, "%.28s-%ld", sb->s_type->name, + atomic_long_inc_return(&bdi_seq)); +} +EXPORT_SYMBOL(super_setup_bdi); + +/* * This is an internal function, please use sb_end_{write,pagefault,intwrite} * instead. */ diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index 1e712a364680..718b749fa11a 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -32,6 +32,7 @@ #include <linux/math64.h> #include <linux/uaccess.h> #include <linux/random.h> +#include <linux/ctype.h> #include "ubifs.h" static DEFINE_SPINLOCK(dbg_lock); @@ -286,8 +287,10 @@ void ubifs_dump_inode(struct ubifs_info *c, const struct inode *inode) break; } - pr_err("\t%d: %s (%s)\n", - count++, dent->name, get_dent_type(dent->type)); + pr_err("\t%d: inode %llu, type %s, len %d\n", + count++, (unsigned long long) le64_to_cpu(dent->inum), + get_dent_type(dent->type), + le16_to_cpu(dent->nlen)); fname_name(&nm) = dent->name; fname_len(&nm) = le16_to_cpu(dent->nlen); @@ -464,7 +467,8 @@ void ubifs_dump_node(const struct ubifs_info *c, const void *node) pr_err("(bad name length, not printing, bad or corrupted node)"); else { for (i = 0; i < nlen && dent->name[i]; i++) - pr_cont("%c", dent->name[i]); + pr_cont("%c", isprint(dent->name[i]) ? + dent->name[i] : '?'); } pr_cont("\n"); diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 30825d882aa9..b777bddaa1dd 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -606,8 +606,8 @@ static int ubifs_readdir(struct file *file, struct dir_context *ctx) } while (1) { - dbg_gen("feed '%s', ino %llu, new f_pos %#x", - dent->name, (unsigned long long)le64_to_cpu(dent->inum), + dbg_gen("ino %llu, new f_pos %#x", + (unsigned long long)le64_to_cpu(dent->inum), key_hash_flash(c, &dent->key)); ubifs_assert(le64_to_cpu(dent->ch.sqnum) > ubifs_inode(dir)->creat_sqnum); @@ -748,6 +748,11 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir, goto out_fname; lock_2_inodes(dir, inode); + + /* Handle O_TMPFILE corner case, it is allowed to link a O_TMPFILE. */ + if (inode->i_nlink == 0) + ubifs_delete_orphan(c, inode->i_ino); + inc_nlink(inode); ihold(inode); inode->i_ctime = ubifs_current_time(inode); @@ -768,6 +773,8 @@ out_cancel: dir->i_size -= sz_change; dir_ui->ui_size = dir->i_size; drop_nlink(inode); + if (inode->i_nlink == 0) + ubifs_add_orphan(c, inode->i_ino); unlock_2_inodes(dir, inode); ubifs_release_budget(c, &req); iput(inode); @@ -1068,8 +1075,10 @@ static int ubifs_mknod(struct inode *dir, struct dentry *dentry, } err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &nm); - if (err) + if (err) { + kfree(dev); goto out_budg; + } sz_change = CALC_DENT_SIZE(fname_len(&nm)); @@ -1316,9 +1325,6 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, unsigned int uninitialized_var(saved_nlink); struct fscrypt_name old_nm, new_nm; - if (flags & ~RENAME_NOREPLACE) - return -EINVAL; - /* * Budget request settings: deletion direntry, new direntry, removing * the old inode, and changing old and new parent directory inodes. diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index b73811bd7676..cf4cc99b75b5 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -1827,7 +1827,6 @@ static void ubifs_put_super(struct super_block *sb) } ubifs_umount(c); - bdi_destroy(&c->bdi); ubi_close_volume(c->ubi); mutex_unlock(&c->umount_mutex); } @@ -2019,29 +2018,25 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent) goto out; } + err = ubifs_parse_options(c, data, 0); + if (err) + goto out_close; + /* * UBIFS provides 'backing_dev_info' in order to disable read-ahead. For * UBIFS, I/O is not deferred, it is done immediately in readpage, * which means the user would have to wait not just for their own I/O * but the read-ahead I/O as well i.e. completely pointless. * - * Read-ahead will be disabled because @c->bdi.ra_pages is 0. + * Read-ahead will be disabled because @sb->s_bdi->ra_pages is 0. Also + * @sb->s_bdi->capabilities are initialized to 0 so there won't be any + * writeback happening. */ - c->bdi.name = "ubifs", - c->bdi.capabilities = 0; - err = bdi_init(&c->bdi); + err = super_setup_bdi_name(sb, "ubifs_%d_%d", c->vi.ubi_num, + c->vi.vol_id); if (err) goto out_close; - err = bdi_register(&c->bdi, NULL, "ubifs_%d_%d", - c->vi.ubi_num, c->vi.vol_id); - if (err) - goto out_bdi; - - err = ubifs_parse_options(c, data, 0); - if (err) - goto out_bdi; - sb->s_bdi = &c->bdi; sb->s_fs_info = c; sb->s_magic = UBIFS_SUPER_MAGIC; sb->s_blocksize = UBIFS_BLOCK_SIZE; @@ -2080,8 +2075,6 @@ out_umount: ubifs_umount(c); out_unlock: mutex_unlock(&c->umount_mutex); -out_bdi: - bdi_destroy(&c->bdi); out_close: ubi_close_volume(c->ubi); out: diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 4d57e488038e..4da10a6d702a 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -972,7 +972,6 @@ struct ubifs_debug_info; * struct ubifs_info - UBIFS file-system description data structure * (per-superblock). * @vfs_sb: VFS @struct super_block object - * @bdi: backing device info object to make VFS happy and disable read-ahead * * @highest_inum: highest used inode number * @max_sqnum: current global sequence number @@ -1220,7 +1219,6 @@ struct ubifs_debug_info; */ struct ubifs_info { struct super_block *vfs_sb; - struct backing_dev_info bdi; ino_t highest_inum; unsigned long long max_sqnum; @@ -1461,7 +1459,6 @@ extern const struct inode_operations ubifs_file_inode_operations; extern const struct file_operations ubifs_dir_operations; extern const struct inode_operations ubifs_dir_inode_operations; extern const struct inode_operations ubifs_symlink_inode_operations; -extern struct backing_dev_info ubifs_backing_dev_info; extern struct ubifs_compressor *ubifs_compressors[UBIFS_COMPR_TYPES_CNT]; /* io.c */ diff --git a/fs/udf/file.c b/fs/udf/file.c index e04cc0cdca9d..f5eb2d5b3bac 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -44,12 +44,12 @@ static void __udf_adinicb_readpage(struct page *page) char *kaddr; struct udf_inode_info *iinfo = UDF_I(inode); - kaddr = kmap(page); + kaddr = kmap_atomic(page); memcpy(kaddr, iinfo->i_ext.i_data + iinfo->i_lenEAttr, inode->i_size); memset(kaddr + inode->i_size, 0, PAGE_SIZE - inode->i_size); flush_dcache_page(page); SetPageUptodate(page); - kunmap(page); + kunmap_atomic(kaddr); } static int udf_adinicb_readpage(struct file *file, struct page *page) @@ -70,11 +70,11 @@ static int udf_adinicb_writepage(struct page *page, BUG_ON(!PageLocked(page)); - kaddr = kmap(page); + kaddr = kmap_atomic(page); memcpy(iinfo->i_ext.i_data + iinfo->i_lenEAttr, kaddr, inode->i_size); - mark_inode_dirty(inode); SetPageUptodate(page); - kunmap(page); + kunmap_atomic(kaddr); + mark_inode_dirty(inode); unlock_page(page); return 0; diff --git a/fs/udf/inode.c b/fs/udf/inode.c index a8d8f71ef8bd..98c510e17203 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -276,14 +276,14 @@ int udf_expand_file_adinicb(struct inode *inode) return -ENOMEM; if (!PageUptodate(page)) { - kaddr = kmap(page); + kaddr = kmap_atomic(page); memset(kaddr + iinfo->i_lenAlloc, 0x00, PAGE_SIZE - iinfo->i_lenAlloc); memcpy(kaddr, iinfo->i_ext.i_data + iinfo->i_lenEAttr, iinfo->i_lenAlloc); flush_dcache_page(page); SetPageUptodate(page); - kunmap(page); + kunmap_atomic(kaddr); } down_write(&iinfo->i_data_sem); memset(iinfo->i_ext.i_data + iinfo->i_lenEAttr, 0x00, @@ -300,11 +300,11 @@ int udf_expand_file_adinicb(struct inode *inode) if (err) { /* Restore everything back so that we don't lose data... */ lock_page(page); - kaddr = kmap(page); down_write(&iinfo->i_data_sem); + kaddr = kmap_atomic(page); memcpy(iinfo->i_ext.i_data + iinfo->i_lenEAttr, kaddr, inode->i_size); - kunmap(page); + kunmap_atomic(kaddr); unlock_page(page); iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB; inode->i_data.a_ops = &udf_adinicb_aops; @@ -1535,7 +1535,7 @@ reread: inode->i_data.a_ops = &udf_symlink_aops; inode->i_op = &udf_symlink_inode_operations; inode_nohighmem(inode); - inode->i_mode = S_IFLNK | S_IRWXUGO; + inode->i_mode = S_IFLNK | 0777; break; case ICBTAG_FILE_TYPE_MAIN: udf_debug("METADATA FILE-----\n"); @@ -1591,9 +1591,9 @@ static umode_t udf_convert_permissions(struct fileEntry *fe) permissions = le32_to_cpu(fe->permissions); flags = le16_to_cpu(fe->icbTag.flags); - mode = ((permissions) & S_IRWXO) | - ((permissions >> 2) & S_IRWXG) | - ((permissions >> 4) & S_IRWXU) | + mode = ((permissions) & 0007) | + ((permissions >> 2) & 0070) | + ((permissions >> 4) & 0700) | ((flags & ICBTAG_FLAG_SETUID) ? S_ISUID : 0) | ((flags & ICBTAG_FLAG_SETGID) ? S_ISGID : 0) | ((flags & ICBTAG_FLAG_STICKY) ? S_ISVTX : 0); @@ -1669,9 +1669,9 @@ static int udf_update_inode(struct inode *inode, int do_sync) else fe->gid = cpu_to_le32(i_gid_read(inode)); - udfperms = ((inode->i_mode & S_IRWXO)) | - ((inode->i_mode & S_IRWXG) << 2) | - ((inode->i_mode & S_IRWXU) << 4); + udfperms = ((inode->i_mode & 0007)) | + ((inode->i_mode & 0070) << 2) | + ((inode->i_mode & 0700) << 4); udfperms |= (le32_to_cpu(fe->permissions) & (FE_PERM_O_DELETE | FE_PERM_O_CHATTR | diff --git a/fs/udf/namei.c b/fs/udf/namei.c index babf48d0e553..385ee89d5824 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -906,7 +906,7 @@ out: static int udf_symlink(struct inode *dir, struct dentry *dentry, const char *symname) { - struct inode *inode = udf_new_inode(dir, S_IFLNK | S_IRWXUGO); + struct inode *inode = udf_new_inode(dir, S_IFLNK | 0777); struct pathComponent *pc; const char *compstart; struct extent_position epos = {}; diff --git a/fs/utimes.c b/fs/utimes.c index 32b15b3f6629..6571d8c848a0 100644 --- a/fs/utimes.c +++ b/fs/utimes.c @@ -1,14 +1,10 @@ -#include <linux/compiler.h> #include <linux/file.h> -#include <linux/fs.h> -#include <linux/linkage.h> #include <linux/mount.h> #include <linux/namei.h> -#include <linux/sched.h> -#include <linux/stat.h> #include <linux/utime.h> #include <linux/syscalls.h> #include <linux/uaccess.h> +#include <linux/compat.h> #include <asm/unistd.h> #ifdef __ARCH_WANT_SYS_UTIME @@ -219,3 +215,63 @@ SYSCALL_DEFINE2(utimes, char __user *, filename, { return sys_futimesat(AT_FDCWD, filename, utimes); } + +#ifdef CONFIG_COMPAT +/* + * Not all architectures have sys_utime, so implement this in terms + * of sys_utimes. + */ +COMPAT_SYSCALL_DEFINE2(utime, const char __user *, filename, + struct compat_utimbuf __user *, t) +{ + struct timespec tv[2]; + + if (t) { + if (get_user(tv[0].tv_sec, &t->actime) || + get_user(tv[1].tv_sec, &t->modtime)) + return -EFAULT; + tv[0].tv_nsec = 0; + tv[1].tv_nsec = 0; + } + return do_utimes(AT_FDCWD, filename, t ? tv : NULL, 0); +} + +COMPAT_SYSCALL_DEFINE4(utimensat, unsigned int, dfd, const char __user *, filename, struct compat_timespec __user *, t, int, flags) +{ + struct timespec tv[2]; + + if (t) { + if (compat_get_timespec(&tv[0], &t[0]) || + compat_get_timespec(&tv[1], &t[1])) + return -EFAULT; + + if (tv[0].tv_nsec == UTIME_OMIT && tv[1].tv_nsec == UTIME_OMIT) + return 0; + } + return do_utimes(dfd, filename, t ? tv : NULL, flags); +} + +COMPAT_SYSCALL_DEFINE3(futimesat, unsigned int, dfd, const char __user *, filename, struct compat_timeval __user *, t) +{ + struct timespec tv[2]; + + if (t) { + if (get_user(tv[0].tv_sec, &t[0].tv_sec) || + get_user(tv[0].tv_nsec, &t[0].tv_usec) || + get_user(tv[1].tv_sec, &t[1].tv_sec) || + get_user(tv[1].tv_nsec, &t[1].tv_usec)) + return -EFAULT; + if (tv[0].tv_nsec >= 1000000 || tv[0].tv_nsec < 0 || + tv[1].tv_nsec >= 1000000 || tv[1].tv_nsec < 0) + return -EINVAL; + tv[0].tv_nsec *= 1000; + tv[1].tv_nsec *= 1000; + } + return do_utimes(dfd, filename, t ? tv : NULL, 0); +} + +COMPAT_SYSCALL_DEFINE2(utimes, const char __user *, filename, struct compat_timeval __user *, t) +{ + return compat_sys_futimesat(AT_FDCWD, filename, t); +} +#endif diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 26ef1958b65b..5c90f82b8f6b 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -79,6 +79,7 @@ xfs-y += xfs_aops.o \ xfs_extent_busy.o \ xfs_file.o \ xfs_filestream.o \ + xfs_fsmap.o \ xfs_fsops.o \ xfs_globals.o \ xfs_icache.o \ diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c index 70a5b55e0870..780fc8986dab 100644 --- a/fs/xfs/kmem.c +++ b/fs/xfs/kmem.c @@ -48,7 +48,7 @@ kmem_alloc(size_t size, xfs_km_flags_t flags) void * kmem_zalloc_large(size_t size, xfs_km_flags_t flags) { - unsigned noio_flag = 0; + unsigned nofs_flag = 0; void *ptr; gfp_t lflags; @@ -60,17 +60,17 @@ kmem_zalloc_large(size_t size, xfs_km_flags_t flags) * __vmalloc() will allocate data pages and auxillary structures (e.g. * pagetables) with GFP_KERNEL, yet we may be under GFP_NOFS context * here. Hence we need to tell memory reclaim that we are in such a - * context via PF_MEMALLOC_NOIO to prevent memory reclaim re-entering + * context via PF_MEMALLOC_NOFS to prevent memory reclaim re-entering * the filesystem here and potentially deadlocking. */ - if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS)) - noio_flag = memalloc_noio_save(); + if (flags & KM_NOFS) + nofs_flag = memalloc_nofs_save(); lflags = kmem_flags_convert(flags); ptr = __vmalloc(size, lflags | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL); - if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS)) - memalloc_noio_restore(noio_flag); + if (flags & KM_NOFS) + memalloc_nofs_restore(nofs_flag); return ptr; } diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h index f0fc84fcaac2..d6ea520162b2 100644 --- a/fs/xfs/kmem.h +++ b/fs/xfs/kmem.h @@ -50,7 +50,7 @@ kmem_flags_convert(xfs_km_flags_t flags) lflags = GFP_ATOMIC | __GFP_NOWARN; } else { lflags = GFP_KERNEL | __GFP_NOWARN; - if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS)) + if (flags & KM_NOFS) lflags &= ~__GFP_FS; } diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 369adcc18c02..7486401ccbd3 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -2868,3 +2868,60 @@ err: xfs_trans_brelse(tp, agbp); return error; } + +struct xfs_alloc_query_range_info { + xfs_alloc_query_range_fn fn; + void *priv; +}; + +/* Format btree record and pass to our callback. */ +STATIC int +xfs_alloc_query_range_helper( + struct xfs_btree_cur *cur, + union xfs_btree_rec *rec, + void *priv) +{ + struct xfs_alloc_query_range_info *query = priv; + struct xfs_alloc_rec_incore irec; + + irec.ar_startblock = be32_to_cpu(rec->alloc.ar_startblock); + irec.ar_blockcount = be32_to_cpu(rec->alloc.ar_blockcount); + return query->fn(cur, &irec, query->priv); +} + +/* Find all free space within a given range of blocks. */ +int +xfs_alloc_query_range( + struct xfs_btree_cur *cur, + struct xfs_alloc_rec_incore *low_rec, + struct xfs_alloc_rec_incore *high_rec, + xfs_alloc_query_range_fn fn, + void *priv) +{ + union xfs_btree_irec low_brec; + union xfs_btree_irec high_brec; + struct xfs_alloc_query_range_info query; + + ASSERT(cur->bc_btnum == XFS_BTNUM_BNO); + low_brec.a = *low_rec; + high_brec.a = *high_rec; + query.priv = priv; + query.fn = fn; + return xfs_btree_query_range(cur, &low_brec, &high_brec, + xfs_alloc_query_range_helper, &query); +} + +/* Find all free space records. */ +int +xfs_alloc_query_all( + struct xfs_btree_cur *cur, + xfs_alloc_query_range_fn fn, + void *priv) +{ + struct xfs_alloc_query_range_info query; + + ASSERT(cur->bc_btnum == XFS_BTNUM_BNO); + query.priv = priv; + query.fn = fn; + return xfs_btree_query_all(cur, xfs_alloc_query_range_helper, &query); +} diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index 2a8d0fa6fbbe..77d9c27330ab 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -219,4 +219,16 @@ int xfs_free_extent_fix_freelist(struct xfs_trans *tp, xfs_agnumber_t agno, xfs_extlen_t xfs_prealloc_blocks(struct xfs_mount *mp); +typedef int (*xfs_alloc_query_range_fn)( + struct xfs_btree_cur *cur, + struct xfs_alloc_rec_incore *rec, + void *priv); + +int xfs_alloc_query_range(struct xfs_btree_cur *cur, + struct xfs_alloc_rec_incore *low_rec, + struct xfs_alloc_rec_incore *high_rec, + xfs_alloc_query_range_fn fn, void *priv); +int xfs_alloc_query_all(struct xfs_btree_cur *cur, xfs_alloc_query_range_fn fn, + void *priv); + #endif /* __XFS_ALLOC_H__ */ diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index efb467b10a71..e1fcfe7f0a9a 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -205,19 +205,37 @@ xfs_allocbt_init_key_from_rec( union xfs_btree_key *key, union xfs_btree_rec *rec) { - ASSERT(rec->alloc.ar_startblock != 0); - key->alloc.ar_startblock = rec->alloc.ar_startblock; key->alloc.ar_blockcount = rec->alloc.ar_blockcount; } STATIC void +xfs_bnobt_init_high_key_from_rec( + union xfs_btree_key *key, + union xfs_btree_rec *rec) +{ + __u32 x; + + x = be32_to_cpu(rec->alloc.ar_startblock); + x += be32_to_cpu(rec->alloc.ar_blockcount) - 1; + key->alloc.ar_startblock = cpu_to_be32(x); + key->alloc.ar_blockcount = 0; +} + +STATIC void +xfs_cntbt_init_high_key_from_rec( + union xfs_btree_key *key, + union xfs_btree_rec *rec) +{ + key->alloc.ar_blockcount = rec->alloc.ar_blockcount; + key->alloc.ar_startblock = 0; +} + +STATIC void xfs_allocbt_init_rec_from_cur( struct xfs_btree_cur *cur, union xfs_btree_rec *rec) { - ASSERT(cur->bc_rec.a.ar_startblock != 0); - rec->alloc.ar_startblock = cpu_to_be32(cur->bc_rec.a.ar_startblock); rec->alloc.ar_blockcount = cpu_to_be32(cur->bc_rec.a.ar_blockcount); } @@ -236,18 +254,24 @@ xfs_allocbt_init_ptr_from_cur( } STATIC __int64_t -xfs_allocbt_key_diff( +xfs_bnobt_key_diff( struct xfs_btree_cur *cur, union xfs_btree_key *key) { xfs_alloc_rec_incore_t *rec = &cur->bc_rec.a; xfs_alloc_key_t *kp = &key->alloc; - __int64_t diff; - if (cur->bc_btnum == XFS_BTNUM_BNO) { - return (__int64_t)be32_to_cpu(kp->ar_startblock) - - rec->ar_startblock; - } + return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock; +} + +STATIC __int64_t +xfs_cntbt_key_diff( + struct xfs_btree_cur *cur, + union xfs_btree_key *key) +{ + xfs_alloc_rec_incore_t *rec = &cur->bc_rec.a; + xfs_alloc_key_t *kp = &key->alloc; + __int64_t diff; diff = (__int64_t)be32_to_cpu(kp->ar_blockcount) - rec->ar_blockcount; if (diff) @@ -256,6 +280,33 @@ xfs_allocbt_key_diff( return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock; } +STATIC __int64_t +xfs_bnobt_diff_two_keys( + struct xfs_btree_cur *cur, + union xfs_btree_key *k1, + union xfs_btree_key *k2) +{ + return (__int64_t)be32_to_cpu(k1->alloc.ar_startblock) - + be32_to_cpu(k2->alloc.ar_startblock); +} + +STATIC __int64_t +xfs_cntbt_diff_two_keys( + struct xfs_btree_cur *cur, + union xfs_btree_key *k1, + union xfs_btree_key *k2) +{ + __int64_t diff; + + diff = be32_to_cpu(k1->alloc.ar_blockcount) - + be32_to_cpu(k2->alloc.ar_blockcount); + if (diff) + return diff; + + return be32_to_cpu(k1->alloc.ar_startblock) - + be32_to_cpu(k2->alloc.ar_startblock); +} + static bool xfs_allocbt_verify( struct xfs_buf *bp) @@ -346,44 +397,54 @@ const struct xfs_buf_ops xfs_allocbt_buf_ops = { #if defined(DEBUG) || defined(XFS_WARN) STATIC int -xfs_allocbt_keys_inorder( +xfs_bnobt_keys_inorder( struct xfs_btree_cur *cur, union xfs_btree_key *k1, union xfs_btree_key *k2) { - if (cur->bc_btnum == XFS_BTNUM_BNO) { - return be32_to_cpu(k1->alloc.ar_startblock) < - be32_to_cpu(k2->alloc.ar_startblock); - } else { - return be32_to_cpu(k1->alloc.ar_blockcount) < - be32_to_cpu(k2->alloc.ar_blockcount) || - (k1->alloc.ar_blockcount == k2->alloc.ar_blockcount && - be32_to_cpu(k1->alloc.ar_startblock) < - be32_to_cpu(k2->alloc.ar_startblock)); - } + return be32_to_cpu(k1->alloc.ar_startblock) < + be32_to_cpu(k2->alloc.ar_startblock); } STATIC int -xfs_allocbt_recs_inorder( +xfs_bnobt_recs_inorder( struct xfs_btree_cur *cur, union xfs_btree_rec *r1, union xfs_btree_rec *r2) { - if (cur->bc_btnum == XFS_BTNUM_BNO) { - return be32_to_cpu(r1->alloc.ar_startblock) + - be32_to_cpu(r1->alloc.ar_blockcount) <= - be32_to_cpu(r2->alloc.ar_startblock); - } else { - return be32_to_cpu(r1->alloc.ar_blockcount) < - be32_to_cpu(r2->alloc.ar_blockcount) || - (r1->alloc.ar_blockcount == r2->alloc.ar_blockcount && - be32_to_cpu(r1->alloc.ar_startblock) < - be32_to_cpu(r2->alloc.ar_startblock)); - } + return be32_to_cpu(r1->alloc.ar_startblock) + + be32_to_cpu(r1->alloc.ar_blockcount) <= + be32_to_cpu(r2->alloc.ar_startblock); +} + +STATIC int +xfs_cntbt_keys_inorder( + struct xfs_btree_cur *cur, + union xfs_btree_key *k1, + union xfs_btree_key *k2) +{ + return be32_to_cpu(k1->alloc.ar_blockcount) < + be32_to_cpu(k2->alloc.ar_blockcount) || + (k1->alloc.ar_blockcount == k2->alloc.ar_blockcount && + be32_to_cpu(k1->alloc.ar_startblock) < + be32_to_cpu(k2->alloc.ar_startblock)); } -#endif /* DEBUG */ -static const struct xfs_btree_ops xfs_allocbt_ops = { +STATIC int +xfs_cntbt_recs_inorder( + struct xfs_btree_cur *cur, + union xfs_btree_rec *r1, + union xfs_btree_rec *r2) +{ + return be32_to_cpu(r1->alloc.ar_blockcount) < + be32_to_cpu(r2->alloc.ar_blockcount) || + (r1->alloc.ar_blockcount == r2->alloc.ar_blockcount && + be32_to_cpu(r1->alloc.ar_startblock) < + be32_to_cpu(r2->alloc.ar_startblock)); +} +#endif /* DEBUG */ + +static const struct xfs_btree_ops xfs_bnobt_ops = { .rec_len = sizeof(xfs_alloc_rec_t), .key_len = sizeof(xfs_alloc_key_t), @@ -395,13 +456,39 @@ static const struct xfs_btree_ops xfs_allocbt_ops = { .get_minrecs = xfs_allocbt_get_minrecs, .get_maxrecs = xfs_allocbt_get_maxrecs, .init_key_from_rec = xfs_allocbt_init_key_from_rec, + .init_high_key_from_rec = xfs_bnobt_init_high_key_from_rec, .init_rec_from_cur = xfs_allocbt_init_rec_from_cur, .init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur, - .key_diff = xfs_allocbt_key_diff, + .key_diff = xfs_bnobt_key_diff, .buf_ops = &xfs_allocbt_buf_ops, + .diff_two_keys = xfs_bnobt_diff_two_keys, #if defined(DEBUG) || defined(XFS_WARN) - .keys_inorder = xfs_allocbt_keys_inorder, - .recs_inorder = xfs_allocbt_recs_inorder, + .keys_inorder = xfs_bnobt_keys_inorder, + .recs_inorder = xfs_bnobt_recs_inorder, +#endif +}; + +static const struct xfs_btree_ops xfs_cntbt_ops = { + .rec_len = sizeof(xfs_alloc_rec_t), + .key_len = sizeof(xfs_alloc_key_t), + + .dup_cursor = xfs_allocbt_dup_cursor, + .set_root = xfs_allocbt_set_root, + .alloc_block = xfs_allocbt_alloc_block, + .free_block = xfs_allocbt_free_block, + .update_lastrec = xfs_allocbt_update_lastrec, + .get_minrecs = xfs_allocbt_get_minrecs, + .get_maxrecs = xfs_allocbt_get_maxrecs, + .init_key_from_rec = xfs_allocbt_init_key_from_rec, + .init_high_key_from_rec = xfs_cntbt_init_high_key_from_rec, + .init_rec_from_cur = xfs_allocbt_init_rec_from_cur, + .init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur, + .key_diff = xfs_cntbt_key_diff, + .buf_ops = &xfs_allocbt_buf_ops, + .diff_two_keys = xfs_cntbt_diff_two_keys, +#if defined(DEBUG) || defined(XFS_WARN) + .keys_inorder = xfs_cntbt_keys_inorder, + .recs_inorder = xfs_cntbt_recs_inorder, #endif }; @@ -427,16 +514,15 @@ xfs_allocbt_init_cursor( cur->bc_mp = mp; cur->bc_btnum = btnum; cur->bc_blocklog = mp->m_sb.sb_blocklog; - cur->bc_ops = &xfs_allocbt_ops; - if (btnum == XFS_BTNUM_BNO) - cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtb_2); - else - cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtc_2); if (btnum == XFS_BTNUM_CNT) { + cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtc_2); + cur->bc_ops = &xfs_cntbt_ops; cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]); cur->bc_flags = XFS_BTREE_LASTREC_UPDATE; } else { + cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtb_2); + cur->bc_ops = &xfs_bnobt_ops; cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]); } diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 9bd104f32908..f02eb7673392 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -764,7 +764,6 @@ xfs_bmap_extents_to_btree( args.fsbno = XFS_INO_TO_FSB(mp, ip->i_ino); } else if (dfops->dop_low) { args.type = XFS_ALLOCTYPE_START_BNO; -try_another_ag: args.fsbno = *firstblock; } else { args.type = XFS_ALLOCTYPE_NEAR_BNO; @@ -779,20 +778,6 @@ try_another_ag: return error; } - /* - * During a CoW operation, the allocation and bmbt updates occur in - * different transactions. The mapping code tries to put new bmbt - * blocks near extents being mapped, but the only way to guarantee this - * is if the alloc and the mapping happen in a single transaction that - * has a block reservation. That isn't the case here, so if we run out - * of space we'll try again with another AG. - */ - if (xfs_sb_version_hasreflink(&cur->bc_mp->m_sb) && - args.fsbno == NULLFSBLOCK && - args.type == XFS_ALLOCTYPE_NEAR_BNO) { - args.type = XFS_ALLOCTYPE_FIRST_AG; - goto try_another_ag; - } if (WARN_ON_ONCE(args.fsbno == NULLFSBLOCK)) { xfs_iroot_realloc(ip, -1, whichfork); xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); @@ -925,7 +910,6 @@ xfs_bmap_local_to_extents( * file currently fits in an inode. */ if (*firstblock == NULLFSBLOCK) { -try_another_ag: args.fsbno = XFS_INO_TO_FSB(args.mp, ip->i_ino); args.type = XFS_ALLOCTYPE_START_BNO; } else { @@ -938,19 +922,6 @@ try_another_ag: if (error) goto done; - /* - * During a CoW operation, the allocation and bmbt updates occur in - * different transactions. The mapping code tries to put new bmbt - * blocks near extents being mapped, but the only way to guarantee this - * is if the alloc and the mapping happen in a single transaction that - * has a block reservation. That isn't the case here, so if we run out - * of space we'll try again with another AG. - */ - if (xfs_sb_version_hasreflink(&ip->i_mount->m_sb) && - args.fsbno == NULLFSBLOCK && - args.type == XFS_ALLOCTYPE_NEAR_BNO) { - goto try_another_ag; - } /* Can't fail, the space was reserved. */ ASSERT(args.fsbno != NULLFSBLOCK); ASSERT(args.len == 1); @@ -1260,7 +1231,6 @@ xfs_bmap_read_extents( xfs_fsblock_t bno; /* block # of "block" */ xfs_buf_t *bp; /* buffer for "block" */ int error; /* error return value */ - xfs_exntfmt_t exntf; /* XFS_EXTFMT_NOSTATE, if checking */ xfs_extnum_t i, j; /* index into the extents list */ xfs_ifork_t *ifp; /* fork structure */ int level; /* btree level, for checking */ @@ -1271,8 +1241,6 @@ xfs_bmap_read_extents( mp = ip->i_mount; ifp = XFS_IFORK_PTR(ip, whichfork); - exntf = (whichfork != XFS_DATA_FORK) ? XFS_EXTFMT_NOSTATE : - XFS_EXTFMT_INODE(ip); block = ifp->if_broot; /* * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out. @@ -1340,18 +1308,9 @@ xfs_bmap_read_extents( xfs_bmbt_rec_host_t *trp = xfs_iext_get_ext(ifp, i); trp->l0 = be64_to_cpu(frp->l0); trp->l1 = be64_to_cpu(frp->l1); - } - if (exntf == XFS_EXTFMT_NOSTATE) { - /* - * Check all attribute bmap btree records and - * any "older" data bmap btree records for a - * set bit in the "extent flag" position. - */ - if (unlikely(xfs_check_nostate_extents(ifp, - start, num_recs))) { + if (!xfs_bmbt_validate_extent(mp, whichfork, trp)) { XFS_ERROR_REPORT("xfs_bmap_read_extents(2)", - XFS_ERRLEVEL_LOW, - ip->i_mount); + XFS_ERRLEVEL_LOW, mp); goto error0; } } @@ -2879,27 +2838,30 @@ xfs_bmap_add_extent_hole_delay( */ STATIC int /* error */ xfs_bmap_add_extent_hole_real( - struct xfs_bmalloca *bma, - int whichfork) + struct xfs_trans *tp, + struct xfs_inode *ip, + int whichfork, + xfs_extnum_t *idx, + struct xfs_btree_cur **curp, + struct xfs_bmbt_irec *new, + xfs_fsblock_t *first, + struct xfs_defer_ops *dfops, + int *logflagsp) { - struct xfs_bmbt_irec *new = &bma->got; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_mount *mp = ip->i_mount; + struct xfs_btree_cur *cur = *curp; int error; /* error return value */ int i; /* temp state */ - xfs_ifork_t *ifp; /* inode fork pointer */ xfs_bmbt_irec_t left; /* left neighbor extent entry */ xfs_bmbt_irec_t right; /* right neighbor extent entry */ int rval=0; /* return value (logging flags) */ int state; /* state bits, accessed thru macros */ - struct xfs_mount *mp; - mp = bma->ip->i_mount; - ifp = XFS_IFORK_PTR(bma->ip, whichfork); - - ASSERT(bma->idx >= 0); - ASSERT(bma->idx <= xfs_iext_count(ifp)); + ASSERT(*idx >= 0); + ASSERT(*idx <= xfs_iext_count(ifp)); ASSERT(!isnullstartblock(new->br_startblock)); - ASSERT(!bma->cur || - !(bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL)); + ASSERT(!cur || !(cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL)); XFS_STATS_INC(mp, xs_add_exlist); @@ -2912,9 +2874,9 @@ xfs_bmap_add_extent_hole_real( /* * Check and set flags if this segment has a left neighbor. */ - if (bma->idx > 0) { + if (*idx > 0) { state |= BMAP_LEFT_VALID; - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx - 1), &left); + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &left); if (isnullstartblock(left.br_startblock)) state |= BMAP_LEFT_DELAY; } @@ -2923,9 +2885,9 @@ xfs_bmap_add_extent_hole_real( * Check and set flags if this segment has a current value. * Not true if we're inserting into the "hole" at eof. */ - if (bma->idx < xfs_iext_count(ifp)) { + if (*idx < xfs_iext_count(ifp)) { state |= BMAP_RIGHT_VALID; - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx), &right); + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &right); if (isnullstartblock(right.br_startblock)) state |= BMAP_RIGHT_DELAY; } @@ -2962,36 +2924,36 @@ xfs_bmap_add_extent_hole_real( * left and on the right. * Merge all three into a single extent record. */ - --bma->idx; - trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx), + --*idx; + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), left.br_blockcount + new->br_blockcount + right.br_blockcount); - trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); - xfs_iext_remove(bma->ip, bma->idx + 1, 1, state); + xfs_iext_remove(ip, *idx + 1, 1, state); - XFS_IFORK_NEXT_SET(bma->ip, whichfork, - XFS_IFORK_NEXTENTS(bma->ip, whichfork) - 1); - if (bma->cur == NULL) { + XFS_IFORK_NEXT_SET(ip, whichfork, + XFS_IFORK_NEXTENTS(ip, whichfork) - 1); + if (cur == NULL) { rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); } else { rval = XFS_ILOG_CORE; - error = xfs_bmbt_lookup_eq(bma->cur, right.br_startoff, + error = xfs_bmbt_lookup_eq(cur, right.br_startoff, right.br_startblock, right.br_blockcount, &i); if (error) goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); - error = xfs_btree_delete(bma->cur, &i); + error = xfs_btree_delete(cur, &i); if (error) goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); - error = xfs_btree_decrement(bma->cur, 0, &i); + error = xfs_btree_decrement(cur, 0, &i); if (error) goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); - error = xfs_bmbt_update(bma->cur, left.br_startoff, + error = xfs_bmbt_update(cur, left.br_startoff, left.br_startblock, left.br_blockcount + new->br_blockcount + @@ -3008,23 +2970,23 @@ xfs_bmap_add_extent_hole_real( * on the left. * Merge the new allocation with the left neighbor. */ - --bma->idx; - trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx), + --*idx; + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), left.br_blockcount + new->br_blockcount); - trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); - if (bma->cur == NULL) { + if (cur == NULL) { rval = xfs_ilog_fext(whichfork); } else { rval = 0; - error = xfs_bmbt_lookup_eq(bma->cur, left.br_startoff, + error = xfs_bmbt_lookup_eq(cur, left.br_startoff, left.br_startblock, left.br_blockcount, &i); if (error) goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); - error = xfs_bmbt_update(bma->cur, left.br_startoff, + error = xfs_bmbt_update(cur, left.br_startoff, left.br_startblock, left.br_blockcount + new->br_blockcount, @@ -3040,25 +3002,25 @@ xfs_bmap_add_extent_hole_real( * on the right. * Merge the new allocation with the right neighbor. */ - trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); - xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, bma->idx), + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx), new->br_startoff, new->br_startblock, new->br_blockcount + right.br_blockcount, right.br_state); - trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); - if (bma->cur == NULL) { + if (cur == NULL) { rval = xfs_ilog_fext(whichfork); } else { rval = 0; - error = xfs_bmbt_lookup_eq(bma->cur, + error = xfs_bmbt_lookup_eq(cur, right.br_startoff, right.br_startblock, right.br_blockcount, &i); if (error) goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); - error = xfs_bmbt_update(bma->cur, new->br_startoff, + error = xfs_bmbt_update(cur, new->br_startoff, new->br_startblock, new->br_blockcount + right.br_blockcount, @@ -3074,22 +3036,22 @@ xfs_bmap_add_extent_hole_real( * real allocation. * Insert a new entry. */ - xfs_iext_insert(bma->ip, bma->idx, 1, new, state); - XFS_IFORK_NEXT_SET(bma->ip, whichfork, - XFS_IFORK_NEXTENTS(bma->ip, whichfork) + 1); - if (bma->cur == NULL) { + xfs_iext_insert(ip, *idx, 1, new, state); + XFS_IFORK_NEXT_SET(ip, whichfork, + XFS_IFORK_NEXTENTS(ip, whichfork) + 1); + if (cur == NULL) { rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); } else { rval = XFS_ILOG_CORE; - error = xfs_bmbt_lookup_eq(bma->cur, + error = xfs_bmbt_lookup_eq(cur, new->br_startoff, new->br_startblock, new->br_blockcount, &i); if (error) goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done); - bma->cur->bc_rec.b.br_state = new->br_state; - error = xfs_btree_insert(bma->cur, &i); + cur->bc_rec.b.br_state = new->br_state; + error = xfs_btree_insert(cur, &i); if (error) goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); @@ -3098,30 +3060,30 @@ xfs_bmap_add_extent_hole_real( } /* add reverse mapping */ - error = xfs_rmap_map_extent(mp, bma->dfops, bma->ip, whichfork, new); + error = xfs_rmap_map_extent(mp, dfops, ip, whichfork, new); if (error) goto done; /* convert to a btree if necessary */ - if (xfs_bmap_needs_btree(bma->ip, whichfork)) { + if (xfs_bmap_needs_btree(ip, whichfork)) { int tmp_logflags; /* partial log flag return val */ - ASSERT(bma->cur == NULL); - error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, - bma->firstblock, bma->dfops, &bma->cur, + ASSERT(cur == NULL); + error = xfs_bmap_extents_to_btree(tp, ip, first, dfops, curp, 0, &tmp_logflags, whichfork); - bma->logflags |= tmp_logflags; + *logflagsp |= tmp_logflags; + cur = *curp; if (error) goto done; } /* clear out the allocated field, done with it now in any case. */ - if (bma->cur) - bma->cur->bc_private.b.allocated = 0; + if (cur) + cur->bc_private.b.allocated = 0; - xfs_bmap_check_leaf_extents(bma->cur, bma->ip, whichfork); + xfs_bmap_check_leaf_extents(cur, ip, whichfork); done: - bma->logflags |= rval; + *logflagsp |= rval; return error; } @@ -3853,60 +3815,6 @@ xfs_bmap_btalloc( } /* - * For a remap operation, just "allocate" an extent at the address that the - * caller passed in, and ensure that the AGFL is the right size. The caller - * will then map the "allocated" extent into the file somewhere. - */ -STATIC int -xfs_bmap_remap_alloc( - struct xfs_bmalloca *ap) -{ - struct xfs_trans *tp = ap->tp; - struct xfs_mount *mp = tp->t_mountp; - xfs_agblock_t bno; - struct xfs_alloc_arg args; - int error; - - /* - * validate that the block number is legal - the enables us to detect - * and handle a silent filesystem corruption rather than crashing. - */ - memset(&args, 0, sizeof(struct xfs_alloc_arg)); - args.tp = ap->tp; - args.mp = ap->tp->t_mountp; - bno = *ap->firstblock; - args.agno = XFS_FSB_TO_AGNO(mp, bno); - args.agbno = XFS_FSB_TO_AGBNO(mp, bno); - if (args.agno >= mp->m_sb.sb_agcount || - args.agbno >= mp->m_sb.sb_agblocks) - return -EFSCORRUPTED; - - /* "Allocate" the extent from the range we passed in. */ - trace_xfs_bmap_remap_alloc(ap->ip, *ap->firstblock, ap->length); - ap->blkno = bno; - ap->ip->i_d.di_nblocks += ap->length; - xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE); - - /* Fix the freelist, like a real allocator does. */ - args.datatype = ap->datatype; - args.pag = xfs_perag_get(args.mp, args.agno); - ASSERT(args.pag); - - /* - * The freelist fixing code will decline the allocation if - * the size and shape of the free space doesn't allow for - * allocating the extent and updating all the metadata that - * happens during an allocation. We're remapping, not - * allocating, so skip that check by pretending to be freeing. - */ - error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING); - xfs_perag_put(args.pag); - if (error) - trace_xfs_bmap_remap_alloc_error(ap->ip, error, _RET_IP_); - return error; -} - -/* * xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file. * It figures out where to ask the underlying allocator to put the new extent. */ @@ -3914,8 +3822,6 @@ STATIC int xfs_bmap_alloc( struct xfs_bmalloca *ap) /* bmap alloc argument struct */ { - if (ap->flags & XFS_BMAPI_REMAP) - return xfs_bmap_remap_alloc(ap); if (XFS_IS_REALTIME_INODE(ap->ip) && xfs_alloc_is_userdata(ap->datatype)) return xfs_bmap_rtalloc(ap); @@ -4386,7 +4292,9 @@ xfs_bmapi_allocate( if (bma->wasdel) error = xfs_bmap_add_extent_delay_real(bma, whichfork); else - error = xfs_bmap_add_extent_hole_real(bma, whichfork); + error = xfs_bmap_add_extent_hole_real(bma->tp, bma->ip, + whichfork, &bma->idx, &bma->cur, &bma->got, + bma->firstblock, bma->dfops, &bma->logflags); bma->logflags |= tmp_logflags; if (error) @@ -4549,9 +4457,7 @@ xfs_bmapi_write( ASSERT(len > 0); ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - ASSERT(!(flags & XFS_BMAPI_REMAP) || whichfork == XFS_DATA_FORK); - ASSERT(!(flags & XFS_BMAPI_PREALLOC) || !(flags & XFS_BMAPI_REMAP)); - ASSERT(!(flags & XFS_BMAPI_CONVERT) || !(flags & XFS_BMAPI_REMAP)); + ASSERT(!(flags & XFS_BMAPI_REMAP)); /* zeroing is for currently only for data extents, not metadata */ ASSERT((flags & (XFS_BMAPI_METADATA | XFS_BMAPI_ZERO)) != @@ -4635,13 +4541,8 @@ xfs_bmapi_write( } else { need_alloc = true; } - } else { - /* - * Make sure we only reflink into a hole. - */ - ASSERT(!(flags & XFS_BMAPI_REMAP)); - if (isnullstartblock(bma.got.br_startblock)) - wasdelay = true; + } else if (isnullstartblock(bma.got.br_startblock)) { + wasdelay = true; } /* @@ -4770,6 +4671,93 @@ error0: return error; } +static int +xfs_bmapi_remap( + struct xfs_trans *tp, + struct xfs_inode *ip, + xfs_fileoff_t bno, + xfs_filblks_t len, + xfs_fsblock_t startblock, + struct xfs_defer_ops *dfops) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + struct xfs_btree_cur *cur = NULL; + xfs_fsblock_t firstblock = NULLFSBLOCK; + struct xfs_bmbt_irec got; + xfs_extnum_t idx; + int logflags = 0, error; + + ASSERT(len > 0); + ASSERT(len <= (xfs_filblks_t)MAXEXTLEN); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + + if (unlikely(XFS_TEST_ERROR( + (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_BTREE), + mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) { + XFS_ERROR_REPORT("xfs_bmapi_remap", XFS_ERRLEVEL_LOW, mp); + return -EFSCORRUPTED; + } + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + if (!(ifp->if_flags & XFS_IFEXTENTS)) { + error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK); + if (error) + return error; + } + + if (xfs_iext_lookup_extent(ip, ifp, bno, &idx, &got)) { + /* make sure we only reflink into a hole. */ + ASSERT(got.br_startoff > bno); + ASSERT(got.br_startoff - bno >= len); + } + + ip->i_d.di_nblocks += len; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + if (ifp->if_flags & XFS_IFBROOT) { + cur = xfs_bmbt_init_cursor(mp, tp, ip, XFS_DATA_FORK); + cur->bc_private.b.firstblock = firstblock; + cur->bc_private.b.dfops = dfops; + cur->bc_private.b.flags = 0; + } + + got.br_startoff = bno; + got.br_startblock = startblock; + got.br_blockcount = len; + got.br_state = XFS_EXT_NORM; + + error = xfs_bmap_add_extent_hole_real(tp, ip, XFS_DATA_FORK, &idx, &cur, + &got, &firstblock, dfops, &logflags); + if (error) + goto error0; + + if (xfs_bmap_wants_extents(ip, XFS_DATA_FORK)) { + int tmp_logflags = 0; + + error = xfs_bmap_btree_to_extents(tp, ip, cur, + &tmp_logflags, XFS_DATA_FORK); + logflags |= tmp_logflags; + } + +error0: + if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) + logflags &= ~XFS_ILOG_DEXT; + else if (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) + logflags &= ~XFS_ILOG_DBROOT; + + if (logflags) + xfs_trans_log_inode(tp, ip, logflags); + if (cur) { + xfs_btree_del_cursor(cur, + error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); + } + return error; +} + /* * When a delalloc extent is split (e.g., due to a hole punch), the original * indlen reservation must be shared across the two new extents that are left @@ -4887,7 +4875,7 @@ xfs_bmap_del_extent_delay( ASSERT(got_endoff >= del_endoff); if (isrt) { - int64_t rtexts = XFS_FSB_TO_B(mp, del->br_blockcount); + uint64_t rtexts = XFS_FSB_TO_B(mp, del->br_blockcount); do_div(rtexts, mp->m_sb.sb_rextsize); xfs_mod_frextents(mp, rtexts); @@ -6488,27 +6476,15 @@ xfs_bmap_finish_one( xfs_filblks_t blockcount, xfs_exntst_t state) { - struct xfs_bmbt_irec bmap; - int nimaps = 1; - xfs_fsblock_t firstfsb; - int flags = XFS_BMAPI_REMAP; - int done; - int error = 0; - - bmap.br_startblock = startblock; - bmap.br_startoff = startoff; - bmap.br_blockcount = blockcount; - bmap.br_state = state; + int error = 0, done; trace_xfs_bmap_deferred(tp->t_mountp, XFS_FSB_TO_AGNO(tp->t_mountp, startblock), type, XFS_FSB_TO_AGBNO(tp->t_mountp, startblock), ip->i_ino, whichfork, startoff, blockcount, state); - if (whichfork != XFS_DATA_FORK && whichfork != XFS_ATTR_FORK) + if (WARN_ON_ONCE(whichfork != XFS_DATA_FORK)) return -EFSCORRUPTED; - if (whichfork == XFS_ATTR_FORK) - flags |= XFS_BMAPI_ATTRFORK; if (XFS_TEST_ERROR(false, tp->t_mountp, XFS_ERRTAG_BMAP_FINISH_ONE, @@ -6517,16 +6493,12 @@ xfs_bmap_finish_one( switch (type) { case XFS_BMAP_MAP: - firstfsb = bmap.br_startblock; - error = xfs_bmapi_write(tp, ip, bmap.br_startoff, - bmap.br_blockcount, flags, &firstfsb, - bmap.br_blockcount, &bmap, &nimaps, - dfops); + error = xfs_bmapi_remap(tp, ip, startoff, blockcount, + startblock, dfops); break; case XFS_BMAP_UNMAP: - error = xfs_bunmapi(tp, ip, bmap.br_startoff, - bmap.br_blockcount, flags, 1, &firstfsb, - dfops, &done); + error = xfs_bunmapi(tp, ip, startoff, blockcount, + XFS_BMAPI_REMAP, 1, &startblock, dfops, &done); ASSERT(done); break; default: diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index cdef87db5262..c35a14fa1527 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -172,6 +172,18 @@ static inline int xfs_bmapi_whichfork(int bmapi_flags) /* + * Return true if the extent is a real, allocated extent, or false if it is a + * delayed allocation, and unwritten extent or a hole. + */ +static inline bool xfs_bmap_is_real_extent(struct xfs_bmbt_irec *irec) +{ + return irec->br_state != XFS_EXT_UNWRITTEN && + irec->br_startblock != HOLESTARTBLOCK && + irec->br_startblock != DELAYSTARTBLOCK && + !isnullstartblock(irec->br_startblock); +} + +/* * This macro is used to determine how many extents will be shifted * in one write transaction. We could require two splits, * an extent move on the first and an extent merge on the second, @@ -232,8 +244,6 @@ int xfs_bmap_del_extent_delay(struct xfs_inode *ip, int whichfork, struct xfs_bmbt_irec *del); void xfs_bmap_del_extent_cow(struct xfs_inode *ip, xfs_extnum_t *idx, struct xfs_bmbt_irec *got, struct xfs_bmbt_irec *del); -int xfs_check_nostate_extents(struct xfs_ifork *ifp, xfs_extnum_t idx, - xfs_extnum_t num); uint xfs_default_attroffset(struct xfs_inode *ip); int xfs_bmap_shift_extents(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t *next_fsb, xfs_fileoff_t offset_shift_fsb, diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index fd55db479385..6cba69aff077 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -366,32 +366,6 @@ xfs_bmbt_to_bmdr( memcpy(tpp, fpp, sizeof(*fpp) * dmxr); } -/* - * Check extent records, which have just been read, for - * any bit in the extent flag field. ASSERT on debug - * kernels, as this condition should not occur. - * Return an error condition (1) if any flags found, - * otherwise return 0. - */ - -int -xfs_check_nostate_extents( - xfs_ifork_t *ifp, - xfs_extnum_t idx, - xfs_extnum_t num) -{ - for (; num > 0; num--, idx++) { - xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, idx); - if ((ep->l0 >> - (64 - BMBT_EXNTFLAG_BITLEN)) != 0) { - ASSERT(0); - return 1; - } - } - return 0; -} - - STATIC struct xfs_btree_cur * xfs_bmbt_dup_cursor( struct xfs_btree_cur *cur) @@ -448,7 +422,6 @@ xfs_bmbt_alloc_block( if (args.fsbno == NULLFSBLOCK) { args.fsbno = be64_to_cpu(start->l); args.type = XFS_ALLOCTYPE_START_BNO; -try_another_ag: /* * Make sure there is sufficient room left in the AG to * complete a full tree split for an extent insert. If @@ -477,22 +450,6 @@ try_another_ag: if (error) goto error0; - /* - * During a CoW operation, the allocation and bmbt updates occur in - * different transactions. The mapping code tries to put new bmbt - * blocks near extents being mapped, but the only way to guarantee this - * is if the alloc and the mapping happen in a single transaction that - * has a block reservation. That isn't the case here, so if we run out - * of space we'll try again with another AG. - */ - if (xfs_sb_version_hasreflink(&cur->bc_mp->m_sb) && - args.fsbno == NULLFSBLOCK && - args.type == XFS_ALLOCTYPE_NEAR_BNO) { - args.fsbno = cur->bc_private.b.firstblock; - args.type = XFS_ALLOCTYPE_FIRST_AG; - goto try_another_ag; - } - if (args.fsbno == NULLFSBLOCK && args.minleft) { /* * Could not find an AG with enough free space to satisfy diff --git a/fs/xfs/libxfs/xfs_bmap_btree.h b/fs/xfs/libxfs/xfs_bmap_btree.h index 819a8a4dee95..9da5a8d4f184 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.h +++ b/fs/xfs/libxfs/xfs_bmap_btree.h @@ -25,14 +25,6 @@ struct xfs_inode; struct xfs_trans; /* - * Extent state and extent format macros. - */ -#define XFS_EXTFMT_INODE(x) \ - (xfs_sb_version_hasextflgbit(&((x)->i_mount->m_sb)) ? \ - XFS_EXTFMT_HASSTATE : XFS_EXTFMT_NOSTATE) -#define ISUNWRITTEN(x) ((x)->br_state == XFS_EXT_UNWRITTEN) - -/* * Btree block header size depends on a superblock flag. */ #define XFS_BMBT_BLOCK_LEN(mp) \ @@ -140,4 +132,18 @@ extern int xfs_bmbt_change_owner(struct xfs_trans *tp, struct xfs_inode *ip, extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *, struct xfs_trans *, struct xfs_inode *, int); +/* + * Check that the extent does not contain an invalid unwritten extent flag. + */ +static inline bool xfs_bmbt_validate_extent(struct xfs_mount *mp, int whichfork, + struct xfs_bmbt_rec_host *ep) +{ + if (ep->l0 >> (64 - BMBT_EXNTFLAG_BITLEN) == 0) + return true; + if (whichfork == XFS_DATA_FORK && + xfs_sb_version_hasextflgbit(&mp->m_sb)) + return true; + return false; +} + #endif /* __XFS_BMAP_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index c3decedc9455..5392674bf893 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -2886,7 +2886,7 @@ xfs_btree_split_worker( struct xfs_btree_split_args *args = container_of(work, struct xfs_btree_split_args, work); unsigned long pflags; - unsigned long new_pflags = PF_FSTRANS; + unsigned long new_pflags = PF_MEMALLOC_NOFS; /* * we are in a transaction context here, but may also be doing work @@ -4842,6 +4842,21 @@ xfs_btree_query_range( fn, priv); } +/* Query a btree for all records. */ +int +xfs_btree_query_all( + struct xfs_btree_cur *cur, + xfs_btree_query_range_fn fn, + void *priv) +{ + union xfs_btree_irec low_rec; + union xfs_btree_irec high_rec; + + memset(&low_rec, 0, sizeof(low_rec)); + memset(&high_rec, 0xFF, sizeof(high_rec)); + return xfs_btree_query_range(cur, &low_rec, &high_rec, fn, priv); +} + /* * Calculate the number of blocks needed to store a given number of records * in a short-format (per-AG metadata) btree. diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 4bb62580a7fd..27bed08261c5 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -496,6 +496,8 @@ typedef int (*xfs_btree_query_range_fn)(struct xfs_btree_cur *cur, int xfs_btree_query_range(struct xfs_btree_cur *cur, union xfs_btree_irec *low_rec, union xfs_btree_irec *high_rec, xfs_btree_query_range_fn fn, void *priv); +int xfs_btree_query_all(struct xfs_btree_cur *cur, xfs_btree_query_range_fn fn, + void *priv); typedef int (*xfs_btree_visit_blocks_fn)(struct xfs_btree_cur *cur, int level, void *data); diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c index ac9a003dd29a..747085b4ef44 100644 --- a/fs/xfs/libxfs/xfs_dquot_buf.c +++ b/fs/xfs/libxfs/xfs_dquot_buf.c @@ -35,13 +35,8 @@ int xfs_calc_dquots_per_chunk( unsigned int nbblks) /* basic block units */ { - unsigned int ndquots; - ASSERT(nbblks > 0); - ndquots = BBTOB(nbblks); - do_div(ndquots, sizeof(xfs_dqblk_t)); - - return ndquots; + return BBTOB(nbblks) / sizeof(xfs_dqblk_t); } /* diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 6b7579e7b60a..a1dccd8d96bc 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -930,10 +930,8 @@ static inline uint xfs_dinode_size(int version) /* * The 32 bit link count in the inode theoretically maxes out at UINT_MAX. * Since the pathconf interface is signed, we use 2^31 - 1 instead. - * The old inode format had a 16 bit link count, so its maximum is USHRT_MAX. */ #define XFS_MAXLINK ((1U << 31) - 1U) -#define XFS_MAXLINK_1 65535U /* * Values for di_format @@ -1578,19 +1576,10 @@ static inline xfs_filblks_t startblockval(xfs_fsblock_t x) } /* - * Possible extent formats. - */ -typedef enum { - XFS_EXTFMT_NOSTATE = 0, - XFS_EXTFMT_HASSTATE -} xfs_exntfmt_t; - -/* * Possible extent states. */ typedef enum { XFS_EXT_NORM, XFS_EXT_UNWRITTEN, - XFS_EXT_DMAPI_OFFLINE, XFS_EXT_INVALID } xfs_exntst_t; /* diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index b72dc821d78b..095bdf049a3f 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -92,6 +92,18 @@ struct getbmapx { #define BMV_OF_LAST 0x4 /* segment is the last in the file */ #define BMV_OF_SHARED 0x8 /* segment shared with another file */ +/* fmr_owner special values for FS_IOC_GETFSMAP */ +#define XFS_FMR_OWN_FREE FMR_OWN_FREE /* free space */ +#define XFS_FMR_OWN_UNKNOWN FMR_OWN_UNKNOWN /* unknown owner */ +#define XFS_FMR_OWN_FS FMR_OWNER('X', 1) /* static fs metadata */ +#define XFS_FMR_OWN_LOG FMR_OWNER('X', 2) /* journalling log */ +#define XFS_FMR_OWN_AG FMR_OWNER('X', 3) /* per-AG metadata */ +#define XFS_FMR_OWN_INOBT FMR_OWNER('X', 4) /* inode btree blocks */ +#define XFS_FMR_OWN_INODES FMR_OWNER('X', 5) /* inodes */ +#define XFS_FMR_OWN_REFC FMR_OWNER('X', 6) /* refcount tree */ +#define XFS_FMR_OWN_COW FMR_OWNER('X', 7) /* cow staging */ +#define XFS_FMR_OWN_DEFECTIVE FMR_OWNER('X', 8) /* bad blocks */ + /* * Structure for XFS_IOC_FSSETDM. * For use by backup and restore programs to set the XFS on-disk inode @@ -502,6 +514,7 @@ typedef struct xfs_swapext #define XFS_IOC_GETBMAPX _IOWR('X', 56, struct getbmap) #define XFS_IOC_ZERO_RANGE _IOW ('X', 57, struct xfs_flock64) #define XFS_IOC_FREE_EOFBLOCKS _IOR ('X', 58, struct xfs_fs_eofblocks) +/* XFS_IOC_GETFSMAP ------ hoisted 59 */ /* * ioctl commands that replace IRIX syssgi()'s diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index d93f9d918cfc..09c3d1aecef2 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -508,7 +508,7 @@ xfs_iread( /* even unallocated inodes are verified */ if (!xfs_dinode_verify(mp, ip->i_ino, dip)) { - xfs_alert(mp, "%s: validation failed for inode %lld failed", + xfs_alert(mp, "%s: validation failed for inode %lld", __func__, ip->i_ino); XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, dip); diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 8a37efe04de3..0e80f34fe97c 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -42,35 +42,6 @@ STATIC int xfs_iformat_local(xfs_inode_t *, xfs_dinode_t *, int, int); STATIC int xfs_iformat_extents(xfs_inode_t *, xfs_dinode_t *, int); STATIC int xfs_iformat_btree(xfs_inode_t *, xfs_dinode_t *, int); -#ifdef DEBUG -/* - * Make sure that the extents in the given memory buffer - * are valid. - */ -void -xfs_validate_extents( - xfs_ifork_t *ifp, - int nrecs, - xfs_exntfmt_t fmt) -{ - xfs_bmbt_irec_t irec; - xfs_bmbt_rec_host_t rec; - int i; - - for (i = 0; i < nrecs; i++) { - xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i); - rec.l0 = get_unaligned(&ep->l0); - rec.l1 = get_unaligned(&ep->l1); - xfs_bmbt_get_all(&rec, &irec); - if (fmt == XFS_EXTFMT_NOSTATE) - ASSERT(irec.br_state == XFS_EXT_NORM); - } -} -#else /* DEBUG */ -#define xfs_validate_extents(ifp, nrecs, fmt) -#endif /* DEBUG */ - - /* * Move inode type and inode format specific information from the * on-disk inode to the in-core inode. For fifos, devs, and sockets @@ -352,40 +323,33 @@ xfs_iformat_local( } /* - * The file consists of a set of extents all - * of which fit into the on-disk inode. - * If there are few enough extents to fit into - * the if_inline_ext, then copy them there. - * Otherwise allocate a buffer for them and copy - * them into it. Either way, set if_extents - * to point at the extents. + * The file consists of a set of extents all of which fit into the on-disk + * inode. If there are few enough extents to fit into the if_inline_ext, then + * copy them there. Otherwise allocate a buffer for them and copy them into it. + * Either way, set if_extents to point at the extents. */ STATIC int xfs_iformat_extents( - xfs_inode_t *ip, - xfs_dinode_t *dip, - int whichfork) + struct xfs_inode *ip, + struct xfs_dinode *dip, + int whichfork) { - xfs_bmbt_rec_t *dp; - xfs_ifork_t *ifp; - int nex; - int size; - int i; - - ifp = XFS_IFORK_PTR(ip, whichfork); - nex = XFS_DFORK_NEXTENTS(dip, whichfork); - size = nex * (uint)sizeof(xfs_bmbt_rec_t); + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + int nex = XFS_DFORK_NEXTENTS(dip, whichfork); + int size = nex * sizeof(xfs_bmbt_rec_t); + struct xfs_bmbt_rec *dp; + int i; /* - * If the number of extents is unreasonable, then something - * is wrong and we just bail out rather than crash in - * kmem_alloc() or memcpy() below. + * If the number of extents is unreasonable, then something is wrong and + * we just bail out rather than crash in kmem_alloc() or memcpy() below. */ - if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) { + if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, mp, whichfork))) { xfs_warn(ip->i_mount, "corrupt inode %Lu ((a)extents = %d).", (unsigned long long) ip->i_ino, nex); XFS_CORRUPTION_ERROR("xfs_iformat_extents(1)", XFS_ERRLEVEL_LOW, - ip->i_mount, dip); + mp, dip); return -EFSCORRUPTED; } @@ -400,22 +364,17 @@ xfs_iformat_extents( ifp->if_bytes = size; if (size) { dp = (xfs_bmbt_rec_t *) XFS_DFORK_PTR(dip, whichfork); - xfs_validate_extents(ifp, nex, XFS_EXTFMT_INODE(ip)); for (i = 0; i < nex; i++, dp++) { xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i); ep->l0 = get_unaligned_be64(&dp->l0); ep->l1 = get_unaligned_be64(&dp->l1); + if (!xfs_bmbt_validate_extent(mp, whichfork, ep)) { + XFS_ERROR_REPORT("xfs_iformat_extents(2)", + XFS_ERRLEVEL_LOW, mp); + return -EFSCORRUPTED; + } } XFS_BMAP_TRACE_EXLIST(ip, nex, whichfork); - if (whichfork != XFS_DATA_FORK || - XFS_EXTFMT_INODE(ip) == XFS_EXTFMT_NOSTATE) - if (unlikely(xfs_check_nostate_extents( - ifp, 0, nex))) { - XFS_ERROR_REPORT("xfs_iformat_extents(2)", - XFS_ERRLEVEL_LOW, - ip->i_mount); - return -EFSCORRUPTED; - } } ifp->if_flags |= XFS_IFEXTENTS; return 0; @@ -518,7 +477,6 @@ xfs_iread_extents( xfs_iext_destroy(ifp); return error; } - xfs_validate_extents(ifp, nextents, XFS_EXTFMT_INODE(ip)); ifp->if_flags |= XFS_IFEXTENTS; return 0; } @@ -837,6 +795,9 @@ xfs_iextents_copy( copied = 0; for (i = 0; i < nrecs; i++) { xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i); + + ASSERT(xfs_bmbt_validate_extent(ip->i_mount, whichfork, ep)); + start_block = xfs_bmbt_get_startblock(ep); if (isnullstartblock(start_block)) { /* @@ -852,7 +813,6 @@ xfs_iextents_copy( copied++; } ASSERT(copied != 0); - xfs_validate_extents(ifp, copied, XFS_EXTFMT_INODE(ip)); return (copied * (uint)sizeof(xfs_bmbt_rec_t)); } diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index 3a8cc7139912..06cfb93c2ef9 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -2001,14 +2001,14 @@ xfs_rmap_query_range_helper( /* Find all rmaps between two keys. */ int xfs_rmap_query_range( - struct xfs_btree_cur *cur, - struct xfs_rmap_irec *low_rec, - struct xfs_rmap_irec *high_rec, - xfs_rmap_query_range_fn fn, - void *priv) + struct xfs_btree_cur *cur, + struct xfs_rmap_irec *low_rec, + struct xfs_rmap_irec *high_rec, + xfs_rmap_query_range_fn fn, + void *priv) { - union xfs_btree_irec low_brec; - union xfs_btree_irec high_brec; + union xfs_btree_irec low_brec; + union xfs_btree_irec high_brec; struct xfs_rmap_query_range_info query; low_brec.r = *low_rec; @@ -2019,6 +2019,20 @@ xfs_rmap_query_range( xfs_rmap_query_range_helper, &query); } +/* Find all rmaps. */ +int +xfs_rmap_query_all( + struct xfs_btree_cur *cur, + xfs_rmap_query_range_fn fn, + void *priv) +{ + struct xfs_rmap_query_range_info query; + + query.priv = priv; + query.fn = fn; + return xfs_btree_query_all(cur, xfs_rmap_query_range_helper, &query); +} + /* Clean up after calling xfs_rmap_finish_one. */ void xfs_rmap_finish_one_cleanup( @@ -2291,3 +2305,31 @@ xfs_rmap_free_extent( return __xfs_rmap_add(mp, dfops, XFS_RMAP_FREE, owner, XFS_DATA_FORK, &bmap); } + +/* Compare rmap records. Returns -1 if a < b, 1 if a > b, and 0 if equal. */ +int +xfs_rmap_compare( + const struct xfs_rmap_irec *a, + const struct xfs_rmap_irec *b) +{ + __u64 oa; + __u64 ob; + + oa = xfs_rmap_irec_offset_pack(a); + ob = xfs_rmap_irec_offset_pack(b); + + if (a->rm_startblock < b->rm_startblock) + return -1; + else if (a->rm_startblock > b->rm_startblock) + return 1; + else if (a->rm_owner < b->rm_owner) + return -1; + else if (a->rm_owner > b->rm_owner) + return 1; + else if (oa < ob) + return -1; + else if (oa > ob) + return 1; + else + return 0; +} diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h index 789930599339..98f908fea103 100644 --- a/fs/xfs/libxfs/xfs_rmap.h +++ b/fs/xfs/libxfs/xfs_rmap.h @@ -162,6 +162,8 @@ typedef int (*xfs_rmap_query_range_fn)( int xfs_rmap_query_range(struct xfs_btree_cur *cur, struct xfs_rmap_irec *low_rec, struct xfs_rmap_irec *high_rec, xfs_rmap_query_range_fn fn, void *priv); +int xfs_rmap_query_all(struct xfs_btree_cur *cur, xfs_rmap_query_range_fn fn, + void *priv); enum xfs_rmap_intent_type { XFS_RMAP_MAP, @@ -212,5 +214,7 @@ int xfs_rmap_find_left_neighbor(struct xfs_btree_cur *cur, xfs_agblock_t bno, int xfs_rmap_lookup_le_range(struct xfs_btree_cur *cur, xfs_agblock_t bno, uint64_t owner, uint64_t offset, unsigned int flags, struct xfs_rmap_irec *irec, int *stat); +int xfs_rmap_compare(const struct xfs_rmap_irec *a, + const struct xfs_rmap_irec *b); #endif /* __XFS_RMAP_H__ */ diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index ea45584a9913..e47b99e59f60 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -1016,3 +1016,73 @@ xfs_rtfree_extent( } return 0; } + +/* Find all the free records within a given range. */ +int +xfs_rtalloc_query_range( + struct xfs_trans *tp, + struct xfs_rtalloc_rec *low_rec, + struct xfs_rtalloc_rec *high_rec, + xfs_rtalloc_query_range_fn fn, + void *priv) +{ + struct xfs_rtalloc_rec rec; + struct xfs_mount *mp = tp->t_mountp; + xfs_rtblock_t rtstart; + xfs_rtblock_t rtend; + xfs_rtblock_t rem; + int is_free; + int error = 0; + + if (low_rec->ar_startblock > high_rec->ar_startblock) + return -EINVAL; + else if (low_rec->ar_startblock == high_rec->ar_startblock) + return 0; + + /* Iterate the bitmap, looking for discrepancies. */ + rtstart = low_rec->ar_startblock; + rem = high_rec->ar_startblock - rtstart; + while (rem) { + /* Is the first block free? */ + error = xfs_rtcheck_range(mp, tp, rtstart, 1, 1, &rtend, + &is_free); + if (error) + break; + + /* How long does the extent go for? */ + error = xfs_rtfind_forw(mp, tp, rtstart, + high_rec->ar_startblock - 1, &rtend); + if (error) + break; + + if (is_free) { + rec.ar_startblock = rtstart; + rec.ar_blockcount = rtend - rtstart + 1; + + error = fn(tp, &rec, priv); + if (error) + break; + } + + rem -= rtend - rtstart + 1; + rtstart = rtend + 1; + } + + return error; +} + +/* Find all the free records. */ +int +xfs_rtalloc_query_all( + struct xfs_trans *tp, + xfs_rtalloc_query_range_fn fn, + void *priv) +{ + struct xfs_rtalloc_rec keys[2]; + + keys[0].ar_startblock = 0; + keys[1].ar_startblock = tp->t_mountp->m_sb.sb_rblocks; + keys[0].ar_blockcount = keys[1].ar_blockcount = 0; + + return xfs_rtalloc_query_range(tp, &keys[0], &keys[1], fn, priv); +} diff --git a/fs/xfs/libxfs/xfs_trans_space.h b/fs/xfs/libxfs/xfs_trans_space.h index 7917f6e44286..d787c677d2a3 100644 --- a/fs/xfs/libxfs/xfs_trans_space.h +++ b/fs/xfs/libxfs/xfs_trans_space.h @@ -21,8 +21,20 @@ /* * Components of space reservations. */ + +/* Worst case number of rmaps that can be held in a block. */ #define XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp) \ (((mp)->m_rmap_mxr[0]) - ((mp)->m_rmap_mnr[0])) + +/* Adding one rmap could split every level up to the top of the tree. */ +#define XFS_RMAPADD_SPACE_RES(mp) ((mp)->m_rmap_maxlevels) + +/* Blocks we might need to add "b" rmaps to a tree. */ +#define XFS_NRMAPADD_SPACE_RES(mp, b)\ + (((b + XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp) - 1) / \ + XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp)) * \ + XFS_RMAPADD_SPACE_RES(mp)) + #define XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp) \ (((mp)->m_alloc_mxr[0]) - ((mp)->m_alloc_mnr[0])) #define XFS_EXTENTADD_SPACE_RES(mp,w) (XFS_BM_MAXLEVELS(mp,w) - 1) @@ -30,13 +42,12 @@ (((b + XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp) - 1) / \ XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp)) * \ XFS_EXTENTADD_SPACE_RES(mp,w)) + +/* Blocks we might need to add "b" mappings & rmappings to a file. */ #define XFS_SWAP_RMAP_SPACE_RES(mp,b,w)\ - (((b + XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp) - 1) / \ - XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp)) * \ - XFS_EXTENTADD_SPACE_RES(mp,w) + \ - ((b + XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp) - 1) / \ - XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp)) * \ - (mp)->m_rmap_maxlevels) + (XFS_NEXTENTADD_SPACE_RES((mp), (b), (w)) + \ + XFS_NRMAPADD_SPACE_RES((mp), (b))) + #define XFS_DAENTER_1B(mp,w) \ ((w) == XFS_DATA_FORK ? (mp)->m_dir_geo->fsbcount : 1) #define XFS_DAENTER_DBS(mp,w) \ diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 61494295d92f..09af0f7cd55e 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -111,11 +111,11 @@ xfs_finish_page_writeback( bsize = bh->b_size; do { + if (off > end) + break; next = bh->b_this_page; if (off < bvec->bv_offset) goto next_bh; - if (off > end) - break; bh->b_end_io(bh, !error); next_bh: off += bsize; @@ -189,7 +189,7 @@ xfs_setfilesize_trans_alloc( * We hand off the transaction to the completion thread now, so * clear the flag here. */ - current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); + current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); return 0; } @@ -252,7 +252,7 @@ xfs_setfilesize_ioend( * thus we need to mark ourselves as being in a transaction manually. * Similarly for freeze protection. */ - current_set_flags_nested(&tp->t_pflags, PF_FSTRANS); + current_set_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); __sb_writers_acquired(VFS_I(ip)->i_sb, SB_FREEZE_FS); /* we abort the update if there was an IO error */ @@ -1016,7 +1016,7 @@ xfs_do_writepage( * Given that we do not allow direct reclaim to call us, we should * never be called while in a filesystem transaction. */ - if (WARN_ON_ONCE(current->flags & PF_FSTRANS)) + if (WARN_ON_ONCE(current->flags & PF_MEMALLOC_NOFS)) goto redirty; /* @@ -1261,8 +1261,8 @@ xfs_get_blocks( if (nimaps) { trace_xfs_get_blocks_found(ip, offset, size, - ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN - : XFS_IO_OVERWRITE, &imap); + imap.br_state == XFS_EXT_UNWRITTEN ? + XFS_IO_UNWRITTEN : XFS_IO_OVERWRITE, &imap); xfs_iunlock(ip, lockmode); } else { trace_xfs_get_blocks_notfound(ip, offset, size); @@ -1276,9 +1276,7 @@ xfs_get_blocks( * For unwritten extents do not report a disk address in the buffered * read case (treat as if we're reading into a hole). */ - if (imap.br_startblock != HOLESTARTBLOCK && - imap.br_startblock != DELAYSTARTBLOCK && - !ISUNWRITTEN(&imap)) + if (xfs_bmap_is_real_extent(&imap)) xfs_map_buffer(inode, bh_result, &imap, offset); /* diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index 9bf57c76623b..d419d23fa214 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -34,6 +34,8 @@ #include "xfs_bmap.h" #include "xfs_icache.h" #include "xfs_trace.h" +#include "xfs_bmap_btree.h" +#include "xfs_trans_space.h" kmem_zone_t *xfs_bui_zone; @@ -215,6 +217,7 @@ void xfs_bui_release( struct xfs_bui_log_item *buip) { + ASSERT(atomic_read(&buip->bui_refcount) > 0); if (atomic_dec_and_test(&buip->bui_refcount)) { xfs_trans_ail_remove(&buip->bui_item, SHUTDOWN_LOG_IO_ERROR); xfs_bui_item_free(buip); @@ -446,7 +449,8 @@ xfs_bui_recover( return -EIO; } - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, + XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK), 0, 0, &tp); if (error) return error; budp = xfs_trans_get_bud(tp, buip); diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 828532ce0adc..2b954308a1d6 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -81,7 +81,7 @@ xfs_zero_extent( return blkdev_issue_zeroout(xfs_find_bdev_for_inode(VFS_I(ip)), block << (mp->m_super->s_blocksize_bits - 9), count_fsb << (mp->m_super->s_blocksize_bits - 9), - GFP_NOFS, true); + GFP_NOFS, 0); } int @@ -448,10 +448,9 @@ xfs_getbmap_adjust_shared( next_map->br_blockcount = 0; /* Only written data blocks can be shared. */ - if (!xfs_is_reflink_inode(ip) || whichfork != XFS_DATA_FORK || - map->br_startblock == DELAYSTARTBLOCK || - map->br_startblock == HOLESTARTBLOCK || - ISUNWRITTEN(map)) + if (!xfs_is_reflink_inode(ip) || + whichfork != XFS_DATA_FORK || + !xfs_bmap_is_real_extent(map)) return 0; agno = XFS_FSB_TO_AGNO(mp, map->br_startblock); @@ -904,9 +903,9 @@ xfs_can_free_eofblocks(struct xfs_inode *ip, bool force) } /* - * This is called by xfs_inactive to free any blocks beyond eof - * when the link count isn't zero and by xfs_dm_punch_hole() when - * punching a hole to EOF. + * This is called to free any blocks beyond eof. The caller must hold + * IOLOCK_EXCL unless we are in the inode reclaim path and have the only + * reference to the inode. */ int xfs_free_eofblocks( @@ -921,8 +920,6 @@ xfs_free_eofblocks( struct xfs_bmbt_irec imap; struct xfs_mount *mp = ip->i_mount; - ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); - /* * Figure out if there are any blocks beyond the end * of the file. If not, then there is nothing to do. @@ -1209,11 +1206,8 @@ xfs_adjust_extent_unmap_boundaries( return error; if (nimap && imap.br_startblock != HOLESTARTBLOCK) { - xfs_daddr_t block; - ASSERT(imap.br_startblock != DELAYSTARTBLOCK); - block = imap.br_startblock; - mod = do_div(block, mp->m_sb.sb_rextsize); + mod = do_mod(imap.br_startblock, mp->m_sb.sb_rextsize); if (mod) *startoffset_fsb += mp->m_sb.sb_rextsize - mod; } diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index b6208728ba39..62fa39276a24 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -443,17 +443,17 @@ _xfs_buf_map_pages( bp->b_addr = NULL; } else { int retried = 0; - unsigned noio_flag; + unsigned nofs_flag; /* * vm_map_ram() will allocate auxillary structures (e.g. * pagetables) with GFP_KERNEL, yet we are likely to be under * GFP_NOFS context here. Hence we need to tell memory reclaim - * that we are in such a context via PF_MEMALLOC_NOIO to prevent + * that we are in such a context via PF_MEMALLOC_NOFS to prevent * memory reclaim re-entering the filesystem here and * potentially deadlocking. */ - noio_flag = memalloc_noio_save(); + nofs_flag = memalloc_nofs_save(); do { bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count, -1, PAGE_KERNEL); @@ -461,7 +461,7 @@ _xfs_buf_map_pages( break; vm_unmap_aliases(); } while (retried++ <= 1); - memalloc_noio_restore(noio_flag); + memalloc_nofs_restore(nofs_flag); if (!bp->b_addr) return -ENOMEM; @@ -1079,6 +1079,8 @@ void xfs_buf_unlock( struct xfs_buf *bp) { + ASSERT(xfs_buf_islocked(bp)); + XB_CLEAR_OWNER(bp); up(&bp->b_sema); @@ -1815,6 +1817,28 @@ error: } /* + * Cancel a delayed write list. + * + * Remove each buffer from the list, clear the delwri queue flag and drop the + * associated buffer reference. + */ +void +xfs_buf_delwri_cancel( + struct list_head *list) +{ + struct xfs_buf *bp; + + while (!list_empty(list)) { + bp = list_first_entry(list, struct xfs_buf, b_list); + + xfs_buf_lock(bp); + bp->b_flags &= ~_XBF_DELWRI_Q; + list_del_init(&bp->b_list); + xfs_buf_relse(bp); + } +} + +/* * Add a buffer to the delayed write list. * * This queues a buffer for writeout if it hasn't already been. Note that diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 3c867e5a63e1..8d1d44f87ce9 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -291,7 +291,6 @@ xfs_buf_readahead( return xfs_buf_readahead_map(target, &map, 1, ops); } -struct xfs_buf *xfs_buf_get_empty(struct xfs_buftarg *target, size_t numblks); void xfs_buf_set_empty(struct xfs_buf *bp, size_t numblks); int xfs_buf_associate_memory(struct xfs_buf *bp, void *mem, size_t length); @@ -330,6 +329,7 @@ extern void *xfs_buf_offset(struct xfs_buf *, size_t); extern void xfs_buf_stale(struct xfs_buf *bp); /* Delayed Write Buffer Routines */ +extern void xfs_buf_delwri_cancel(struct list_head *); extern bool xfs_buf_delwri_queue(struct xfs_buf *, struct list_head *); extern int xfs_buf_delwri_submit(struct list_head *); extern int xfs_buf_delwri_submit_nowait(struct list_head *); diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c index ad9396e516f6..20b7a5c6eb2f 100644 --- a/fs/xfs/xfs_dir2_readdir.c +++ b/fs/xfs/xfs_dir2_readdir.c @@ -394,6 +394,7 @@ xfs_dir2_leaf_readbuf( /* * Do we need more readahead? + * Each loop tries to process 1 full dir blk; last may be partial. */ blk_start_plug(&plug); for (mip->ra_index = mip->ra_offset = i = 0; @@ -404,7 +405,8 @@ xfs_dir2_leaf_readbuf( * Read-ahead a contiguous directory block. */ if (i > mip->ra_current && - map[mip->ra_index].br_blockcount >= geo->fsbcount) { + (map[mip->ra_index].br_blockcount - mip->ra_offset) >= + geo->fsbcount) { xfs_dir3_data_readahead(dp, map[mip->ra_index].br_startoff + mip->ra_offset, XFS_FSB_TO_DADDR(dp->i_mount, @@ -425,14 +427,19 @@ xfs_dir2_leaf_readbuf( } /* - * Advance offset through the mapping table. + * Advance offset through the mapping table, processing a full + * dir block even if it is fragmented into several extents. + * But stop if we have consumed all valid mappings, even if + * it's not yet a full directory block. */ - for (j = 0; j < geo->fsbcount; j += length ) { + for (j = 0; + j < geo->fsbcount && mip->ra_index < mip->map_valid; + j += length ) { /* * The rest of this extent but not more than a dir * block. */ - length = min_t(int, geo->fsbcount, + length = min_t(int, geo->fsbcount - j, map[mip->ra_index].br_blockcount - mip->ra_offset); mip->ra_offset += length; diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index d796ffac7296..6a05d278da64 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -132,6 +132,11 @@ next_extent: error = xfs_btree_decrement(cur, 0, &i); if (error) goto out_del_cursor; + + if (fatal_signal_pending(current)) { + error = -ERESTARTSYS; + goto out_del_cursor; + } } out_del_cursor: @@ -196,8 +201,11 @@ xfs_ioc_trim( for (agno = start_agno; agno <= end_agno; agno++) { error = xfs_trim_extents(mp, agno, start, end, minlen, &blocks_trimmed); - if (error) + if (error) { last_error = error; + if (error == -ERESTARTSYS) + break; + } } if (last_error) diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index d7bc14906af8..44f8c5451210 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -290,6 +290,7 @@ void xfs_efi_release( struct xfs_efi_log_item *efip) { + ASSERT(atomic_read(&efip->efi_refcount) > 0); if (atomic_dec_and_test(&efip->efi_refcount)) { xfs_trans_ail_remove(&efip->efi_item, SHUTDOWN_LOG_IO_ERROR); xfs_efi_item_free(efip); diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c new file mode 100644 index 000000000000..3683819887a5 --- /dev/null +++ b/fs/xfs/xfs_fsmap.c @@ -0,0 +1,940 @@ +/* + * Copyright (C) 2017 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_error.h" +#include "xfs_btree.h" +#include "xfs_rmap_btree.h" +#include "xfs_trace.h" +#include "xfs_log.h" +#include "xfs_rmap.h" +#include "xfs_alloc.h" +#include "xfs_bit.h" +#include <linux/fsmap.h> +#include "xfs_fsmap.h" +#include "xfs_refcount.h" +#include "xfs_refcount_btree.h" +#include "xfs_alloc_btree.h" +#include "xfs_rtalloc.h" + +/* Convert an xfs_fsmap to an fsmap. */ +void +xfs_fsmap_from_internal( + struct fsmap *dest, + struct xfs_fsmap *src) +{ + dest->fmr_device = src->fmr_device; + dest->fmr_flags = src->fmr_flags; + dest->fmr_physical = BBTOB(src->fmr_physical); + dest->fmr_owner = src->fmr_owner; + dest->fmr_offset = BBTOB(src->fmr_offset); + dest->fmr_length = BBTOB(src->fmr_length); + dest->fmr_reserved[0] = 0; + dest->fmr_reserved[1] = 0; + dest->fmr_reserved[2] = 0; +} + +/* Convert an fsmap to an xfs_fsmap. */ +void +xfs_fsmap_to_internal( + struct xfs_fsmap *dest, + struct fsmap *src) +{ + dest->fmr_device = src->fmr_device; + dest->fmr_flags = src->fmr_flags; + dest->fmr_physical = BTOBBT(src->fmr_physical); + dest->fmr_owner = src->fmr_owner; + dest->fmr_offset = BTOBBT(src->fmr_offset); + dest->fmr_length = BTOBBT(src->fmr_length); +} + +/* Convert an fsmap owner into an rmapbt owner. */ +static int +xfs_fsmap_owner_to_rmap( + struct xfs_rmap_irec *dest, + struct xfs_fsmap *src) +{ + if (!(src->fmr_flags & FMR_OF_SPECIAL_OWNER)) { + dest->rm_owner = src->fmr_owner; + return 0; + } + + switch (src->fmr_owner) { + case 0: /* "lowest owner id possible" */ + case -1ULL: /* "highest owner id possible" */ + dest->rm_owner = 0; + break; + case XFS_FMR_OWN_FREE: + dest->rm_owner = XFS_RMAP_OWN_NULL; + break; + case XFS_FMR_OWN_UNKNOWN: + dest->rm_owner = XFS_RMAP_OWN_UNKNOWN; + break; + case XFS_FMR_OWN_FS: + dest->rm_owner = XFS_RMAP_OWN_FS; + break; + case XFS_FMR_OWN_LOG: + dest->rm_owner = XFS_RMAP_OWN_LOG; + break; + case XFS_FMR_OWN_AG: + dest->rm_owner = XFS_RMAP_OWN_AG; + break; + case XFS_FMR_OWN_INOBT: + dest->rm_owner = XFS_RMAP_OWN_INOBT; + break; + case XFS_FMR_OWN_INODES: + dest->rm_owner = XFS_RMAP_OWN_INODES; + break; + case XFS_FMR_OWN_REFC: + dest->rm_owner = XFS_RMAP_OWN_REFC; + break; + case XFS_FMR_OWN_COW: + dest->rm_owner = XFS_RMAP_OWN_COW; + break; + case XFS_FMR_OWN_DEFECTIVE: /* not implemented */ + /* fall through */ + default: + return -EINVAL; + } + return 0; +} + +/* Convert an rmapbt owner into an fsmap owner. */ +static int +xfs_fsmap_owner_from_rmap( + struct xfs_fsmap *dest, + struct xfs_rmap_irec *src) +{ + dest->fmr_flags = 0; + if (!XFS_RMAP_NON_INODE_OWNER(src->rm_owner)) { + dest->fmr_owner = src->rm_owner; + return 0; + } + dest->fmr_flags |= FMR_OF_SPECIAL_OWNER; + + switch (src->rm_owner) { + case XFS_RMAP_OWN_FS: + dest->fmr_owner = XFS_FMR_OWN_FS; + break; + case XFS_RMAP_OWN_LOG: + dest->fmr_owner = XFS_FMR_OWN_LOG; + break; + case XFS_RMAP_OWN_AG: + dest->fmr_owner = XFS_FMR_OWN_AG; + break; + case XFS_RMAP_OWN_INOBT: + dest->fmr_owner = XFS_FMR_OWN_INOBT; + break; + case XFS_RMAP_OWN_INODES: + dest->fmr_owner = XFS_FMR_OWN_INODES; + break; + case XFS_RMAP_OWN_REFC: + dest->fmr_owner = XFS_FMR_OWN_REFC; + break; + case XFS_RMAP_OWN_COW: + dest->fmr_owner = XFS_FMR_OWN_COW; + break; + case XFS_RMAP_OWN_NULL: /* "free" */ + dest->fmr_owner = XFS_FMR_OWN_FREE; + break; + default: + return -EFSCORRUPTED; + } + return 0; +} + +/* getfsmap query state */ +struct xfs_getfsmap_info { + struct xfs_fsmap_head *head; + xfs_fsmap_format_t formatter; /* formatting fn */ + void *format_arg; /* format buffer */ + struct xfs_buf *agf_bp; /* AGF, for refcount queries */ + xfs_daddr_t next_daddr; /* next daddr we expect */ + u64 missing_owner; /* owner of holes */ + u32 dev; /* device id */ + xfs_agnumber_t agno; /* AG number, if applicable */ + struct xfs_rmap_irec low; /* low rmap key */ + struct xfs_rmap_irec high; /* high rmap key */ + bool last; /* last extent? */ +}; + +/* Associate a device with a getfsmap handler. */ +struct xfs_getfsmap_dev { + u32 dev; + int (*fn)(struct xfs_trans *tp, + struct xfs_fsmap *keys, + struct xfs_getfsmap_info *info); +}; + +/* Compare two getfsmap device handlers. */ +static int +xfs_getfsmap_dev_compare( + const void *p1, + const void *p2) +{ + const struct xfs_getfsmap_dev *d1 = p1; + const struct xfs_getfsmap_dev *d2 = p2; + + return d1->dev - d2->dev; +} + +/* Decide if this mapping is shared. */ +STATIC int +xfs_getfsmap_is_shared( + struct xfs_trans *tp, + struct xfs_getfsmap_info *info, + struct xfs_rmap_irec *rec, + bool *stat) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_btree_cur *cur; + xfs_agblock_t fbno; + xfs_extlen_t flen; + int error; + + *stat = false; + if (!xfs_sb_version_hasreflink(&mp->m_sb)) + return 0; + /* rt files will have agno set to NULLAGNUMBER */ + if (info->agno == NULLAGNUMBER) + return 0; + + /* Are there any shared blocks here? */ + flen = 0; + cur = xfs_refcountbt_init_cursor(mp, tp, info->agf_bp, + info->agno, NULL); + + error = xfs_refcount_find_shared(cur, rec->rm_startblock, + rec->rm_blockcount, &fbno, &flen, false); + + xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); + if (error) + return error; + + *stat = flen > 0; + return 0; +} + +/* + * Format a reverse mapping for getfsmap, having translated rm_startblock + * into the appropriate daddr units. + */ +STATIC int +xfs_getfsmap_helper( + struct xfs_trans *tp, + struct xfs_getfsmap_info *info, + struct xfs_rmap_irec *rec, + xfs_daddr_t rec_daddr) +{ + struct xfs_fsmap fmr; + struct xfs_mount *mp = tp->t_mountp; + bool shared; + int error; + + if (fatal_signal_pending(current)) + return -EINTR; + + /* + * Filter out records that start before our startpoint, if the + * caller requested that. + */ + if (xfs_rmap_compare(rec, &info->low) < 0) { + rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount); + if (info->next_daddr < rec_daddr) + info->next_daddr = rec_daddr; + return XFS_BTREE_QUERY_RANGE_CONTINUE; + } + + /* Are we just counting mappings? */ + if (info->head->fmh_count == 0) { + if (rec_daddr > info->next_daddr) + info->head->fmh_entries++; + + if (info->last) + return XFS_BTREE_QUERY_RANGE_CONTINUE; + + info->head->fmh_entries++; + + rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount); + if (info->next_daddr < rec_daddr) + info->next_daddr = rec_daddr; + return XFS_BTREE_QUERY_RANGE_CONTINUE; + } + + /* + * If the record starts past the last physical block we saw, + * then we've found a gap. Report the gap as being owned by + * whatever the caller specified is the missing owner. + */ + if (rec_daddr > info->next_daddr) { + if (info->head->fmh_entries >= info->head->fmh_count) + return XFS_BTREE_QUERY_RANGE_ABORT; + + fmr.fmr_device = info->dev; + fmr.fmr_physical = info->next_daddr; + fmr.fmr_owner = info->missing_owner; + fmr.fmr_offset = 0; + fmr.fmr_length = rec_daddr - info->next_daddr; + fmr.fmr_flags = FMR_OF_SPECIAL_OWNER; + error = info->formatter(&fmr, info->format_arg); + if (error) + return error; + info->head->fmh_entries++; + } + + if (info->last) + goto out; + + /* Fill out the extent we found */ + if (info->head->fmh_entries >= info->head->fmh_count) + return XFS_BTREE_QUERY_RANGE_ABORT; + + trace_xfs_fsmap_mapping(mp, info->dev, info->agno, rec); + + fmr.fmr_device = info->dev; + fmr.fmr_physical = rec_daddr; + error = xfs_fsmap_owner_from_rmap(&fmr, rec); + if (error) + return error; + fmr.fmr_offset = XFS_FSB_TO_BB(mp, rec->rm_offset); + fmr.fmr_length = XFS_FSB_TO_BB(mp, rec->rm_blockcount); + if (rec->rm_flags & XFS_RMAP_UNWRITTEN) + fmr.fmr_flags |= FMR_OF_PREALLOC; + if (rec->rm_flags & XFS_RMAP_ATTR_FORK) + fmr.fmr_flags |= FMR_OF_ATTR_FORK; + if (rec->rm_flags & XFS_RMAP_BMBT_BLOCK) + fmr.fmr_flags |= FMR_OF_EXTENT_MAP; + if (fmr.fmr_flags == 0) { + error = xfs_getfsmap_is_shared(tp, info, rec, &shared); + if (error) + return error; + if (shared) + fmr.fmr_flags |= FMR_OF_SHARED; + } + error = info->formatter(&fmr, info->format_arg); + if (error) + return error; + info->head->fmh_entries++; + +out: + rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount); + if (info->next_daddr < rec_daddr) + info->next_daddr = rec_daddr; + return XFS_BTREE_QUERY_RANGE_CONTINUE; +} + +/* Transform a rmapbt irec into a fsmap */ +STATIC int +xfs_getfsmap_datadev_helper( + struct xfs_btree_cur *cur, + struct xfs_rmap_irec *rec, + void *priv) +{ + struct xfs_mount *mp = cur->bc_mp; + struct xfs_getfsmap_info *info = priv; + xfs_fsblock_t fsb; + xfs_daddr_t rec_daddr; + + fsb = XFS_AGB_TO_FSB(mp, cur->bc_private.a.agno, rec->rm_startblock); + rec_daddr = XFS_FSB_TO_DADDR(mp, fsb); + + return xfs_getfsmap_helper(cur->bc_tp, info, rec, rec_daddr); +} + +/* Transform a rtbitmap "record" into a fsmap */ +STATIC int +xfs_getfsmap_rtdev_rtbitmap_helper( + struct xfs_trans *tp, + struct xfs_rtalloc_rec *rec, + void *priv) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_getfsmap_info *info = priv; + struct xfs_rmap_irec irec; + xfs_daddr_t rec_daddr; + + rec_daddr = XFS_FSB_TO_BB(mp, rec->ar_startblock); + + irec.rm_startblock = rec->ar_startblock; + irec.rm_blockcount = rec->ar_blockcount; + irec.rm_owner = XFS_RMAP_OWN_NULL; /* "free" */ + irec.rm_offset = 0; + irec.rm_flags = 0; + + return xfs_getfsmap_helper(tp, info, &irec, rec_daddr); +} + +/* Transform a bnobt irec into a fsmap */ +STATIC int +xfs_getfsmap_datadev_bnobt_helper( + struct xfs_btree_cur *cur, + struct xfs_alloc_rec_incore *rec, + void *priv) +{ + struct xfs_mount *mp = cur->bc_mp; + struct xfs_getfsmap_info *info = priv; + struct xfs_rmap_irec irec; + xfs_daddr_t rec_daddr; + + rec_daddr = XFS_AGB_TO_DADDR(mp, cur->bc_private.a.agno, + rec->ar_startblock); + + irec.rm_startblock = rec->ar_startblock; + irec.rm_blockcount = rec->ar_blockcount; + irec.rm_owner = XFS_RMAP_OWN_NULL; /* "free" */ + irec.rm_offset = 0; + irec.rm_flags = 0; + + return xfs_getfsmap_helper(cur->bc_tp, info, &irec, rec_daddr); +} + +/* Set rmap flags based on the getfsmap flags */ +static void +xfs_getfsmap_set_irec_flags( + struct xfs_rmap_irec *irec, + struct xfs_fsmap *fmr) +{ + irec->rm_flags = 0; + if (fmr->fmr_flags & FMR_OF_ATTR_FORK) + irec->rm_flags |= XFS_RMAP_ATTR_FORK; + if (fmr->fmr_flags & FMR_OF_EXTENT_MAP) + irec->rm_flags |= XFS_RMAP_BMBT_BLOCK; + if (fmr->fmr_flags & FMR_OF_PREALLOC) + irec->rm_flags |= XFS_RMAP_UNWRITTEN; +} + +/* Execute a getfsmap query against the log device. */ +STATIC int +xfs_getfsmap_logdev( + struct xfs_trans *tp, + struct xfs_fsmap *keys, + struct xfs_getfsmap_info *info) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_rmap_irec rmap; + int error; + + /* Set up search keys */ + info->low.rm_startblock = XFS_BB_TO_FSBT(mp, keys[0].fmr_physical); + info->low.rm_offset = XFS_BB_TO_FSBT(mp, keys[0].fmr_offset); + error = xfs_fsmap_owner_to_rmap(&info->low, keys); + if (error) + return error; + info->low.rm_blockcount = 0; + xfs_getfsmap_set_irec_flags(&info->low, &keys[0]); + + error = xfs_fsmap_owner_to_rmap(&info->high, keys + 1); + if (error) + return error; + info->high.rm_startblock = -1U; + info->high.rm_owner = ULLONG_MAX; + info->high.rm_offset = ULLONG_MAX; + info->high.rm_blockcount = 0; + info->high.rm_flags = XFS_RMAP_KEY_FLAGS | XFS_RMAP_REC_FLAGS; + info->missing_owner = XFS_FMR_OWN_FREE; + + trace_xfs_fsmap_low_key(mp, info->dev, info->agno, &info->low); + trace_xfs_fsmap_high_key(mp, info->dev, info->agno, &info->high); + + if (keys[0].fmr_physical > 0) + return 0; + + /* Fabricate an rmap entry for the external log device. */ + rmap.rm_startblock = 0; + rmap.rm_blockcount = mp->m_sb.sb_logblocks; + rmap.rm_owner = XFS_RMAP_OWN_LOG; + rmap.rm_offset = 0; + rmap.rm_flags = 0; + + return xfs_getfsmap_helper(tp, info, &rmap, 0); +} + +/* Execute a getfsmap query against the realtime device. */ +STATIC int +__xfs_getfsmap_rtdev( + struct xfs_trans *tp, + struct xfs_fsmap *keys, + int (*query_fn)(struct xfs_trans *, + struct xfs_getfsmap_info *), + struct xfs_getfsmap_info *info) +{ + struct xfs_mount *mp = tp->t_mountp; + xfs_fsblock_t start_fsb; + xfs_fsblock_t end_fsb; + xfs_daddr_t eofs; + int error = 0; + + eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks); + if (keys[0].fmr_physical >= eofs) + return 0; + if (keys[1].fmr_physical >= eofs) + keys[1].fmr_physical = eofs - 1; + start_fsb = XFS_BB_TO_FSBT(mp, keys[0].fmr_physical); + end_fsb = XFS_BB_TO_FSB(mp, keys[1].fmr_physical); + + /* Set up search keys */ + info->low.rm_startblock = start_fsb; + error = xfs_fsmap_owner_to_rmap(&info->low, &keys[0]); + if (error) + return error; + info->low.rm_offset = XFS_BB_TO_FSBT(mp, keys[0].fmr_offset); + info->low.rm_blockcount = 0; + xfs_getfsmap_set_irec_flags(&info->low, &keys[0]); + + info->high.rm_startblock = end_fsb; + error = xfs_fsmap_owner_to_rmap(&info->high, &keys[1]); + if (error) + return error; + info->high.rm_offset = XFS_BB_TO_FSBT(mp, keys[1].fmr_offset); + info->high.rm_blockcount = 0; + xfs_getfsmap_set_irec_flags(&info->high, &keys[1]); + + trace_xfs_fsmap_low_key(mp, info->dev, info->agno, &info->low); + trace_xfs_fsmap_high_key(mp, info->dev, info->agno, &info->high); + + return query_fn(tp, info); +} + +/* Actually query the realtime bitmap. */ +STATIC int +xfs_getfsmap_rtdev_rtbitmap_query( + struct xfs_trans *tp, + struct xfs_getfsmap_info *info) +{ + struct xfs_rtalloc_rec alow; + struct xfs_rtalloc_rec ahigh; + int error; + + xfs_ilock(tp->t_mountp->m_rbmip, XFS_ILOCK_SHARED); + + alow.ar_startblock = info->low.rm_startblock; + ahigh.ar_startblock = info->high.rm_startblock; + error = xfs_rtalloc_query_range(tp, &alow, &ahigh, + xfs_getfsmap_rtdev_rtbitmap_helper, info); + if (error) + goto err; + + /* Report any gaps at the end of the rtbitmap */ + info->last = true; + error = xfs_getfsmap_rtdev_rtbitmap_helper(tp, &ahigh, info); + if (error) + goto err; +err: + xfs_iunlock(tp->t_mountp->m_rbmip, XFS_ILOCK_SHARED); + return error; +} + +/* Execute a getfsmap query against the realtime device rtbitmap. */ +STATIC int +xfs_getfsmap_rtdev_rtbitmap( + struct xfs_trans *tp, + struct xfs_fsmap *keys, + struct xfs_getfsmap_info *info) +{ + info->missing_owner = XFS_FMR_OWN_UNKNOWN; + return __xfs_getfsmap_rtdev(tp, keys, xfs_getfsmap_rtdev_rtbitmap_query, + info); +} + +/* Execute a getfsmap query against the regular data device. */ +STATIC int +__xfs_getfsmap_datadev( + struct xfs_trans *tp, + struct xfs_fsmap *keys, + struct xfs_getfsmap_info *info, + int (*query_fn)(struct xfs_trans *, + struct xfs_getfsmap_info *, + struct xfs_btree_cur **, + void *), + void *priv) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_btree_cur *bt_cur = NULL; + xfs_fsblock_t start_fsb; + xfs_fsblock_t end_fsb; + xfs_agnumber_t start_ag; + xfs_agnumber_t end_ag; + xfs_daddr_t eofs; + int error = 0; + + eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks); + if (keys[0].fmr_physical >= eofs) + return 0; + if (keys[1].fmr_physical >= eofs) + keys[1].fmr_physical = eofs - 1; + start_fsb = XFS_DADDR_TO_FSB(mp, keys[0].fmr_physical); + end_fsb = XFS_DADDR_TO_FSB(mp, keys[1].fmr_physical); + + /* + * Convert the fsmap low/high keys to AG based keys. Initialize + * low to the fsmap low key and max out the high key to the end + * of the AG. + */ + info->low.rm_startblock = XFS_FSB_TO_AGBNO(mp, start_fsb); + info->low.rm_offset = XFS_BB_TO_FSBT(mp, keys[0].fmr_offset); + error = xfs_fsmap_owner_to_rmap(&info->low, &keys[0]); + if (error) + return error; + info->low.rm_blockcount = 0; + xfs_getfsmap_set_irec_flags(&info->low, &keys[0]); + + info->high.rm_startblock = -1U; + info->high.rm_owner = ULLONG_MAX; + info->high.rm_offset = ULLONG_MAX; + info->high.rm_blockcount = 0; + info->high.rm_flags = XFS_RMAP_KEY_FLAGS | XFS_RMAP_REC_FLAGS; + + start_ag = XFS_FSB_TO_AGNO(mp, start_fsb); + end_ag = XFS_FSB_TO_AGNO(mp, end_fsb); + + /* Query each AG */ + for (info->agno = start_ag; info->agno <= end_ag; info->agno++) { + /* + * Set the AG high key from the fsmap high key if this + * is the last AG that we're querying. + */ + if (info->agno == end_ag) { + info->high.rm_startblock = XFS_FSB_TO_AGBNO(mp, + end_fsb); + info->high.rm_offset = XFS_BB_TO_FSBT(mp, + keys[1].fmr_offset); + error = xfs_fsmap_owner_to_rmap(&info->high, &keys[1]); + if (error) + goto err; + xfs_getfsmap_set_irec_flags(&info->high, &keys[1]); + } + + if (bt_cur) { + xfs_btree_del_cursor(bt_cur, XFS_BTREE_NOERROR); + bt_cur = NULL; + xfs_trans_brelse(tp, info->agf_bp); + info->agf_bp = NULL; + } + + error = xfs_alloc_read_agf(mp, tp, info->agno, 0, + &info->agf_bp); + if (error) + goto err; + + trace_xfs_fsmap_low_key(mp, info->dev, info->agno, &info->low); + trace_xfs_fsmap_high_key(mp, info->dev, info->agno, + &info->high); + + error = query_fn(tp, info, &bt_cur, priv); + if (error) + goto err; + + /* + * Set the AG low key to the start of the AG prior to + * moving on to the next AG. + */ + if (info->agno == start_ag) { + info->low.rm_startblock = 0; + info->low.rm_owner = 0; + info->low.rm_offset = 0; + info->low.rm_flags = 0; + } + } + + /* Report any gap at the end of the AG */ + info->last = true; + error = query_fn(tp, info, &bt_cur, priv); + if (error) + goto err; + +err: + if (bt_cur) + xfs_btree_del_cursor(bt_cur, error < 0 ? XFS_BTREE_ERROR : + XFS_BTREE_NOERROR); + if (info->agf_bp) { + xfs_trans_brelse(tp, info->agf_bp); + info->agf_bp = NULL; + } + + return error; +} + +/* Actually query the rmap btree. */ +STATIC int +xfs_getfsmap_datadev_rmapbt_query( + struct xfs_trans *tp, + struct xfs_getfsmap_info *info, + struct xfs_btree_cur **curpp, + void *priv) +{ + /* Report any gap at the end of the last AG. */ + if (info->last) + return xfs_getfsmap_datadev_helper(*curpp, &info->high, info); + + /* Allocate cursor for this AG and query_range it. */ + *curpp = xfs_rmapbt_init_cursor(tp->t_mountp, tp, info->agf_bp, + info->agno); + return xfs_rmap_query_range(*curpp, &info->low, &info->high, + xfs_getfsmap_datadev_helper, info); +} + +/* Execute a getfsmap query against the regular data device rmapbt. */ +STATIC int +xfs_getfsmap_datadev_rmapbt( + struct xfs_trans *tp, + struct xfs_fsmap *keys, + struct xfs_getfsmap_info *info) +{ + info->missing_owner = XFS_FMR_OWN_FREE; + return __xfs_getfsmap_datadev(tp, keys, info, + xfs_getfsmap_datadev_rmapbt_query, NULL); +} + +/* Actually query the bno btree. */ +STATIC int +xfs_getfsmap_datadev_bnobt_query( + struct xfs_trans *tp, + struct xfs_getfsmap_info *info, + struct xfs_btree_cur **curpp, + void *priv) +{ + struct xfs_alloc_rec_incore *key = priv; + + /* Report any gap at the end of the last AG. */ + if (info->last) + return xfs_getfsmap_datadev_bnobt_helper(*curpp, &key[1], info); + + /* Allocate cursor for this AG and query_range it. */ + *curpp = xfs_allocbt_init_cursor(tp->t_mountp, tp, info->agf_bp, + info->agno, XFS_BTNUM_BNO); + key->ar_startblock = info->low.rm_startblock; + key[1].ar_startblock = info->high.rm_startblock; + return xfs_alloc_query_range(*curpp, key, &key[1], + xfs_getfsmap_datadev_bnobt_helper, info); +} + +/* Execute a getfsmap query against the regular data device's bnobt. */ +STATIC int +xfs_getfsmap_datadev_bnobt( + struct xfs_trans *tp, + struct xfs_fsmap *keys, + struct xfs_getfsmap_info *info) +{ + struct xfs_alloc_rec_incore akeys[2]; + + info->missing_owner = XFS_FMR_OWN_UNKNOWN; + return __xfs_getfsmap_datadev(tp, keys, info, + xfs_getfsmap_datadev_bnobt_query, &akeys[0]); +} + +/* Do we recognize the device? */ +STATIC bool +xfs_getfsmap_is_valid_device( + struct xfs_mount *mp, + struct xfs_fsmap *fm) +{ + if (fm->fmr_device == 0 || fm->fmr_device == UINT_MAX || + fm->fmr_device == new_encode_dev(mp->m_ddev_targp->bt_dev)) + return true; + if (mp->m_logdev_targp && + fm->fmr_device == new_encode_dev(mp->m_logdev_targp->bt_dev)) + return true; + if (mp->m_rtdev_targp && + fm->fmr_device == new_encode_dev(mp->m_rtdev_targp->bt_dev)) + return true; + return false; +} + +/* Ensure that the low key is less than the high key. */ +STATIC bool +xfs_getfsmap_check_keys( + struct xfs_fsmap *low_key, + struct xfs_fsmap *high_key) +{ + if (low_key->fmr_device > high_key->fmr_device) + return false; + if (low_key->fmr_device < high_key->fmr_device) + return true; + + if (low_key->fmr_physical > high_key->fmr_physical) + return false; + if (low_key->fmr_physical < high_key->fmr_physical) + return true; + + if (low_key->fmr_owner > high_key->fmr_owner) + return false; + if (low_key->fmr_owner < high_key->fmr_owner) + return true; + + if (low_key->fmr_offset > high_key->fmr_offset) + return false; + if (low_key->fmr_offset < high_key->fmr_offset) + return true; + + return false; +} + +#define XFS_GETFSMAP_DEVS 3 +/* + * Get filesystem's extents as described in head, and format for + * output. Calls formatter to fill the user's buffer until all + * extents are mapped, until the passed-in head->fmh_count slots have + * been filled, or until the formatter short-circuits the loop, if it + * is tracking filled-in extents on its own. + * + * Key to Confusion + * ---------------- + * There are multiple levels of keys and counters at work here: + * xfs_fsmap_head.fmh_keys -- low and high fsmap keys passed in; + * these reflect fs-wide sector addrs. + * dkeys -- fmh_keys used to query each device; + * these are fmh_keys but w/ the low key + * bumped up by fmr_length. + * xfs_getfsmap_info.next_daddr -- next disk addr we expect to see; this + * is how we detect gaps in the fsmap + records and report them. + * xfs_getfsmap_info.low/high -- per-AG low/high keys computed from + * dkeys; used to query the metadata. + */ +int +xfs_getfsmap( + struct xfs_mount *mp, + struct xfs_fsmap_head *head, + xfs_fsmap_format_t formatter, + void *arg) +{ + struct xfs_trans *tp = NULL; + struct xfs_fsmap dkeys[2]; /* per-dev keys */ + struct xfs_getfsmap_dev handlers[XFS_GETFSMAP_DEVS]; + struct xfs_getfsmap_info info = { NULL }; + int i; + int error = 0; + + if (head->fmh_iflags & ~FMH_IF_VALID) + return -EINVAL; + if (!xfs_getfsmap_is_valid_device(mp, &head->fmh_keys[0]) || + !xfs_getfsmap_is_valid_device(mp, &head->fmh_keys[1])) + return -EINVAL; + + head->fmh_entries = 0; + + /* Set up our device handlers. */ + memset(handlers, 0, sizeof(handlers)); + handlers[0].dev = new_encode_dev(mp->m_ddev_targp->bt_dev); + if (xfs_sb_version_hasrmapbt(&mp->m_sb)) + handlers[0].fn = xfs_getfsmap_datadev_rmapbt; + else + handlers[0].fn = xfs_getfsmap_datadev_bnobt; + if (mp->m_logdev_targp != mp->m_ddev_targp) { + handlers[1].dev = new_encode_dev(mp->m_logdev_targp->bt_dev); + handlers[1].fn = xfs_getfsmap_logdev; + } + if (mp->m_rtdev_targp) { + handlers[2].dev = new_encode_dev(mp->m_rtdev_targp->bt_dev); + handlers[2].fn = xfs_getfsmap_rtdev_rtbitmap; + } + + xfs_sort(handlers, XFS_GETFSMAP_DEVS, sizeof(struct xfs_getfsmap_dev), + xfs_getfsmap_dev_compare); + + /* + * To continue where we left off, we allow userspace to use the + * last mapping from a previous call as the low key of the next. + * This is identified by a non-zero length in the low key. We + * have to increment the low key in this scenario to ensure we + * don't return the same mapping again, and instead return the + * very next mapping. + * + * If the low key mapping refers to file data, the same physical + * blocks could be mapped to several other files/offsets. + * According to rmapbt record ordering, the minimal next + * possible record for the block range is the next starting + * offset in the same inode. Therefore, bump the file offset to + * continue the search appropriately. For all other low key + * mapping types (attr blocks, metadata), bump the physical + * offset as there can be no other mapping for the same physical + * block range. + */ + dkeys[0] = head->fmh_keys[0]; + if (dkeys[0].fmr_flags & (FMR_OF_SPECIAL_OWNER | FMR_OF_EXTENT_MAP)) { + dkeys[0].fmr_physical += dkeys[0].fmr_length; + dkeys[0].fmr_owner = 0; + if (dkeys[0].fmr_offset) + return -EINVAL; + } else + dkeys[0].fmr_offset += dkeys[0].fmr_length; + dkeys[0].fmr_length = 0; + memset(&dkeys[1], 0xFF, sizeof(struct xfs_fsmap)); + + if (!xfs_getfsmap_check_keys(dkeys, &head->fmh_keys[1])) + return -EINVAL; + + info.next_daddr = head->fmh_keys[0].fmr_physical + + head->fmh_keys[0].fmr_length; + info.formatter = formatter; + info.format_arg = arg; + info.head = head; + + /* For each device we support... */ + for (i = 0; i < XFS_GETFSMAP_DEVS; i++) { + /* Is this device within the range the user asked for? */ + if (!handlers[i].fn) + continue; + if (head->fmh_keys[0].fmr_device > handlers[i].dev) + continue; + if (head->fmh_keys[1].fmr_device < handlers[i].dev) + break; + + /* + * If this device number matches the high key, we have + * to pass the high key to the handler to limit the + * query results. If the device number exceeds the + * low key, zero out the low key so that we get + * everything from the beginning. + */ + if (handlers[i].dev == head->fmh_keys[1].fmr_device) + dkeys[1] = head->fmh_keys[1]; + if (handlers[i].dev > head->fmh_keys[0].fmr_device) + memset(&dkeys[0], 0, sizeof(struct xfs_fsmap)); + + error = xfs_trans_alloc_empty(mp, &tp); + if (error) + break; + + info.dev = handlers[i].dev; + info.last = false; + info.agno = NULLAGNUMBER; + error = handlers[i].fn(tp, dkeys, &info); + if (error) + break; + xfs_trans_cancel(tp); + tp = NULL; + info.next_daddr = 0; + } + + if (tp) + xfs_trans_cancel(tp); + head->fmh_oflags = FMH_OF_DEV_T; + return error; +} diff --git a/fs/xfs/xfs_fsmap.h b/fs/xfs/xfs_fsmap.h new file mode 100644 index 000000000000..0b9bf822595c --- /dev/null +++ b/fs/xfs/xfs_fsmap.h @@ -0,0 +1,53 @@ +/* + * Copyright (C) 2017 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#ifndef __XFS_FSMAP_H__ +#define __XFS_FSMAP_H__ + +struct fsmap; + +/* internal fsmap representation */ +struct xfs_fsmap { + dev_t fmr_device; /* device id */ + uint32_t fmr_flags; /* mapping flags */ + uint64_t fmr_physical; /* device offset of segment */ + uint64_t fmr_owner; /* owner id */ + xfs_fileoff_t fmr_offset; /* file offset of segment */ + xfs_filblks_t fmr_length; /* length of segment, blocks */ +}; + +struct xfs_fsmap_head { + uint32_t fmh_iflags; /* control flags */ + uint32_t fmh_oflags; /* output flags */ + unsigned int fmh_count; /* # of entries in array incl. input */ + unsigned int fmh_entries; /* # of entries filled in (output). */ + + struct xfs_fsmap fmh_keys[2]; /* low and high keys */ +}; + +void xfs_fsmap_from_internal(struct fsmap *dest, struct xfs_fsmap *src); +void xfs_fsmap_to_internal(struct xfs_fsmap *dest, struct fsmap *src); + +/* fsmap to userspace formatter - copy to user & advance pointer */ +typedef int (*xfs_fsmap_format_t)(struct xfs_fsmap *, void *); + +int xfs_getfsmap(struct xfs_mount *mp, struct xfs_fsmap_head *head, + xfs_fsmap_format_t formatter, void *arg); + +#endif /* __XFS_FSMAP_H__ */ diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 3531f8f72fa5..f61c84f8e31a 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -262,6 +262,22 @@ xfs_inode_clear_reclaim_tag( xfs_perag_clear_reclaim_tag(pag); } +static void +xfs_inew_wait( + struct xfs_inode *ip) +{ + wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_INEW_BIT); + DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_INEW_BIT); + + do { + prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); + if (!xfs_iflags_test(ip, XFS_INEW)) + break; + schedule(); + } while (true); + finish_wait(wq, &wait.wait); +} + /* * When we recycle a reclaimable inode, we need to re-initialise the VFS inode * part of the structure. This is made more complex by the fact we store @@ -366,14 +382,17 @@ xfs_iget_cache_hit( error = xfs_reinit_inode(mp, inode); if (error) { + bool wake; /* * Re-initializing the inode failed, and we are in deep * trouble. Try to re-add it to the reclaim list. */ rcu_read_lock(); spin_lock(&ip->i_flags_lock); - + wake = !!__xfs_iflags_test(ip, XFS_INEW); ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM); + if (wake) + wake_up_bit(&ip->i_flags, __XFS_INEW_BIT); ASSERT(ip->i_flags & XFS_IRECLAIMABLE); trace_xfs_iget_reclaim_fail(ip); goto out_error; @@ -623,9 +642,11 @@ out_error_or_again: STATIC int xfs_inode_ag_walk_grab( - struct xfs_inode *ip) + struct xfs_inode *ip, + int flags) { struct inode *inode = VFS_I(ip); + bool newinos = !!(flags & XFS_AGITER_INEW_WAIT); ASSERT(rcu_read_lock_held()); @@ -643,7 +664,8 @@ xfs_inode_ag_walk_grab( goto out_unlock_noent; /* avoid new or reclaimable inodes. Leave for reclaim code to flush */ - if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM)) + if ((!newinos && __xfs_iflags_test(ip, XFS_INEW)) || + __xfs_iflags_test(ip, XFS_IRECLAIMABLE | XFS_IRECLAIM)) goto out_unlock_noent; spin_unlock(&ip->i_flags_lock); @@ -671,7 +693,8 @@ xfs_inode_ag_walk( void *args), int flags, void *args, - int tag) + int tag, + int iter_flags) { uint32_t first_index; int last_error = 0; @@ -713,7 +736,7 @@ restart: for (i = 0; i < nr_found; i++) { struct xfs_inode *ip = batch[i]; - if (done || xfs_inode_ag_walk_grab(ip)) + if (done || xfs_inode_ag_walk_grab(ip, iter_flags)) batch[i] = NULL; /* @@ -741,6 +764,9 @@ restart: for (i = 0; i < nr_found; i++) { if (!batch[i]) continue; + if ((iter_flags & XFS_AGITER_INEW_WAIT) && + xfs_iflags_test(batch[i], XFS_INEW)) + xfs_inew_wait(batch[i]); error = execute(batch[i], flags, args); IRELE(batch[i]); if (error == -EAGAIN) { @@ -820,12 +846,13 @@ xfs_cowblocks_worker( } int -xfs_inode_ag_iterator( +xfs_inode_ag_iterator_flags( struct xfs_mount *mp, int (*execute)(struct xfs_inode *ip, int flags, void *args), int flags, - void *args) + void *args, + int iter_flags) { struct xfs_perag *pag; int error = 0; @@ -835,7 +862,8 @@ xfs_inode_ag_iterator( ag = 0; while ((pag = xfs_perag_get(mp, ag))) { ag = pag->pag_agno + 1; - error = xfs_inode_ag_walk(mp, pag, execute, flags, args, -1); + error = xfs_inode_ag_walk(mp, pag, execute, flags, args, -1, + iter_flags); xfs_perag_put(pag); if (error) { last_error = error; @@ -847,6 +875,17 @@ xfs_inode_ag_iterator( } int +xfs_inode_ag_iterator( + struct xfs_mount *mp, + int (*execute)(struct xfs_inode *ip, int flags, + void *args), + int flags, + void *args) +{ + return xfs_inode_ag_iterator_flags(mp, execute, flags, args, 0); +} + +int xfs_inode_ag_iterator_tag( struct xfs_mount *mp, int (*execute)(struct xfs_inode *ip, int flags, @@ -863,7 +902,8 @@ xfs_inode_ag_iterator_tag( ag = 0; while ((pag = xfs_perag_get_tag(mp, ag, tag))) { ag = pag->pag_agno + 1; - error = xfs_inode_ag_walk(mp, pag, execute, flags, args, tag); + error = xfs_inode_ag_walk(mp, pag, execute, flags, args, tag, + 0); xfs_perag_put(pag); if (error) { last_error = error; diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index 8a7c849b4dea..9183f77958ef 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -48,6 +48,11 @@ struct xfs_eofblocks { #define XFS_IGET_UNTRUSTED 0x2 #define XFS_IGET_DONTCACHE 0x4 +/* + * flags for AG inode iterator + */ +#define XFS_AGITER_INEW_WAIT 0x1 /* wait on new inodes */ + int xfs_iget(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino, uint flags, uint lock_flags, xfs_inode_t **ipp); @@ -79,6 +84,9 @@ void xfs_cowblocks_worker(struct work_struct *); int xfs_inode_ag_iterator(struct xfs_mount *mp, int (*execute)(struct xfs_inode *ip, int flags, void *args), int flags, void *args); +int xfs_inode_ag_iterator_flags(struct xfs_mount *mp, + int (*execute)(struct xfs_inode *ip, int flags, void *args), + int flags, void *args, int iter_flags); int xfs_inode_ag_iterator_tag(struct xfs_mount *mp, int (*execute)(struct xfs_inode *ip, int flags, void *args), int flags, void *args, int tag); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 7605d8396596..ec9826c56500 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1906,12 +1906,13 @@ xfs_inactive( * force is true because we are evicting an inode from the * cache. Post-eof blocks must be freed, lest we end up with * broken free space accounting. + * + * Note: don't bother with iolock here since lockdep complains + * about acquiring it in reclaim context. We have the only + * reference to the inode at this point anyways. */ - if (xfs_can_free_eofblocks(ip, true)) { - xfs_ilock(ip, XFS_IOLOCK_EXCL); + if (xfs_can_free_eofblocks(ip, true)) xfs_free_eofblocks(ip); - xfs_iunlock(ip, XFS_IOLOCK_EXCL); - } return; } diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 10dcf27b4c85..10e89fcb49d7 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -216,7 +216,8 @@ static inline bool xfs_is_reflink_inode(struct xfs_inode *ip) #define XFS_IRECLAIM (1 << 0) /* started reclaiming this inode */ #define XFS_ISTALE (1 << 1) /* inode has been staled */ #define XFS_IRECLAIMABLE (1 << 2) /* inode can be reclaimed */ -#define XFS_INEW (1 << 3) /* inode has just been allocated */ +#define __XFS_INEW_BIT 3 /* inode has just been allocated */ +#define XFS_INEW (1 << __XFS_INEW_BIT) #define XFS_ITRUNCATED (1 << 5) /* truncated down so flush-on-close */ #define XFS_IDIRTY_RELEASE (1 << 6) /* dirty release already seen */ #define __XFS_IFLOCK_BIT 7 /* inode is being flushed right now */ @@ -464,6 +465,7 @@ static inline void xfs_finish_inode_setup(struct xfs_inode *ip) xfs_iflags_clear(ip, XFS_INEW); barrier(); unlock_new_inode(VFS_I(ip)); + wake_up_bit(&ip->i_flags, __XFS_INEW_BIT); } static inline void xfs_setup_existing_inode(struct xfs_inode *ip) diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index d90e7811ccdd..08cb7d1a4a3a 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -731,22 +731,27 @@ xfs_iflush_done( * holding the lock before removing the inode from the AIL. */ if (need_ail) { - struct xfs_log_item *log_items[need_ail]; - int i = 0; + bool mlip_changed = false; + + /* this is an opencoded batch version of xfs_trans_ail_delete */ spin_lock(&ailp->xa_lock); for (blip = lip; blip; blip = blip->li_bio_list) { - iip = INODE_ITEM(blip); - if (iip->ili_logged && - blip->li_lsn == iip->ili_flush_lsn) { - log_items[i++] = blip; - } - ASSERT(i <= need_ail); + if (INODE_ITEM(blip)->ili_logged && + blip->li_lsn == INODE_ITEM(blip)->ili_flush_lsn) + mlip_changed |= xfs_ail_delete_one(ailp, blip); } - /* xfs_trans_ail_delete_bulk() drops the AIL lock. */ - xfs_trans_ail_delete_bulk(ailp, log_items, i, - SHUTDOWN_CORRUPT_INCORE); - } + if (mlip_changed) { + if (!XFS_FORCED_SHUTDOWN(ailp->xa_mount)) + xlog_assign_tail_lsn_locked(ailp->xa_mount); + if (list_empty(&ailp->xa_ail)) + wake_up_all(&ailp->xa_empty); + } + spin_unlock(&ailp->xa_lock); + + if (mlip_changed) + xfs_log_space_wake(ailp->xa_mount); + } /* * clean up and unlock the flush lock now we are done. We can clear the diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 2fd7fdf5438f..6190697603c9 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -41,6 +41,9 @@ #include "xfs_trans.h" #include "xfs_pnfs.h" #include "xfs_acl.h" +#include "xfs_btree.h" +#include <linux/fsmap.h> +#include "xfs_fsmap.h" #include <linux/capability.h> #include <linux/cred.h> @@ -1543,10 +1546,11 @@ xfs_ioc_getbmap( unsigned int cmd, void __user *arg) { - struct getbmapx bmx; + struct getbmapx bmx = { 0 }; int error; - if (copy_from_user(&bmx, arg, sizeof(struct getbmapx))) + /* struct getbmap is a strict subset of struct getbmapx. */ + if (copy_from_user(&bmx, arg, offsetof(struct getbmapx, bmv_iflags))) return -EFAULT; if (bmx.bmv_count < 2) @@ -1608,6 +1612,84 @@ xfs_ioc_getbmapx( return 0; } +struct getfsmap_info { + struct xfs_mount *mp; + struct fsmap_head __user *data; + unsigned int idx; + __u32 last_flags; +}; + +STATIC int +xfs_getfsmap_format(struct xfs_fsmap *xfm, void *priv) +{ + struct getfsmap_info *info = priv; + struct fsmap fm; + + trace_xfs_getfsmap_mapping(info->mp, xfm); + + info->last_flags = xfm->fmr_flags; + xfs_fsmap_from_internal(&fm, xfm); + if (copy_to_user(&info->data->fmh_recs[info->idx++], &fm, + sizeof(struct fsmap))) + return -EFAULT; + + return 0; +} + +STATIC int +xfs_ioc_getfsmap( + struct xfs_inode *ip, + struct fsmap_head __user *arg) +{ + struct getfsmap_info info = { NULL }; + struct xfs_fsmap_head xhead = {0}; + struct fsmap_head head; + bool aborted = false; + int error; + + if (copy_from_user(&head, arg, sizeof(struct fsmap_head))) + return -EFAULT; + if (memchr_inv(head.fmh_reserved, 0, sizeof(head.fmh_reserved)) || + memchr_inv(head.fmh_keys[0].fmr_reserved, 0, + sizeof(head.fmh_keys[0].fmr_reserved)) || + memchr_inv(head.fmh_keys[1].fmr_reserved, 0, + sizeof(head.fmh_keys[1].fmr_reserved))) + return -EINVAL; + + xhead.fmh_iflags = head.fmh_iflags; + xhead.fmh_count = head.fmh_count; + xfs_fsmap_to_internal(&xhead.fmh_keys[0], &head.fmh_keys[0]); + xfs_fsmap_to_internal(&xhead.fmh_keys[1], &head.fmh_keys[1]); + + trace_xfs_getfsmap_low_key(ip->i_mount, &xhead.fmh_keys[0]); + trace_xfs_getfsmap_high_key(ip->i_mount, &xhead.fmh_keys[1]); + + info.mp = ip->i_mount; + info.data = arg; + error = xfs_getfsmap(ip->i_mount, &xhead, xfs_getfsmap_format, &info); + if (error == XFS_BTREE_QUERY_RANGE_ABORT) { + error = 0; + aborted = true; + } else if (error) + return error; + + /* If we didn't abort, set the "last" flag in the last fmx */ + if (!aborted && info.idx) { + info.last_flags |= FMR_OF_LAST; + if (copy_to_user(&info.data->fmh_recs[info.idx - 1].fmr_flags, + &info.last_flags, sizeof(info.last_flags))) + return -EFAULT; + } + + /* copy back header */ + head.fmh_entries = xhead.fmh_entries; + head.fmh_oflags = xhead.fmh_oflags; + if (copy_to_user(arg, &head, sizeof(struct fsmap_head))) + return -EFAULT; + + return 0; +} + int xfs_ioc_swapext( xfs_swapext_t *sxp) @@ -1788,6 +1870,9 @@ xfs_file_ioctl( case XFS_IOC_GETBMAPX: return xfs_ioc_getbmapx(ip, arg); + case FS_IOC_GETFSMAP: + return xfs_ioc_getfsmap(ip, arg); + case XFS_IOC_FD_TO_HANDLE: case XFS_IOC_PATH_TO_HANDLE: case XFS_IOC_PATH_TO_FSHANDLE: { diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index 7c49938c5aed..fa0bc4d46065 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -20,6 +20,7 @@ #include <linux/mount.h> #include <linux/slab.h> #include <linux/uaccess.h> +#include <linux/fsmap.h> #include "xfs.h" #include "xfs_fs.h" #include "xfs_format.h" @@ -554,6 +555,7 @@ xfs_file_compat_ioctl( case XFS_IOC_GOINGDOWN: case XFS_IOC_ERROR_INJECTION: case XFS_IOC_ERROR_CLEARALL: + case FS_IOC_GETFSMAP: return xfs_file_ioctl(filp, cmd, p); #ifndef BROKEN_X86_ALIGNMENT /* These are handled fine if no alignment issues */ diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 288ee5b840d7..a63f61c256bd 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -240,7 +240,7 @@ xfs_iomap_write_direct( */ if (IS_DAX(VFS_I(ip))) { bmapi_flags = XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO; - if (ISUNWRITTEN(imap)) { + if (imap->br_state == XFS_EXT_UNWRITTEN) { tflags |= XFS_TRANS_RESERVE; resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1; } @@ -945,7 +945,7 @@ static inline bool imap_needs_alloc(struct inode *inode, return !nimaps || imap->br_startblock == HOLESTARTBLOCK || imap->br_startblock == DELAYSTARTBLOCK || - (IS_DAX(inode) && ISUNWRITTEN(imap)); + (IS_DAX(inode) && imap->br_state == XFS_EXT_UNWRITTEN); } static inline bool need_excl_ilock(struct xfs_inode *ip, unsigned flags) @@ -976,6 +976,7 @@ xfs_file_iomap_begin( int nimaps = 1, error = 0; bool shared = false, trimmed = false; unsigned lockmode; + struct block_device *bdev; if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; @@ -1063,6 +1064,14 @@ xfs_file_iomap_begin( } xfs_bmbt_to_iomap(ip, iomap, &imap); + + /* optionally associate a dax device with the iomap bdev */ + bdev = iomap->bdev; + if (blk_queue_dax(bdev->bd_queue)) + iomap->dax_dev = dax_get_by_host(bdev->bd_disk->disk_name); + else + iomap->dax_dev = NULL; + if (shared) iomap->flags |= IOMAP_F_SHARED; return 0; @@ -1140,6 +1149,7 @@ xfs_file_iomap_end( unsigned flags, struct iomap *iomap) { + put_dax(iomap->dax_dev); if ((flags & IOMAP_WRITE) && iomap->type == IOMAP_DELALLOC) return xfs_file_iomap_end_delalloc(XFS_I(inode), offset, length, written, iomap); @@ -1170,10 +1180,10 @@ xfs_xattr_iomap_begin( if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; - lockmode = xfs_ilock_data_map_shared(ip); + lockmode = xfs_ilock_attr_map_shared(ip); /* if there are no attribute fork or extents, return ENOENT */ - if (XFS_IFORK_Q(ip) || !ip->i_d.di_anextents) { + if (!XFS_IFORK_Q(ip) || !ip->i_d.di_anextents) { error = -ENOENT; goto out_unlock; } diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index 592fdf7111cb..044fb0e15390 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -212,88 +212,6 @@ static inline kgid_t xfs_gid_to_kgid(__uint32_t gid) #define xfs_sort(a,n,s,fn) sort(a,n,s,fn,NULL) #define xfs_stack_trace() dump_stack() - -/* Move the kernel do_div definition off to one side */ - -#if defined __i386__ -/* For ia32 we need to pull some tricks to get past various versions - * of the compiler which do not like us using do_div in the middle - * of large functions. - */ -static inline __u32 xfs_do_div(void *a, __u32 b, int n) -{ - __u32 mod; - - switch (n) { - case 4: - mod = *(__u32 *)a % b; - *(__u32 *)a = *(__u32 *)a / b; - return mod; - case 8: - { - unsigned long __upper, __low, __high, __mod; - __u64 c = *(__u64 *)a; - __upper = __high = c >> 32; - __low = c; - if (__high) { - __upper = __high % (b); - __high = __high / (b); - } - asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (b), "0" (__low), "1" (__upper)); - asm("":"=A" (c):"a" (__low),"d" (__high)); - *(__u64 *)a = c; - return __mod; - } - } - - /* NOTREACHED */ - return 0; -} - -/* Side effect free 64 bit mod operation */ -static inline __u32 xfs_do_mod(void *a, __u32 b, int n) -{ - switch (n) { - case 4: - return *(__u32 *)a % b; - case 8: - { - unsigned long __upper, __low, __high, __mod; - __u64 c = *(__u64 *)a; - __upper = __high = c >> 32; - __low = c; - if (__high) { - __upper = __high % (b); - __high = __high / (b); - } - asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (b), "0" (__low), "1" (__upper)); - asm("":"=A" (c):"a" (__low),"d" (__high)); - return __mod; - } - } - - /* NOTREACHED */ - return 0; -} -#else -static inline __u32 xfs_do_div(void *a, __u32 b, int n) -{ - __u32 mod; - - switch (n) { - case 4: - mod = *(__u32 *)a % b; - *(__u32 *)a = *(__u32 *)a / b; - return mod; - case 8: - mod = do_div(*(__u64 *)a, b); - return mod; - } - - /* NOTREACHED */ - return 0; -} - /* Side effect free 64 bit mod operation */ static inline __u32 xfs_do_mod(void *a, __u32 b, int n) { @@ -310,10 +228,7 @@ static inline __u32 xfs_do_mod(void *a, __u32 b, int n) /* NOTREACHED */ return 0; } -#endif -#undef do_div -#define do_div(a, b) xfs_do_div(&(a), (b), sizeof(a)) #define do_mod(a, b) xfs_do_mod(&(a), (b), sizeof(a)) static inline __uint64_t roundup_64(__uint64_t x, __uint32_t y) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index b1469f0a91a6..3731f13f63e9 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1293,7 +1293,7 @@ void xfs_log_work_queue( struct xfs_mount *mp) { - queue_delayed_work(mp->m_log_workqueue, &mp->m_log->l_work, + queue_delayed_work(mp->m_sync_workqueue, &mp->m_log->l_work, msecs_to_jiffies(xfs_syncd_centisecs * 10)); } @@ -1852,7 +1852,7 @@ xlog_sync( */ if (log->l_badcrc_factor && (prandom_u32() % log->l_badcrc_factor == 0)) { - iclog->ic_header.h_crc &= 0xAAAAAAAA; + iclog->ic_header.h_crc &= cpu_to_le32(0xAAAAAAAA); iclog->ic_state |= XLOG_STATE_IOABORT; xfs_warn(log->l_mp, "Intentionally corrupted log record at LSN 0x%llx. Shutdown imminent.", diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 688ebff1f663..2eaf81859166 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -73,6 +73,10 @@ xfs_uuid_mount( uuid_t *uuid = &mp->m_sb.sb_uuid; int hole, i; + /* Publish UUID in struct super_block */ + BUILD_BUG_ON(sizeof(mp->m_super->s_uuid) != sizeof(uuid_t)); + memcpy(&mp->m_super->s_uuid, uuid, sizeof(uuid_t)); + if (mp->m_flags & XFS_MOUNT_NOUUID) return 0; diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 6db6fd6b82b0..9fa312a41c93 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -183,6 +183,7 @@ typedef struct xfs_mount { struct workqueue_struct *m_reclaim_workqueue; struct workqueue_struct *m_log_workqueue; struct workqueue_struct *m_eofblocks_workqueue; + struct workqueue_struct *m_sync_workqueue; /* * Generation of the filesysyem layout. This is incremented by each @@ -312,7 +313,7 @@ void xfs_do_force_shutdown(struct xfs_mount *mp, int flags, char *fname, static inline xfs_agnumber_t xfs_daddr_to_agno(struct xfs_mount *mp, xfs_daddr_t d) { - xfs_daddr_t ld = XFS_BB_TO_FSBT(mp, d); + xfs_rfsblock_t ld = XFS_BB_TO_FSBT(mp, d); do_div(ld, mp->m_sb.sb_agblocks); return (xfs_agnumber_t) ld; } @@ -320,7 +321,7 @@ xfs_daddr_to_agno(struct xfs_mount *mp, xfs_daddr_t d) static inline xfs_agblock_t xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d) { - xfs_daddr_t ld = XFS_BB_TO_FSBT(mp, d); + xfs_rfsblock_t ld = XFS_BB_TO_FSBT(mp, d); return (xfs_agblock_t) do_div(ld, mp->m_sb.sb_agblocks); } diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index b669b123287b..5fe6e70b88ef 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -851,8 +851,8 @@ xfs_qm_reset_dqcounts( * started afresh by xfs_qm_quotacheck. */ #ifdef DEBUG - j = XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB); - do_div(j, sizeof(xfs_dqblk_t)); + j = (int)XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB) / + sizeof(xfs_dqblk_t); ASSERT(mp->m_quotainfo->qi_dqperchunk == j); #endif dqb = bp->b_addr; @@ -1384,12 +1384,7 @@ xfs_qm_quotacheck( mp->m_qflags |= flags; error_return: - while (!list_empty(&buffer_list)) { - struct xfs_buf *bp = - list_first_entry(&buffer_list, struct xfs_buf, b_list); - list_del_init(&bp->b_list); - xfs_buf_relse(bp); - } + xfs_buf_delwri_cancel(&buffer_list); if (error) { xfs_warn(mp, diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 475a3882a81f..9cb5c381b01c 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -759,5 +759,6 @@ xfs_qm_dqrele_all_inodes( uint flags) { ASSERT(mp->m_quotainfo); - xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, NULL); + xfs_inode_ag_iterator_flags(mp, xfs_dqrele_inode, flags, NULL, + XFS_AGITER_INEW_WAIT); } diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index 6e4c7446c3d4..96fe209b5eb6 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -221,6 +221,7 @@ void xfs_cui_release( struct xfs_cui_log_item *cuip) { + ASSERT(atomic_read(&cuip->cui_refcount) > 0); if (atomic_dec_and_test(&cuip->cui_refcount)) { xfs_trans_ail_remove(&cuip->cui_item, SHUTDOWN_LOG_IO_ERROR); xfs_cui_item_free(cuip); diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 4a84c5ea266d..ffe6fe7a7eb5 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -206,11 +206,7 @@ xfs_reflink_trim_around_shared( int error = 0; /* Holes, unwritten, and delalloc extents cannot be shared */ - if (!xfs_is_reflink_inode(ip) || - ISUNWRITTEN(irec) || - irec->br_startblock == HOLESTARTBLOCK || - irec->br_startblock == DELAYSTARTBLOCK || - isnullstartblock(irec->br_startblock)) { + if (!xfs_is_reflink_inode(ip) || !xfs_bmap_is_real_extent(irec)) { *shared = false; return 0; } @@ -709,8 +705,22 @@ xfs_reflink_end_cow( offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count); - /* Start a rolling transaction to switch the mappings */ - resblks = XFS_EXTENTADD_SPACE_RES(ip->i_mount, XFS_DATA_FORK); + /* + * Start a rolling transaction to switch the mappings. We're + * unlikely ever to have to remap 16T worth of single-block + * extents, so just cap the worst case extent count to 2^32-1. + * Stick a warning in just in case, and avoid 64-bit division. + */ + BUILD_BUG_ON(MAX_RW_COUNT > UINT_MAX); + if (end_fsb - offset_fsb > UINT_MAX) { + error = -EFSCORRUPTED; + xfs_force_shutdown(ip->i_mount, SHUTDOWN_CORRUPT_INCORE); + ASSERT(0); + goto out; + } + resblks = XFS_NEXTENTADD_SPACE_RES(ip->i_mount, + (unsigned int)(end_fsb - offset_fsb), + XFS_DATA_FORK); error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_write, resblks, 0, 0, &tp); if (error) @@ -1045,12 +1055,12 @@ xfs_reflink_remap_extent( xfs_off_t new_isize) { struct xfs_mount *mp = ip->i_mount; + bool real_extent = xfs_bmap_is_real_extent(irec); struct xfs_trans *tp; xfs_fsblock_t firstfsb; unsigned int resblks; struct xfs_defer_ops dfops; struct xfs_bmbt_irec uirec; - bool real_extent; xfs_filblks_t rlen; xfs_filblks_t unmap_len; xfs_off_t newlen; @@ -1059,11 +1069,6 @@ xfs_reflink_remap_extent( unmap_len = irec->br_startoff + irec->br_blockcount - destoff; trace_xfs_reflink_punch_range(ip, destoff, unmap_len); - /* Only remap normal extents. */ - real_extent = (irec->br_startblock != HOLESTARTBLOCK && - irec->br_startblock != DELAYSTARTBLOCK && - !ISUNWRITTEN(irec)); - /* No reflinking if we're low on space */ if (real_extent) { error = xfs_reflink_ag_has_free_space(mp, @@ -1359,9 +1364,7 @@ xfs_reflink_dirty_extents( goto out; if (nmaps == 0) break; - if (map[0].br_startblock == HOLESTARTBLOCK || - map[0].br_startblock == DELAYSTARTBLOCK || - ISUNWRITTEN(&map[0])) + if (!xfs_bmap_is_real_extent(&map[0])) goto next; map[1] = map[0]; @@ -1435,9 +1438,7 @@ xfs_reflink_clear_inode_flag( return error; if (nmaps == 0) break; - if (map.br_startblock == HOLESTARTBLOCK || - map.br_startblock == DELAYSTARTBLOCK || - ISUNWRITTEN(&map)) + if (!xfs_bmap_is_real_extent(&map)) goto next; agno = XFS_FSB_TO_AGNO(mp, map.br_startblock); diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index 73c827831551..f3b139c9aa16 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -243,6 +243,7 @@ void xfs_rui_release( struct xfs_rui_log_item *ruip) { + ASSERT(atomic_read(&ruip->rui_refcount) > 0); if (atomic_dec_and_test(&ruip->rui_refcount)) { xfs_trans_ail_remove(&ruip->rui_item, SHUTDOWN_LOG_IO_ERROR); xfs_rui_item_free(ruip); diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h index 51dd3c726608..f13133e6f19f 100644 --- a/fs/xfs/xfs_rtalloc.h +++ b/fs/xfs/xfs_rtalloc.h @@ -23,6 +23,16 @@ struct xfs_mount; struct xfs_trans; +struct xfs_rtalloc_rec { + xfs_rtblock_t ar_startblock; + xfs_rtblock_t ar_blockcount; +}; + +typedef int (*xfs_rtalloc_query_range_fn)( + struct xfs_trans *tp, + struct xfs_rtalloc_rec *rec, + void *priv); + #ifdef CONFIG_XFS_RT /* * Function prototypes for exported functions. @@ -118,13 +128,21 @@ int xfs_rtmodify_summary(struct xfs_mount *mp, struct xfs_trans *tp, int log, int xfs_rtfree_range(struct xfs_mount *mp, struct xfs_trans *tp, xfs_rtblock_t start, xfs_extlen_t len, struct xfs_buf **rbpp, xfs_fsblock_t *rsb); - - +int xfs_rtalloc_query_range(struct xfs_trans *tp, + struct xfs_rtalloc_rec *low_rec, + struct xfs_rtalloc_rec *high_rec, + xfs_rtalloc_query_range_fn fn, + void *priv); +int xfs_rtalloc_query_all(struct xfs_trans *tp, + xfs_rtalloc_query_range_fn fn, + void *priv); #else # define xfs_rtallocate_extent(t,b,min,max,l,f,p,rb) (ENOSYS) # define xfs_rtfree_extent(t,b,l) (ENOSYS) # define xfs_rtpick_extent(m,t,l,rb) (ENOSYS) # define xfs_growfs_rt(mp,in) (ENOSYS) +# define xfs_rtalloc_query_range(t,l,h,f,p) (ENOSYS) +# define xfs_rtalloc_query_all(t,f,p) (ENOSYS) static inline int /* error */ xfs_rtmount_init( xfs_mount_t *mp) /* file system mount structure */ diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 685c042a120f..47d239dcf3f4 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -877,8 +877,15 @@ xfs_init_mount_workqueues( if (!mp->m_eofblocks_workqueue) goto out_destroy_log; + mp->m_sync_workqueue = alloc_workqueue("xfs-sync/%s", WQ_FREEZABLE, 0, + mp->m_fsname); + if (!mp->m_sync_workqueue) + goto out_destroy_eofb; + return 0; +out_destroy_eofb: + destroy_workqueue(mp->m_eofblocks_workqueue); out_destroy_log: destroy_workqueue(mp->m_log_workqueue); out_destroy_reclaim: @@ -899,6 +906,7 @@ STATIC void xfs_destroy_mount_workqueues( struct xfs_mount *mp) { + destroy_workqueue(mp->m_sync_workqueue); destroy_workqueue(mp->m_eofblocks_workqueue); destroy_workqueue(mp->m_log_workqueue); destroy_workqueue(mp->m_reclaim_workqueue); diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c index 7f17ae6d709a..5d95fe348294 100644 --- a/fs/xfs/xfs_trace.c +++ b/fs/xfs/xfs_trace.c @@ -47,6 +47,7 @@ #include "xfs_inode_item.h" #include "xfs_bmap_btree.h" #include "xfs_filestream.h" +#include "xfs_fsmap.h" /* * We include this last to have the helpers above available for the trace diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 383ac227ce2c..7c5a16528d8b 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -40,6 +40,8 @@ struct xfs_inode_log_format; struct xfs_bmbt_irec; struct xfs_btree_cur; struct xfs_refcount_irec; +struct xfs_fsmap; +struct xfs_rmap_irec; DECLARE_EVENT_CLASS(xfs_attr_list_class, TP_PROTO(struct xfs_attr_list_context *ctx), @@ -2190,7 +2192,7 @@ DECLARE_EVENT_CLASS(xfs_discard_class, __entry->agbno = agbno; __entry->len = len; ), - TP_printk("dev %d:%d agno %u agbno %u len %u\n", + TP_printk("dev %d:%d agno %u agbno %u len %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->agbno, @@ -2253,8 +2255,8 @@ DECLARE_EVENT_CLASS(xfs_defer_class, TP_STRUCT__entry( __field(dev_t, dev) __field(void *, dop) - __field(bool, committed) - __field(bool, low) + __field(char, committed) + __field(char, low) ), TP_fast_assign( __entry->dev = mp ? mp->m_super->s_dev : 0; @@ -2262,7 +2264,7 @@ DECLARE_EVENT_CLASS(xfs_defer_class, __entry->committed = dop->dop_committed; __entry->low = dop->dop_low; ), - TP_printk("dev %d:%d ops %p committed %d low %d\n", + TP_printk("dev %d:%d ops %p committed %d low %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->dop, __entry->committed, @@ -2279,8 +2281,8 @@ DECLARE_EVENT_CLASS(xfs_defer_error_class, TP_STRUCT__entry( __field(dev_t, dev) __field(void *, dop) - __field(bool, committed) - __field(bool, low) + __field(char, committed) + __field(char, low) __field(int, error) ), TP_fast_assign( @@ -2290,7 +2292,7 @@ DECLARE_EVENT_CLASS(xfs_defer_error_class, __entry->low = dop->dop_low; __entry->error = error; ), - TP_printk("dev %d:%d ops %p committed %d low %d err %d\n", + TP_printk("dev %d:%d ops %p committed %d low %d err %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->dop, __entry->committed, @@ -2309,7 +2311,7 @@ DECLARE_EVENT_CLASS(xfs_defer_pending_class, __field(dev_t, dev) __field(int, type) __field(void *, intent) - __field(bool, committed) + __field(char, committed) __field(int, nr) ), TP_fast_assign( @@ -2319,7 +2321,7 @@ DECLARE_EVENT_CLASS(xfs_defer_pending_class, __entry->committed = dfp->dfp_done != NULL; __entry->nr = dfp->dfp_count; ), - TP_printk("dev %d:%d optype %d intent %p committed %d nr %d\n", + TP_printk("dev %d:%d optype %d intent %p committed %d nr %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->type, __entry->intent, @@ -2614,7 +2616,8 @@ DECLARE_EVENT_CLASS(xfs_ag_resv_class, __entry->asked = r ? r->ar_asked : 0; __entry->len = len; ), - TP_printk("dev %d:%d agno %u resv %d freeblks %u flcount %u resv %u ask %u len %u\n", + TP_printk("dev %d:%d agno %u resv %d freeblks %u flcount %u " + "resv %u ask %u len %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->resv, @@ -2667,7 +2670,7 @@ DECLARE_EVENT_CLASS(xfs_ag_btree_lookup_class, __entry->agbno = agbno; __entry->dir = dir; ), - TP_printk("dev %d:%d agno %u agbno %u cmp %s(%d)\n", + TP_printk("dev %d:%d agno %u agbno %u cmp %s(%d)", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->agbno, @@ -2700,7 +2703,7 @@ DECLARE_EVENT_CLASS(xfs_refcount_extent_class, __entry->blockcount = irec->rc_blockcount; __entry->refcount = irec->rc_refcount; ), - TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u\n", + TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->startblock, @@ -2735,7 +2738,7 @@ DECLARE_EVENT_CLASS(xfs_refcount_extent_at_class, __entry->refcount = irec->rc_refcount; __entry->agbno = agbno; ), - TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u @ agbno %u\n", + TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u @ agbno %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->startblock, @@ -2776,7 +2779,7 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_class, __entry->i2_refcount = i2->rc_refcount; ), TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u -- " - "agbno %u len %u refcount %u\n", + "agbno %u len %u refcount %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->i1_startblock, @@ -2822,7 +2825,7 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_at_class, __entry->agbno = agbno; ), TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u -- " - "agbno %u len %u refcount %u @ agbno %u\n", + "agbno %u len %u refcount %u @ agbno %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->i1_startblock, @@ -2875,7 +2878,7 @@ DECLARE_EVENT_CLASS(xfs_refcount_triple_extent_class, ), TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u -- " "agbno %u len %u refcount %u -- " - "agbno %u len %u refcount %u\n", + "agbno %u len %u refcount %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->i1_startblock, @@ -3001,31 +3004,6 @@ DEFINE_EVENT(xfs_inode_error_class, name, \ unsigned long caller_ip), \ TP_ARGS(ip, error, caller_ip)) -/* reflink allocator */ -TRACE_EVENT(xfs_bmap_remap_alloc, - TP_PROTO(struct xfs_inode *ip, xfs_fsblock_t fsbno, - xfs_extlen_t len), - TP_ARGS(ip, fsbno, len), - TP_STRUCT__entry( - __field(dev_t, dev) - __field(xfs_ino_t, ino) - __field(xfs_fsblock_t, fsbno) - __field(xfs_extlen_t, len) - ), - TP_fast_assign( - __entry->dev = VFS_I(ip)->i_sb->s_dev; - __entry->ino = ip->i_ino; - __entry->fsbno = fsbno; - __entry->len = len; - ), - TP_printk("dev %d:%d ino 0x%llx fsbno 0x%llx len %x", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->ino, - __entry->fsbno, - __entry->len) -); -DEFINE_INODE_ERROR_EVENT(xfs_bmap_remap_alloc_error); - /* reflink tracepoint classes */ /* two-file io tracepoint class */ @@ -3227,7 +3205,7 @@ TRACE_EVENT(xfs_ioctl_clone, ), TP_printk("dev %d:%d " "ino 0x%lx isize 0x%llx -> " - "ino 0x%lx isize 0x%llx\n", + "ino 0x%lx isize 0x%llx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->src_ino, __entry->src_isize, @@ -3267,6 +3245,88 @@ DEFINE_INODE_IREC_EVENT(xfs_swap_extent_rmap_remap); DEFINE_INODE_IREC_EVENT(xfs_swap_extent_rmap_remap_piece); DEFINE_INODE_ERROR_EVENT(xfs_swap_extent_rmap_error); +/* fsmap traces */ +DECLARE_EVENT_CLASS(xfs_fsmap_class, + TP_PROTO(struct xfs_mount *mp, u32 keydev, xfs_agnumber_t agno, + struct xfs_rmap_irec *rmap), + TP_ARGS(mp, keydev, agno, rmap), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(dev_t, keydev) + __field(xfs_agnumber_t, agno) + __field(xfs_fsblock_t, bno) + __field(xfs_filblks_t, len) + __field(__uint64_t, owner) + __field(__uint64_t, offset) + __field(unsigned int, flags) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->keydev = new_decode_dev(keydev); + __entry->agno = agno; + __entry->bno = rmap->rm_startblock; + __entry->len = rmap->rm_blockcount; + __entry->owner = rmap->rm_owner; + __entry->offset = rmap->rm_offset; + __entry->flags = rmap->rm_flags; + ), + TP_printk("dev %d:%d keydev %d:%d agno %u bno %llu len %llu owner %lld offset %llu flags 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + MAJOR(__entry->keydev), MINOR(__entry->keydev), + __entry->agno, + __entry->bno, + __entry->len, + __entry->owner, + __entry->offset, + __entry->flags) +) +#define DEFINE_FSMAP_EVENT(name) \ +DEFINE_EVENT(xfs_fsmap_class, name, \ + TP_PROTO(struct xfs_mount *mp, u32 keydev, xfs_agnumber_t agno, \ + struct xfs_rmap_irec *rmap), \ + TP_ARGS(mp, keydev, agno, rmap)) +DEFINE_FSMAP_EVENT(xfs_fsmap_low_key); +DEFINE_FSMAP_EVENT(xfs_fsmap_high_key); +DEFINE_FSMAP_EVENT(xfs_fsmap_mapping); + +DECLARE_EVENT_CLASS(xfs_getfsmap_class, + TP_PROTO(struct xfs_mount *mp, struct xfs_fsmap *fsmap), + TP_ARGS(mp, fsmap), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(dev_t, keydev) + __field(xfs_daddr_t, block) + __field(xfs_daddr_t, len) + __field(__uint64_t, owner) + __field(__uint64_t, offset) + __field(__uint64_t, flags) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->keydev = new_decode_dev(fsmap->fmr_device); + __entry->block = fsmap->fmr_physical; + __entry->len = fsmap->fmr_length; + __entry->owner = fsmap->fmr_owner; + __entry->offset = fsmap->fmr_offset; + __entry->flags = fsmap->fmr_flags; + ), + TP_printk("dev %d:%d keydev %d:%d block %llu len %llu owner %lld offset %llu flags 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + MAJOR(__entry->keydev), MINOR(__entry->keydev), + __entry->block, + __entry->len, + __entry->owner, + __entry->offset, + __entry->flags) +) +#define DEFINE_GETFSMAP_EVENT(name) \ +DEFINE_EVENT(xfs_getfsmap_class, name, \ + TP_PROTO(struct xfs_mount *mp, struct xfs_fsmap *fsmap), \ + TP_ARGS(mp, fsmap)) +DEFINE_GETFSMAP_EVENT(xfs_getfsmap_low_key); +DEFINE_GETFSMAP_EVENT(xfs_getfsmap_high_key); +DEFINE_GETFSMAP_EVENT(xfs_getfsmap_mapping); + #endif /* _TRACE_XFS_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 70f42ea86dfb..2011620008de 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -134,7 +134,7 @@ xfs_trans_reserve( bool rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0; /* Mark this thread as being in a transaction */ - current_set_flags_nested(&tp->t_pflags, PF_FSTRANS); + current_set_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); /* * Attempt to reserve the needed disk blocks by decrementing @@ -144,7 +144,7 @@ xfs_trans_reserve( if (blocks > 0) { error = xfs_mod_fdblocks(tp->t_mountp, -((int64_t)blocks), rsvd); if (error != 0) { - current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); + current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); return -ENOSPC; } tp->t_blk_res += blocks; @@ -221,7 +221,7 @@ undo_blocks: tp->t_blk_res = 0; } - current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); + current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); return error; } @@ -263,6 +263,28 @@ xfs_trans_alloc( } /* + * Create an empty transaction with no reservation. This is a defensive + * mechanism for routines that query metadata without actually modifying + * them -- if the metadata being queried is somehow cross-linked (think a + * btree block pointer that points higher in the tree), we risk deadlock. + * However, blocks grabbed as part of a transaction can be re-grabbed. + * The verifiers will notice the corrupt block and the operation will fail + * back to userspace without deadlocking. + * + * Note the zero-length reservation; this transaction MUST be cancelled + * without any dirty data. + */ +int +xfs_trans_alloc_empty( + struct xfs_mount *mp, + struct xfs_trans **tpp) +{ + struct xfs_trans_res resv = {0}; + + return xfs_trans_alloc(mp, &resv, 0, 0, XFS_TRANS_NO_WRITECOUNT, tpp); +} + +/* * Record the indicated change to the given field for application * to the file system's superblock when the transaction commits. * For now, just store the change in the transaction structure. @@ -914,7 +936,7 @@ __xfs_trans_commit( xfs_log_commit_cil(mp, tp, &commit_lsn, regrant); - current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); + current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); xfs_trans_free(tp); /* @@ -944,7 +966,7 @@ out_unreserve: if (commit_lsn == -1 && !error) error = -EIO; } - current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); + current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); xfs_trans_free_items(tp, NULLCOMMITLSN, !!error); xfs_trans_free(tp); @@ -998,7 +1020,7 @@ xfs_trans_cancel( xfs_log_done(mp, tp->t_ticket, NULL, false); /* mark this thread as no longer being in a transaction */ - current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); + current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); xfs_trans_free_items(tp, NULLCOMMITLSN, dirty); xfs_trans_free(tp); @@ -1012,17 +1034,14 @@ xfs_trans_cancel( * chunk we've been working on and get a new transaction to continue. */ int -__xfs_trans_roll( +xfs_trans_roll( struct xfs_trans **tpp, - struct xfs_inode *dp, - int *committed) + struct xfs_inode *dp) { struct xfs_trans *trans; struct xfs_trans_res tres; int error; - *committed = 0; - /* * Ensure that the inode is always logged. */ @@ -1048,7 +1067,6 @@ __xfs_trans_roll( if (error) return error; - *committed = 1; trans = *tpp; /* @@ -1071,12 +1089,3 @@ __xfs_trans_roll( xfs_trans_ijoin(trans, dp, 0); return 0; } - -int -xfs_trans_roll( - struct xfs_trans **tpp, - struct xfs_inode *dp) -{ - int committed; - return __xfs_trans_roll(tpp, dp, &committed); -} diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 1646f659b60f..a07acbf0bd8a 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -158,6 +158,8 @@ typedef struct xfs_trans { int xfs_trans_alloc(struct xfs_mount *mp, struct xfs_trans_res *resp, uint blocks, uint rtextents, uint flags, struct xfs_trans **tpp); +int xfs_trans_alloc_empty(struct xfs_mount *mp, + struct xfs_trans **tpp); void xfs_trans_mod_sb(xfs_trans_t *, uint, int64_t); struct xfs_buf *xfs_trans_get_buf_map(struct xfs_trans *tp, @@ -226,7 +228,6 @@ int xfs_trans_free_extent(struct xfs_trans *, struct xfs_efd_log_item *, xfs_fsblock_t, xfs_extlen_t, struct xfs_owner_info *); int xfs_trans_commit(struct xfs_trans *); -int __xfs_trans_roll(struct xfs_trans **, struct xfs_inode *, int *); int xfs_trans_roll(struct xfs_trans **, struct xfs_inode *); void xfs_trans_cancel(xfs_trans_t *); int xfs_trans_ail_init(struct xfs_mount *); diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index d6c9c3e9e02b..9056c0f34a3c 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -684,8 +684,23 @@ xfs_trans_ail_update_bulk( } } -/* - * xfs_trans_ail_delete_bulk - remove multiple log items from the AIL +bool +xfs_ail_delete_one( + struct xfs_ail *ailp, + struct xfs_log_item *lip) +{ + struct xfs_log_item *mlip = xfs_ail_min(ailp); + + trace_xfs_ail_delete(lip, mlip->li_lsn, lip->li_lsn); + xfs_ail_delete(ailp, lip); + lip->li_flags &= ~XFS_LI_IN_AIL; + lip->li_lsn = 0; + + return mlip == lip; +} + +/** + * Remove a log items from the AIL * * @xfs_trans_ail_delete_bulk takes an array of log items that all need to * removed from the AIL. The caller is already holding the AIL lock, and done @@ -706,52 +721,36 @@ xfs_trans_ail_update_bulk( * before returning. */ void -xfs_trans_ail_delete_bulk( +xfs_trans_ail_delete( struct xfs_ail *ailp, - struct xfs_log_item **log_items, - int nr_items, + struct xfs_log_item *lip, int shutdown_type) __releases(ailp->xa_lock) { - xfs_log_item_t *mlip; - int mlip_changed = 0; - int i; + struct xfs_mount *mp = ailp->xa_mount; + bool mlip_changed; - mlip = xfs_ail_min(ailp); - - for (i = 0; i < nr_items; i++) { - struct xfs_log_item *lip = log_items[i]; - if (!(lip->li_flags & XFS_LI_IN_AIL)) { - struct xfs_mount *mp = ailp->xa_mount; - - spin_unlock(&ailp->xa_lock); - if (!XFS_FORCED_SHUTDOWN(mp)) { - xfs_alert_tag(mp, XFS_PTAG_AILDELETE, - "%s: attempting to delete a log item that is not in the AIL", - __func__); - xfs_force_shutdown(mp, shutdown_type); - } - return; + if (!(lip->li_flags & XFS_LI_IN_AIL)) { + spin_unlock(&ailp->xa_lock); + if (!XFS_FORCED_SHUTDOWN(mp)) { + xfs_alert_tag(mp, XFS_PTAG_AILDELETE, + "%s: attempting to delete a log item that is not in the AIL", + __func__); + xfs_force_shutdown(mp, shutdown_type); } - - trace_xfs_ail_delete(lip, mlip->li_lsn, lip->li_lsn); - xfs_ail_delete(ailp, lip); - lip->li_flags &= ~XFS_LI_IN_AIL; - lip->li_lsn = 0; - if (mlip == lip) - mlip_changed = 1; + return; } + mlip_changed = xfs_ail_delete_one(ailp, lip); if (mlip_changed) { - if (!XFS_FORCED_SHUTDOWN(ailp->xa_mount)) - xlog_assign_tail_lsn_locked(ailp->xa_mount); + if (!XFS_FORCED_SHUTDOWN(mp)) + xlog_assign_tail_lsn_locked(mp); if (list_empty(&ailp->xa_ail)) wake_up_all(&ailp->xa_empty); - spin_unlock(&ailp->xa_lock); + } + spin_unlock(&ailp->xa_lock); + if (mlip_changed) xfs_log_space_wake(ailp->xa_mount); - } else { - spin_unlock(&ailp->xa_lock); - } } int diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index 49931b72da8a..d91706c56c63 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h @@ -106,18 +106,9 @@ xfs_trans_ail_update( xfs_trans_ail_update_bulk(ailp, NULL, &lip, 1, lsn); } -void xfs_trans_ail_delete_bulk(struct xfs_ail *ailp, - struct xfs_log_item **log_items, int nr_items, - int shutdown_type) - __releases(ailp->xa_lock); -static inline void -xfs_trans_ail_delete( - struct xfs_ail *ailp, - xfs_log_item_t *lip, - int shutdown_type) __releases(ailp->xa_lock) -{ - xfs_trans_ail_delete_bulk(ailp, &lip, 1, shutdown_type); -} +bool xfs_ail_delete_one(struct xfs_ail *ailp, struct xfs_log_item *lip); +void xfs_trans_ail_delete(struct xfs_ail *ailp, struct xfs_log_item *lip, + int shutdown_type) __releases(ailp->xa_lock); static inline void xfs_trans_ail_remove( |