From b4caecd48005fbed3949dde6c1cb233142fd69e9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 14 Jan 2015 10:42:32 +0100 Subject: fs: introduce f_op->mmap_capabilities for nommu mmap support Since "BDI: Provide backing device capability information [try #3]" the backing_dev_info structure also provides flags for the kind of mmap operation available in a nommu environment, which is entirely unrelated to it's original purpose. Introduce a new nommu-only file operation to provide this information to the nommu mmap code instead. Splitting this from the backing_dev_info structure allows to remove lots of backing_dev_info instance that aren't otherwise needed, and entirely gets rid of the concept of providing a backing_dev_info for a character device. It also removes the need for the mtd_inodefs filesystem. Signed-off-by: Christoph Hellwig Reviewed-by: Tejun Heo Acked-by: Brian Norris Signed-off-by: Jens Axboe --- mm/backing-dev.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'mm/backing-dev.c') diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 0ae0df55000b..16c68958aeda 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -17,8 +17,6 @@ static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); struct backing_dev_info default_backing_dev_info = { .name = "default", .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE, - .state = 0, - .capabilities = BDI_CAP_MAP_COPY, }; EXPORT_SYMBOL_GPL(default_backing_dev_info); @@ -513,13 +511,12 @@ EXPORT_SYMBOL(bdi_destroy); * For use from filesystems to quickly init and register a bdi associated * with dirty writeback */ -int bdi_setup_and_register(struct backing_dev_info *bdi, char *name, - unsigned int cap) +int bdi_setup_and_register(struct backing_dev_info *bdi, char *name) { int err; bdi->name = name; - bdi->capabilities = cap; + bdi->capabilities = 0; err = bdi_init(bdi); if (err) return err; -- cgit v1.2.3 From b83ae6d421435c6204150300f1c25bfbd39cd62b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 14 Jan 2015 10:42:37 +0100 Subject: fs: remove mapping->backing_dev_info Now that we never use the backing_dev_info pointer in struct address_space we can simply remove it and save 4 to 8 bytes in every inode. Signed-off-by: Christoph Hellwig Acked-by: Ryusuke Konishi Reviewed-by: Tejun Heo Reviewed-by: Jan Kara Signed-off-by: Jens Axboe --- drivers/char/raw.c | 4 +--- fs/aio.c | 1 - fs/block_dev.c | 26 +------------------------- fs/btrfs/disk-io.c | 1 - fs/btrfs/inode.c | 6 ------ fs/ceph/inode.c | 2 -- fs/cifs/inode.c | 2 -- fs/configfs/inode.c | 1 - fs/ecryptfs/inode.c | 1 - fs/exofs/inode.c | 2 -- fs/fuse/inode.c | 1 - fs/gfs2/glock.c | 1 - fs/gfs2/ops_fstype.c | 1 - fs/hugetlbfs/inode.c | 1 - fs/inode.c | 13 ------------- fs/kernfs/inode.c | 1 - fs/ncpfs/inode.c | 1 - fs/nfs/inode.c | 1 - fs/nilfs2/gcinode.c | 1 - fs/nilfs2/mdt.c | 6 ++---- fs/nilfs2/page.c | 4 +--- fs/nilfs2/page.h | 3 +-- fs/nilfs2/super.c | 2 +- fs/ocfs2/dlmfs/dlmfs.c | 2 -- fs/ramfs/inode.c | 1 - fs/romfs/super.c | 3 --- fs/ubifs/dir.c | 2 -- fs/ubifs/super.c | 3 --- include/linux/fs.h | 3 +-- mm/backing-dev.c | 1 - mm/shmem.c | 1 - mm/swap_state.c | 1 - 32 files changed, 8 insertions(+), 91 deletions(-) (limited to 'mm/backing-dev.c') diff --git a/drivers/char/raw.c b/drivers/char/raw.c index a24891b97547..6e29bf2db536 100644 --- a/drivers/char/raw.c +++ b/drivers/char/raw.c @@ -104,11 +104,9 @@ static int raw_release(struct inode *inode, struct file *filp) mutex_lock(&raw_mutex); bdev = raw_devices[minor].binding; - if (--raw_devices[minor].inuse == 0) { + if (--raw_devices[minor].inuse == 0) /* Here inode->i_mapping == bdev->bd_inode->i_mapping */ inode->i_mapping = &inode->i_data; - inode->i_mapping->backing_dev_info = &default_backing_dev_info; - } mutex_unlock(&raw_mutex); blkdev_put(bdev, filp->f_mode | FMODE_EXCL); diff --git a/fs/aio.c b/fs/aio.c index 6f13d3fab07f..3bf8b1d250c3 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -176,7 +176,6 @@ static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages) inode->i_mapping->a_ops = &aio_ctx_aops; inode->i_mapping->private_data = ctx; - inode->i_mapping->backing_dev_info = &noop_backing_dev_info; inode->i_size = PAGE_SIZE * nr_pages; path.dentry = d_alloc_pseudo(aio_mnt->mnt_sb, &this); diff --git a/fs/block_dev.c b/fs/block_dev.c index 026ca7b8431c..a9f92794d7a0 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -60,19 +60,6 @@ static void bdev_write_inode(struct inode *inode) spin_unlock(&inode->i_lock); } -/* - * Move the inode from its current bdi to a new bdi. Make sure the inode - * is clean before moving so that it doesn't linger on the old bdi. - */ -static void bdev_inode_switch_bdi(struct inode *inode, - struct backing_dev_info *dst) -{ - spin_lock(&inode->i_lock); - WARN_ON_ONCE(inode->i_state & I_DIRTY); - inode->i_data.backing_dev_info = dst; - spin_unlock(&inode->i_lock); -} - /* Kill _all_ buffers and pagecache , dirty or not.. */ void kill_bdev(struct block_device *bdev) { @@ -589,7 +576,6 @@ struct block_device *bdget(dev_t dev) inode->i_bdev = bdev; inode->i_data.a_ops = &def_blk_aops; mapping_set_gfp_mask(&inode->i_data, GFP_USER); - inode->i_data.backing_dev_info = &default_backing_dev_info; spin_lock(&bdev_lock); list_add(&bdev->bd_list, &all_bdevs); spin_unlock(&bdev_lock); @@ -1150,8 +1136,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) bdev->bd_queue = disk->queue; bdev->bd_contains = bdev; if (!partno) { - struct backing_dev_info *bdi; - ret = -ENXIO; bdev->bd_part = disk_get_part(disk, partno); if (!bdev->bd_part) @@ -1177,11 +1161,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) } } - if (!ret) { + if (!ret) bd_set_size(bdev,(loff_t)get_capacity(disk)<<9); - bdi = blk_get_backing_dev_info(bdev); - bdev_inode_switch_bdi(bdev->bd_inode, bdi); - } /* * If the device is invalidated, rescan partition @@ -1208,8 +1189,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) if (ret) goto out_clear; bdev->bd_contains = whole; - bdev_inode_switch_bdi(bdev->bd_inode, - whole->bd_inode->i_data.backing_dev_info); bdev->bd_part = disk_get_part(disk, partno); if (!(disk->flags & GENHD_FL_UP) || !bdev->bd_part || !bdev->bd_part->nr_sects) { @@ -1249,7 +1228,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) bdev->bd_disk = NULL; bdev->bd_part = NULL; bdev->bd_queue = NULL; - bdev_inode_switch_bdi(bdev->bd_inode, &default_backing_dev_info); if (bdev != bdev->bd_contains) __blkdev_put(bdev->bd_contains, mode, 1); bdev->bd_contains = NULL; @@ -1474,8 +1452,6 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) * dirty data before. */ bdev_write_inode(bdev->bd_inode); - bdev_inode_switch_bdi(bdev->bd_inode, - &default_backing_dev_info); } if (bdev->bd_contains == bdev) { if (disk->fops->release) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index afc4092989cd..1ec872e3a926 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2318,7 +2318,6 @@ int open_ctree(struct super_block *sb, */ fs_info->btree_inode->i_size = OFFSET_MAX; fs_info->btree_inode->i_mapping->a_ops = &btree_aops; - fs_info->btree_inode->i_mapping->backing_dev_info = &fs_info->bdi; RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node); extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 8bf326affb94..54bcf639d1cf 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3608,7 +3608,6 @@ cache_acl: switch (inode->i_mode & S_IFMT) { case S_IFREG: inode->i_mapping->a_ops = &btrfs_aops; - inode->i_mapping->backing_dev_info = &root->fs_info->bdi; BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; inode->i_fop = &btrfs_file_operations; inode->i_op = &btrfs_file_inode_operations; @@ -3623,7 +3622,6 @@ cache_acl: case S_IFLNK: inode->i_op = &btrfs_symlink_inode_operations; inode->i_mapping->a_ops = &btrfs_symlink_aops; - inode->i_mapping->backing_dev_info = &root->fs_info->bdi; break; default: inode->i_op = &btrfs_special_inode_operations; @@ -6088,7 +6086,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, inode->i_fop = &btrfs_file_operations; inode->i_op = &btrfs_file_inode_operations; inode->i_mapping->a_ops = &btrfs_aops; - inode->i_mapping->backing_dev_info = &root->fs_info->bdi; err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); if (err) @@ -9203,7 +9200,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, inode->i_fop = &btrfs_file_operations; inode->i_op = &btrfs_file_inode_operations; inode->i_mapping->a_ops = &btrfs_aops; - inode->i_mapping->backing_dev_info = &root->fs_info->bdi; BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); @@ -9247,7 +9243,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, inode->i_op = &btrfs_symlink_inode_operations; inode->i_mapping->a_ops = &btrfs_symlink_aops; - inode->i_mapping->backing_dev_info = &root->fs_info->bdi; inode_set_bytes(inode, name_len); btrfs_i_size_write(inode, name_len); err = btrfs_update_inode(trans, root, inode); @@ -9459,7 +9454,6 @@ static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) inode->i_op = &btrfs_file_inode_operations; inode->i_mapping->a_ops = &btrfs_aops; - inode->i_mapping->backing_dev_info = &root->fs_info->bdi; BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; ret = btrfs_init_inode_security(trans, inode, dir, NULL); diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index f61a74115beb..6b5173605154 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -783,8 +783,6 @@ static int fill_inode(struct inode *inode, struct page *locked_page, } inode->i_mapping->a_ops = &ceph_aops; - inode->i_mapping->backing_dev_info = - &ceph_sb_to_client(inode->i_sb)->backing_dev_info; switch (inode->i_mode & S_IFMT) { case S_IFIFO: diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 0c3ce464cae4..2d4f37235ed0 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -937,8 +937,6 @@ retry_iget5_locked: inode->i_flags |= S_NOATIME | S_NOCMTIME; if (inode->i_state & I_NEW) { inode->i_ino = hash; - if (S_ISREG(inode->i_mode)) - inode->i_data.backing_dev_info = sb->s_bdi; #ifdef CONFIG_CIFS_FSCACHE /* initialize per-inode cache cookie pointer */ CIFS_I(inode)->fscache = NULL; diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index 0ad6b4d6de00..65af86147154 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -131,7 +131,6 @@ struct inode *configfs_new_inode(umode_t mode, struct configfs_dirent *sd, if (inode) { inode->i_ino = get_next_ino(); inode->i_mapping->a_ops = &configfs_aops; - inode->i_mapping->backing_dev_info = &noop_backing_dev_info; inode->i_op = &configfs_inode_operations; if (sd->s_iattr) { diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 1686dc2da9fd..34b36a504059 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -67,7 +67,6 @@ static int ecryptfs_inode_set(struct inode *inode, void *opaque) inode->i_ino = lower_inode->i_ino; inode->i_version++; inode->i_mapping->a_ops = &ecryptfs_aops; - inode->i_mapping->backing_dev_info = inode->i_sb->s_bdi; if (S_ISLNK(inode->i_mode)) inode->i_op = &ecryptfs_symlink_iops; diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index f1d3d4eb8c4f..6fc91df99ff8 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -1214,7 +1214,6 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino) memcpy(oi->i_data, fcb.i_data, sizeof(fcb.i_data)); } - inode->i_mapping->backing_dev_info = sb->s_bdi; if (S_ISREG(inode->i_mode)) { inode->i_op = &exofs_file_inode_operations; inode->i_fop = &exofs_file_operations; @@ -1314,7 +1313,6 @@ struct inode *exofs_new_inode(struct inode *dir, umode_t mode) set_obj_2bcreated(oi); - inode->i_mapping->backing_dev_info = sb->s_bdi; inode_init_owner(inode, dir, mode); inode->i_ino = sbi->s_nextid++; inode->i_blkbits = EXOFS_BLKSHIFT; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index f38256e4476e..e8799c11424b 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -308,7 +308,6 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid, if (!fc->writeback_cache || !S_ISREG(attr->mode)) inode->i_flags |= S_NOCMTIME; inode->i_generation = generation; - inode->i_data.backing_dev_info = &fc->bdi; fuse_init_inode(inode, attr); unlock_new_inode(inode); } else if ((inode->i_mode ^ attr->mode) & S_IFMT) { diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index a23524aa3eac..08ea717981f7 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -775,7 +775,6 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, mapping->flags = 0; mapping_set_gfp_mask(mapping, GFP_NOFS); mapping->private_data = NULL; - mapping->backing_dev_info = s->s_bdi; mapping->writeback_index = 0; } diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 8633ad328ee2..efc8e254787c 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -112,7 +112,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) mapping->flags = 0; mapping_set_gfp_mask(mapping, GFP_NOFS); mapping->private_data = NULL; - mapping->backing_dev_info = sb->s_bdi; mapping->writeback_index = 0; spin_lock_init(&sdp->sd_log_lock); diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index de7c95c7d840..c274aca8e8dc 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -492,7 +492,6 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, lockdep_set_class(&inode->i_mapping->i_mmap_rwsem, &hugetlbfs_i_mmap_rwsem_key); inode->i_mapping->a_ops = &hugetlbfs_aops; - inode->i_mapping->backing_dev_info = &noop_backing_dev_info; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; inode->i_mapping->private_data = resv_map; info = HUGETLBFS_I(inode); diff --git a/fs/inode.c b/fs/inode.c index aa149e7262ac..e4e8caa7464c 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -170,20 +170,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode) atomic_set(&mapping->i_mmap_writable, 0); mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE); mapping->private_data = NULL; - mapping->backing_dev_info = &default_backing_dev_info; mapping->writeback_index = 0; - - /* - * If the block_device provides a backing_dev_info for client - * inodes then use that. Otherwise the inode share the bdev's - * backing_dev_info. - */ - if (sb->s_bdev) { - struct backing_dev_info *bdi; - - bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info; - mapping->backing_dev_info = bdi; - } inode->i_private = NULL; inode->i_mapping = mapping; INIT_HLIST_HEAD(&inode->i_dentry); /* buggered by rcu freeing */ diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index 06f06887b2d2..9000874a945b 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -286,7 +286,6 @@ static void kernfs_init_inode(struct kernfs_node *kn, struct inode *inode) kernfs_get(kn); inode->i_private = kn; inode->i_mapping->a_ops = &kernfs_aops; - inode->i_mapping->backing_dev_info = &noop_backing_dev_info; inode->i_op = &kernfs_iops; set_default_inode_attr(inode, kn->mode); diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index a699a3fc62c0..01a9e16e9782 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -267,7 +267,6 @@ ncp_iget(struct super_block *sb, struct ncp_entry_info *info) if (inode) { atomic_set(&NCP_FINFO(inode)->opened, info->opened); - inode->i_mapping->backing_dev_info = sb->s_bdi; inode->i_ino = info->ino; ncp_set_attr(inode, info); if (S_ISREG(inode->i_mode)) { diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 4bffe637ea32..24aac72420f4 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -387,7 +387,6 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st if (S_ISREG(inode->i_mode)) { inode->i_fop = NFS_SB(sb)->nfs_client->rpc_ops->file_ops; inode->i_data.a_ops = &nfs_file_aops; - inode->i_data.backing_dev_info = &NFS_SB(sb)->backing_dev_info; } else if (S_ISDIR(inode->i_mode)) { inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->dir_inode_ops; inode->i_fop = &nfs_dir_operations; diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c index 57ceaf33d177..748ca238915a 100644 --- a/fs/nilfs2/gcinode.c +++ b/fs/nilfs2/gcinode.c @@ -172,7 +172,6 @@ int nilfs_init_gcinode(struct inode *inode) inode->i_mode = S_IFREG; mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); inode->i_mapping->a_ops = &empty_aops; - inode->i_mapping->backing_dev_info = inode->i_sb->s_bdi; ii->i_flags = 0; nilfs_bmap_init_gc(ii->i_bmap); diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index c4dcd1db57ee..892cf5ffdb8e 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -429,7 +429,6 @@ int nilfs_mdt_init(struct inode *inode, gfp_t gfp_mask, size_t objsz) inode->i_mode = S_IFREG; mapping_set_gfp_mask(inode->i_mapping, gfp_mask); - inode->i_mapping->backing_dev_info = inode->i_sb->s_bdi; inode->i_op = &def_mdt_iops; inode->i_fop = &def_mdt_fops; @@ -457,13 +456,12 @@ int nilfs_mdt_setup_shadow_map(struct inode *inode, struct nilfs_shadow_map *shadow) { struct nilfs_mdt_info *mi = NILFS_MDT(inode); - struct backing_dev_info *bdi = inode->i_sb->s_bdi; INIT_LIST_HEAD(&shadow->frozen_buffers); address_space_init_once(&shadow->frozen_data); - nilfs_mapping_init(&shadow->frozen_data, inode, bdi); + nilfs_mapping_init(&shadow->frozen_data, inode); address_space_init_once(&shadow->frozen_btnodes); - nilfs_mapping_init(&shadow->frozen_btnodes, inode, bdi); + nilfs_mapping_init(&shadow->frozen_btnodes, inode); mi->mi_shadow = shadow; return 0; } diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index da276640f776..700ecbcca55d 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -461,14 +461,12 @@ unsigned nilfs_page_count_clean_buffers(struct page *page, return nc; } -void nilfs_mapping_init(struct address_space *mapping, struct inode *inode, - struct backing_dev_info *bdi) +void nilfs_mapping_init(struct address_space *mapping, struct inode *inode) { mapping->host = inode; mapping->flags = 0; mapping_set_gfp_mask(mapping, GFP_NOFS); mapping->private_data = NULL; - mapping->backing_dev_info = bdi; mapping->a_ops = &empty_aops; } diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h index ef30c5c2426f..a43b8287d012 100644 --- a/fs/nilfs2/page.h +++ b/fs/nilfs2/page.h @@ -57,8 +57,7 @@ int nilfs_copy_dirty_pages(struct address_space *, struct address_space *); void nilfs_copy_back_pages(struct address_space *, struct address_space *); void nilfs_clear_dirty_page(struct page *, bool); void nilfs_clear_dirty_pages(struct address_space *, bool); -void nilfs_mapping_init(struct address_space *mapping, struct inode *inode, - struct backing_dev_info *bdi); +void nilfs_mapping_init(struct address_space *mapping, struct inode *inode); unsigned nilfs_page_count_clean_buffers(struct page *, unsigned, unsigned); unsigned long nilfs_find_uncommitted_extent(struct inode *inode, sector_t start_blk, diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 3d4bbac36bea..5bc2a1cf73c3 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -166,7 +166,7 @@ struct inode *nilfs_alloc_inode(struct super_block *sb) ii->i_state = 0; ii->i_cno = 0; ii->vfs_inode.i_version = 1; - nilfs_mapping_init(&ii->i_btnode_cache, &ii->vfs_inode, sb->s_bdi); + nilfs_mapping_init(&ii->i_btnode_cache, &ii->vfs_inode); return &ii->vfs_inode; } diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index 6000d3029b26..061ba6a91bf2 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -398,7 +398,6 @@ static struct inode *dlmfs_get_root_inode(struct super_block *sb) if (inode) { inode->i_ino = get_next_ino(); inode_init_owner(inode, NULL, mode); - inode->i_mapping->backing_dev_info = &noop_backing_dev_info; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; inc_nlink(inode); @@ -422,7 +421,6 @@ static struct inode *dlmfs_get_inode(struct inode *parent, inode->i_ino = get_next_ino(); inode_init_owner(inode, parent, mode); - inode->i_mapping->backing_dev_info = &noop_backing_dev_info; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; ip = DLMFS_I(inode); diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index ad4d712002f4..889d558b4e05 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -59,7 +59,6 @@ struct inode *ramfs_get_inode(struct super_block *sb, inode->i_ino = get_next_ino(); inode_init_owner(inode, dir, mode); inode->i_mapping->a_ops = &ramfs_aops; - inode->i_mapping->backing_dev_info = &noop_backing_dev_info; mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); mapping_set_unevictable(inode->i_mapping); inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; diff --git a/fs/romfs/super.c b/fs/romfs/super.c index e98dd88197d5..268733cda397 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -355,9 +355,6 @@ static struct inode *romfs_iget(struct super_block *sb, unsigned long pos) case ROMFH_REG: i->i_fop = &romfs_ro_fops; i->i_data.a_ops = &romfs_aops; - if (i->i_sb->s_mtd) - i->i_data.backing_dev_info = - i->i_sb->s_mtd->backing_dev_info; if (nextfh & ROMFH_EXEC) mode |= S_IXUGO; break; diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index ea41649e4ca5..c49b1981ac95 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -108,8 +108,6 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir, inode->i_mtime = inode->i_atime = inode->i_ctime = ubifs_current_time(inode); inode->i_mapping->nrpages = 0; - /* Disable readahead */ - inode->i_mapping->backing_dev_info = &c->bdi; switch (mode & S_IFMT) { case S_IFREG: diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index ed93dc6ae245..6197154f36ca 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -156,9 +156,6 @@ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum) if (err) goto out_invalid; - /* Disable read-ahead */ - inode->i_mapping->backing_dev_info = &c->bdi; - switch (inode->i_mode & S_IFMT) { case S_IFREG: inode->i_mapping->a_ops = &ubifs_file_address_operations; diff --git a/include/linux/fs.h b/include/linux/fs.h index 1dada399aa23..65d02de342e1 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -34,6 +34,7 @@ #include #include +struct backing_dev_info; struct export_operations; struct hd_geometry; struct iovec; @@ -394,7 +395,6 @@ int pagecache_write_end(struct file *, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata); -struct backing_dev_info; struct address_space { struct inode *host; /* owner: inode, block_device */ struct radix_tree_root page_tree; /* radix tree of all pages */ @@ -409,7 +409,6 @@ struct address_space { pgoff_t writeback_index;/* writeback starts here */ const struct address_space_operations *a_ops; /* methods */ unsigned long flags; /* error bits/gfp mask */ - struct backing_dev_info *backing_dev_info; /* device readahead, etc */ spinlock_t private_lock; /* for use by the address_space */ struct list_head private_list; /* ditto */ void *private_data; /* ditto */ diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 16c68958aeda..52e0c7652448 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -24,7 +24,6 @@ struct backing_dev_info noop_backing_dev_info = { .name = "noop", .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, }; -EXPORT_SYMBOL_GPL(noop_backing_dev_info); static struct class *bdi_class; diff --git a/mm/shmem.c b/mm/shmem.c index 1b77eaf589fd..4c61d3d5bfb4 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1410,7 +1410,6 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode inode->i_ino = get_next_ino(); inode_init_owner(inode, dir, mode); inode->i_blocks = 0; - inode->i_mapping->backing_dev_info = &noop_backing_dev_info; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; inode->i_generation = get_seconds(); info = SHMEM_I(inode); diff --git a/mm/swap_state.c b/mm/swap_state.c index 1c137b69ecde..405923f77334 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -37,7 +37,6 @@ struct address_space swapper_spaces[MAX_SWAPFILES] = { .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), .i_mmap_writable = ATOMIC_INIT(0), .a_ops = &swap_aops, - .backing_dev_info = &noop_backing_dev_info, } }; -- cgit v1.2.3 From c4db59d31e39ea067c32163ac961e9c80198fd37 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 20 Jan 2015 14:05:00 -0700 Subject: fs: don't reassign dirty inodes to default_backing_dev_info If we have dirty inodes we need to call the filesystem for it, even if the device has been removed and the filesystem will error out early. The current code does that by reassining all dirty inodes to the default backing_dev_info when a bdi is unlinked, but that's pretty pointless given that the bdi must always outlive the super block. Instead of stopping writeback at unregister time and moving inodes to the default bdi just keep the current bdi alive until it is destroyed. The containing objects of the bdi ensure this doesn't happen until all writeback has finished by erroring out. Signed-off-by: Christoph Hellwig Reviewed-by: Tejun Heo Reviewed-by: Jan Kara Killed the redundant WARN_ON(), as noticed by Jan. Signed-off-by: Jens Axboe --- mm/backing-dev.c | 90 +++++++++++++++----------------------------------------- 1 file changed, 23 insertions(+), 67 deletions(-) (limited to 'mm/backing-dev.c') diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 52e0c7652448..1725adb242e0 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -37,17 +37,6 @@ LIST_HEAD(bdi_list); /* bdi_wq serves all asynchronous writeback tasks */ struct workqueue_struct *bdi_wq; -static void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2) -{ - if (wb1 < wb2) { - spin_lock(&wb1->list_lock); - spin_lock_nested(&wb2->list_lock, 1); - } else { - spin_lock(&wb2->list_lock); - spin_lock_nested(&wb1->list_lock, 1); - } -} - #ifdef CONFIG_DEBUG_FS #include #include @@ -352,19 +341,19 @@ EXPORT_SYMBOL(bdi_register_dev); */ static void bdi_wb_shutdown(struct backing_dev_info *bdi) { - if (!bdi_cap_writeback_dirty(bdi)) + /* Make sure nobody queues further work */ + spin_lock_bh(&bdi->wb_lock); + if (!test_and_clear_bit(BDI_registered, &bdi->state)) { + spin_unlock_bh(&bdi->wb_lock); return; + } + spin_unlock_bh(&bdi->wb_lock); /* * Make sure nobody finds us on the bdi_list anymore */ bdi_remove_from_list(bdi); - /* Make sure nobody queues further work */ - spin_lock_bh(&bdi->wb_lock); - clear_bit(BDI_registered, &bdi->state); - spin_unlock_bh(&bdi->wb_lock); - /* * Drain work list and shutdown the delayed_work. At this point, * @bdi->bdi_list is empty telling bdi_Writeback_workfn() that @bdi @@ -372,37 +361,22 @@ static void bdi_wb_shutdown(struct backing_dev_info *bdi) */ mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); flush_delayed_work(&bdi->wb.dwork); - WARN_ON(!list_empty(&bdi->work_list)); - WARN_ON(delayed_work_pending(&bdi->wb.dwork)); } /* - * This bdi is going away now, make sure that no super_blocks point to it + * Called when the device behind @bdi has been removed or ejected. + * + * We can't really do much here except for reducing the dirty ratio at + * the moment. In the future we should be able to set a flag so that + * the filesystem can handle errors at mark_inode_dirty time instead + * of only at writeback time. */ -static void bdi_prune_sb(struct backing_dev_info *bdi) -{ - struct super_block *sb; - - spin_lock(&sb_lock); - list_for_each_entry(sb, &super_blocks, s_list) { - if (sb->s_bdi == bdi) - sb->s_bdi = &default_backing_dev_info; - } - spin_unlock(&sb_lock); -} - void bdi_unregister(struct backing_dev_info *bdi) { - if (bdi->dev) { - bdi_set_min_ratio(bdi, 0); - trace_writeback_bdi_unregister(bdi); - bdi_prune_sb(bdi); + if (WARN_ON_ONCE(!bdi->dev)) + return; - bdi_wb_shutdown(bdi); - bdi_debug_unregister(bdi); - device_unregister(bdi->dev); - bdi->dev = NULL; - } + bdi_set_min_ratio(bdi, 0); } EXPORT_SYMBOL(bdi_unregister); @@ -471,37 +445,19 @@ void bdi_destroy(struct backing_dev_info *bdi) { int i; - /* - * Splice our entries to the default_backing_dev_info. This - * condition shouldn't happen. @wb must be empty at this point and - * dirty inodes on it might cause other issues. This workaround is - * added by ce5f8e779519 ("writeback: splice dirty inode entries to - * default bdi on bdi_destroy()") without root-causing the issue. - * - * http://lkml.kernel.org/g/1253038617-30204-11-git-send-email-jens.axboe@oracle.com - * http://thread.gmane.org/gmane.linux.file-systems/35341/focus=35350 - * - * We should probably add WARN_ON() to find out whether it still - * happens and track it down if so. - */ - if (bdi_has_dirty_io(bdi)) { - struct bdi_writeback *dst = &default_backing_dev_info.wb; - - bdi_lock_two(&bdi->wb, dst); - list_splice(&bdi->wb.b_dirty, &dst->b_dirty); - list_splice(&bdi->wb.b_io, &dst->b_io); - list_splice(&bdi->wb.b_more_io, &dst->b_more_io); - spin_unlock(&bdi->wb.list_lock); - spin_unlock(&dst->list_lock); - } - - bdi_unregister(bdi); + bdi_wb_shutdown(bdi); + WARN_ON(!list_empty(&bdi->work_list)); WARN_ON(delayed_work_pending(&bdi->wb.dwork)); + if (bdi->dev) { + bdi_debug_unregister(bdi); + device_unregister(bdi->dev); + bdi->dev = NULL; + } + for (i = 0; i < NR_BDI_STAT_ITEMS; i++) percpu_counter_destroy(&bdi->bdi_stat[i]); - fprop_local_destroy_percpu(&bdi->completions); } EXPORT_SYMBOL(bdi_destroy); -- cgit v1.2.3 From df0ce26cb4ee8bc233d50213b97213532aff0a3c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 14 Jan 2015 10:42:41 +0100 Subject: fs: remove default_backing_dev_info Now that default_backing_dev_info is not used for writeback purposes we can git rid of it easily: - instead of using it's name for tracing unregistered bdi we just use "unknown" - btrfs and ceph can just assign the default read ahead window themselves like several other filesystems already do. - we can assign noop_backing_dev_info as the default one in alloc_super. All filesystems already either assigned their own or noop_backing_dev_info. Signed-off-by: Christoph Hellwig Reviewed-by: Tejun Heo Reviewed-by: Jan Kara Signed-off-by: Jens Axboe --- fs/btrfs/disk-io.c | 2 +- fs/ceph/super.c | 2 +- fs/super.c | 8 ++------ include/linux/backing-dev.h | 1 - include/trace/events/writeback.h | 6 ++---- mm/backing-dev.c | 9 --------- 6 files changed, 6 insertions(+), 22 deletions(-) (limited to 'mm/backing-dev.c') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 1ec872e3a926..1afb18226da8 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1719,7 +1719,7 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi) if (err) return err; - bdi->ra_pages = default_backing_dev_info.ra_pages; + bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE; bdi->congested_fn = btrfs_congested_fn; bdi->congested_data = info; return 0; diff --git a/fs/ceph/super.c b/fs/ceph/super.c index e350cc1611e4..5ae62587a71d 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -899,7 +899,7 @@ static int ceph_register_bdi(struct super_block *sb, >> PAGE_SHIFT; else fsc->backing_dev_info.ra_pages = - default_backing_dev_info.ra_pages; + VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE; err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%ld", atomic_long_inc_return(&bdi_seq)); diff --git a/fs/super.c b/fs/super.c index eae088f6aaae..3b4dadafdd60 100644 --- a/fs/super.c +++ b/fs/super.c @@ -185,8 +185,8 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags) } init_waitqueue_head(&s->s_writers.wait); init_waitqueue_head(&s->s_writers.wait_unfrozen); + s->s_bdi = &noop_backing_dev_info; s->s_flags = flags; - s->s_bdi = &default_backing_dev_info; INIT_HLIST_NODE(&s->s_instances); INIT_HLIST_BL_HEAD(&s->s_anon); INIT_LIST_HEAD(&s->s_inodes); @@ -863,10 +863,7 @@ EXPORT_SYMBOL(free_anon_bdev); int set_anon_super(struct super_block *s, void *data) { - int error = get_anon_bdev(&s->s_dev); - if (!error) - s->s_bdi = &noop_backing_dev_info; - return error; + return get_anon_bdev(&s->s_dev); } EXPORT_SYMBOL(set_anon_super); @@ -1111,7 +1108,6 @@ mount_fs(struct file_system_type *type, int flags, const char *name, void *data) sb = root->d_sb; BUG_ON(!sb); WARN_ON(!sb->s_bdi); - WARN_ON(sb->s_bdi == &default_backing_dev_info); sb->s_flags |= MS_BORN; error = security_sb_kern_mount(sb, flags, secdata); diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index ed59dee03a71..d94077fea1f8 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -241,7 +241,6 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); #define BDI_CAP_NO_ACCT_AND_WRITEBACK \ (BDI_CAP_NO_WRITEBACK | BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_ACCT_WB) -extern struct backing_dev_info default_backing_dev_info; extern struct backing_dev_info noop_backing_dev_info; int writeback_in_progress(struct backing_dev_info *bdi); diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 74f5207bd090..0e9310905413 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -156,10 +156,8 @@ DECLARE_EVENT_CLASS(writeback_work_class, __field(int, reason) ), TP_fast_assign( - struct device *dev = bdi->dev; - if (!dev) - dev = default_backing_dev_info.dev; - strncpy(__entry->name, dev_name(dev), 32); + strncpy(__entry->name, + bdi->dev ? dev_name(bdi->dev) : "(unknown)", 32); __entry->nr_pages = work->nr_pages; __entry->sb_dev = work->sb ? work->sb->s_dev : 0; __entry->sync_mode = work->sync_mode; diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 1725adb242e0..7690ec77c722 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -14,12 +14,6 @@ static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); -struct backing_dev_info default_backing_dev_info = { - .name = "default", - .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE, -}; -EXPORT_SYMBOL_GPL(default_backing_dev_info); - struct backing_dev_info noop_backing_dev_info = { .name = "noop", .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, @@ -250,9 +244,6 @@ static int __init default_bdi_init(void) if (!bdi_wq) return -ENOMEM; - err = bdi_init(&default_backing_dev_info); - if (!err) - bdi_register(&default_backing_dev_info, NULL, "default"); err = bdi_init(&noop_backing_dev_info); return err; -- cgit v1.2.3 From 0ae45f63d4ef8d8eeec49c7d8b44a1775fff13e8 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 2 Feb 2015 00:37:00 -0500 Subject: vfs: add support for a lazytime mount option Add a new mount option which enables a new "lazytime" mode. This mode causes atime, mtime, and ctime updates to only be made to the in-memory version of the inode. The on-disk times will only get updated when (a) if the inode needs to be updated for some non-time related change, (b) if userspace calls fsync(), syncfs() or sync(), or (c) just before an undeleted inode is evicted from memory. This is OK according to POSIX because there are no guarantees after a crash unless userspace explicitly requests via a fsync(2) call. For workloads which feature a large number of random write to a preallocated file, the lazytime mount option significantly reduces writes to the inode table. The repeated 4k writes to a single block will result in undesirable stress on flash devices and SMR disk drives. Even on conventional HDD's, the repeated writes to the inode table block will trigger Adjacent Track Interference (ATI) remediation latencies, which very negatively impact long tail latencies --- which is a very big deal for web serving tiers (for example). Google-Bug-Id: 18297052 Signed-off-by: Theodore Ts'o Signed-off-by: Al Viro --- fs/ext4/inode.c | 6 ++++ fs/fs-writeback.c | 62 +++++++++++++++++++++++++++++++++------- fs/gfs2/file.c | 4 +-- fs/inode.c | 56 +++++++++++++++++++++++++----------- fs/jfs/file.c | 2 +- fs/libfs.c | 2 +- fs/proc_namespace.c | 1 + fs/sync.c | 8 ++++++ include/linux/backing-dev.h | 1 + include/linux/fs.h | 5 ++++ include/trace/events/writeback.h | 60 +++++++++++++++++++++++++++++++++++++- include/uapi/linux/fs.h | 4 ++- mm/backing-dev.c | 10 +++++-- 13 files changed, 186 insertions(+), 35 deletions(-) (limited to 'mm/backing-dev.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 5653fa42930b..628df5ba44a6 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4840,11 +4840,17 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) * If the inode is marked synchronous, we don't honour that here - doing * so would cause a commit on atime updates, which we don't bother doing. * We handle synchronous inodes at the highest possible level. + * + * If only the I_DIRTY_TIME flag is set, we can skip everything. If + * I_DIRTY_TIME and I_DIRTY_SYNC is set, the only inode fields we need + * to copy into the on-disk inode structure are the timestamp files. */ void ext4_dirty_inode(struct inode *inode, int flags) { handle_t *handle; + if (flags == I_DIRTY_TIME) + return; handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); if (IS_ERR(handle)) goto out; diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 2d609a5fbfea..004686191354 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -247,14 +247,19 @@ static bool inode_dirtied_after(struct inode *inode, unsigned long t) return ret; } +#define EXPIRE_DIRTY_ATIME 0x0001 + /* * Move expired (dirtied before work->older_than_this) dirty inodes from * @delaying_queue to @dispatch_queue. */ static int move_expired_inodes(struct list_head *delaying_queue, struct list_head *dispatch_queue, + int flags, struct wb_writeback_work *work) { + unsigned long *older_than_this = NULL; + unsigned long expire_time; LIST_HEAD(tmp); struct list_head *pos, *node; struct super_block *sb = NULL; @@ -262,13 +267,21 @@ static int move_expired_inodes(struct list_head *delaying_queue, int do_sb_sort = 0; int moved = 0; + if ((flags & EXPIRE_DIRTY_ATIME) == 0) + older_than_this = work->older_than_this; + else if ((work->reason == WB_REASON_SYNC) == 0) { + expire_time = jiffies - (HZ * 86400); + older_than_this = &expire_time; + } while (!list_empty(delaying_queue)) { inode = wb_inode(delaying_queue->prev); - if (work->older_than_this && - inode_dirtied_after(inode, *work->older_than_this)) + if (older_than_this && + inode_dirtied_after(inode, *older_than_this)) break; list_move(&inode->i_wb_list, &tmp); moved++; + if (flags & EXPIRE_DIRTY_ATIME) + set_bit(__I_DIRTY_TIME_EXPIRED, &inode->i_state); if (sb_is_blkdev_sb(inode->i_sb)) continue; if (sb && sb != inode->i_sb) @@ -309,9 +322,12 @@ out: static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work) { int moved; + assert_spin_locked(&wb->list_lock); list_splice_init(&wb->b_more_io, &wb->b_io); - moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, work); + moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, 0, work); + moved += move_expired_inodes(&wb->b_dirty_time, &wb->b_io, + EXPIRE_DIRTY_ATIME, work); trace_writeback_queue_io(wb, work, moved); } @@ -435,6 +451,8 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb, * updates after data IO completion. */ redirty_tail(inode, wb); + } else if (inode->i_state & I_DIRTY_TIME) { + list_move(&inode->i_wb_list, &wb->b_dirty_time); } else { /* The inode is clean. Remove from writeback lists. */ list_del_init(&inode->i_wb_list); @@ -481,7 +499,13 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) spin_lock(&inode->i_lock); dirty = inode->i_state & I_DIRTY; - inode->i_state &= ~I_DIRTY; + if (((dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) && + (inode->i_state & I_DIRTY_TIME)) || + (inode->i_state & I_DIRTY_TIME_EXPIRED)) { + dirty |= I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED; + trace_writeback_lazytime(inode); + } + inode->i_state &= ~dirty; /* * Paired with smp_mb() in __mark_inode_dirty(). This allows @@ -501,8 +525,10 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) spin_unlock(&inode->i_lock); + if (dirty & I_DIRTY_TIME) + mark_inode_dirty_sync(inode); /* Don't write the inode if only I_DIRTY_PAGES was set */ - if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { + if (dirty & ~I_DIRTY_PAGES) { int err = write_inode(inode, wbc); if (ret == 0) ret = err; @@ -550,7 +576,7 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, * make sure inode is on some writeback list and leave it there unless * we have completely cleaned the inode. */ - if (!(inode->i_state & I_DIRTY) && + if (!(inode->i_state & I_DIRTY_ALL) && (wbc->sync_mode != WB_SYNC_ALL || !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK))) goto out; @@ -565,7 +591,7 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, * If inode is clean, remove it from writeback lists. Otherwise don't * touch it. See comment above for explanation. */ - if (!(inode->i_state & I_DIRTY)) + if (!(inode->i_state & I_DIRTY_ALL)) list_del_init(&inode->i_wb_list); spin_unlock(&wb->list_lock); inode_sync_complete(inode); @@ -707,7 +733,7 @@ static long writeback_sb_inodes(struct super_block *sb, wrote += write_chunk - wbc.nr_to_write; spin_lock(&wb->list_lock); spin_lock(&inode->i_lock); - if (!(inode->i_state & I_DIRTY)) + if (!(inode->i_state & I_DIRTY_ALL)) wrote++; requeue_inode(inode, wb, &wbc); inode_sync_complete(inode); @@ -1145,16 +1171,20 @@ static noinline void block_dump___mark_inode_dirty(struct inode *inode) * page->mapping->host, so the page-dirtying time is recorded in the internal * blockdev inode. */ +#define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC) void __mark_inode_dirty(struct inode *inode, int flags) { struct super_block *sb = inode->i_sb; struct backing_dev_info *bdi = NULL; + int dirtytime; + + trace_writeback_mark_inode_dirty(inode, flags); /* * Don't do this for I_DIRTY_PAGES - that doesn't actually * dirty the inode itself */ - if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { + if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_TIME)) { trace_writeback_dirty_inode_start(inode, flags); if (sb->s_op->dirty_inode) @@ -1162,6 +1192,9 @@ void __mark_inode_dirty(struct inode *inode, int flags) trace_writeback_dirty_inode(inode, flags); } + if (flags & I_DIRTY_INODE) + flags &= ~I_DIRTY_TIME; + dirtytime = flags & I_DIRTY_TIME; /* * Paired with smp_mb() in __writeback_single_inode() for the @@ -1169,16 +1202,21 @@ void __mark_inode_dirty(struct inode *inode, int flags) */ smp_mb(); - if ((inode->i_state & flags) == flags) + if (((inode->i_state & flags) == flags) || + (dirtytime && (inode->i_state & I_DIRTY_INODE))) return; if (unlikely(block_dump)) block_dump___mark_inode_dirty(inode); spin_lock(&inode->i_lock); + if (dirtytime && (inode->i_state & I_DIRTY_INODE)) + goto out_unlock_inode; if ((inode->i_state & flags) != flags) { const int was_dirty = inode->i_state & I_DIRTY; + if (flags & I_DIRTY_INODE) + inode->i_state &= ~I_DIRTY_TIME; inode->i_state |= flags; /* @@ -1225,8 +1263,10 @@ void __mark_inode_dirty(struct inode *inode, int flags) } inode->dirtied_when = jiffies; - list_move(&inode->i_wb_list, &bdi->wb.b_dirty); + list_move(&inode->i_wb_list, dirtytime ? + &bdi->wb.b_dirty_time : &bdi->wb.b_dirty); spin_unlock(&bdi->wb.list_lock); + trace_writeback_dirty_inode_enqueue(inode); if (wakeup_bdi) bdi_wakeup_thread_delayed(bdi); diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 6e600abf694a..15c44cf457cc 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -655,7 +655,7 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end, { struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; - int sync_state = inode->i_state & I_DIRTY; + int sync_state = inode->i_state & I_DIRTY_ALL; struct gfs2_inode *ip = GFS2_I(inode); int ret = 0, ret1 = 0; @@ -668,7 +668,7 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end, if (!gfs2_is_jdata(ip)) sync_state &= ~I_DIRTY_PAGES; if (datasync) - sync_state &= ~I_DIRTY_SYNC; + sync_state &= ~(I_DIRTY_SYNC | I_DIRTY_TIME); if (sync_state) { ret = sync_inode_metadata(inode, 1); diff --git a/fs/inode.c b/fs/inode.c index aa149e7262ac..4feb85cc125f 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -18,6 +18,7 @@ #include /* for inode_has_buffers */ #include #include +#include #include "internal.h" /* @@ -30,7 +31,7 @@ * inode_sb_list_lock protects: * sb->s_inodes, inode->i_sb_list * bdi->wb.list_lock protects: - * bdi->wb.b_{dirty,io,more_io}, inode->i_wb_list + * bdi->wb.b_{dirty,io,more_io,dirty_time}, inode->i_wb_list * inode_hash_lock protects: * inode_hashtable, inode->i_hash * @@ -416,7 +417,8 @@ static void inode_lru_list_add(struct inode *inode) */ void inode_add_lru(struct inode *inode) { - if (!(inode->i_state & (I_DIRTY | I_SYNC | I_FREEING | I_WILL_FREE)) && + if (!(inode->i_state & (I_DIRTY_ALL | I_SYNC | + I_FREEING | I_WILL_FREE)) && !atomic_read(&inode->i_count) && inode->i_sb->s_flags & MS_ACTIVE) inode_lru_list_add(inode); } @@ -647,7 +649,7 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty) spin_unlock(&inode->i_lock); continue; } - if (inode->i_state & I_DIRTY && !kill_dirty) { + if (inode->i_state & I_DIRTY_ALL && !kill_dirty) { spin_unlock(&inode->i_lock); busy = 1; continue; @@ -1432,11 +1434,20 @@ static void iput_final(struct inode *inode) */ void iput(struct inode *inode) { - if (inode) { - BUG_ON(inode->i_state & I_CLEAR); - - if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock)) - iput_final(inode); + if (!inode) + return; + BUG_ON(inode->i_state & I_CLEAR); +retry: + if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock)) { + if (inode->i_nlink && (inode->i_state & I_DIRTY_TIME)) { + atomic_inc(&inode->i_count); + inode->i_state &= ~I_DIRTY_TIME; + spin_unlock(&inode->i_lock); + trace_writeback_lazytime_iput(inode); + mark_inode_dirty_sync(inode); + goto retry; + } + iput_final(inode); } } EXPORT_SYMBOL(iput); @@ -1495,14 +1506,9 @@ static int relatime_need_update(struct vfsmount *mnt, struct inode *inode, return 0; } -/* - * This does the actual work of updating an inodes time or version. Must have - * had called mnt_want_write() before calling this. - */ -static int update_time(struct inode *inode, struct timespec *time, int flags) +int generic_update_time(struct inode *inode, struct timespec *time, int flags) { - if (inode->i_op->update_time) - return inode->i_op->update_time(inode, time, flags); + int iflags = I_DIRTY_TIME; if (flags & S_ATIME) inode->i_atime = *time; @@ -1512,9 +1518,27 @@ static int update_time(struct inode *inode, struct timespec *time, int flags) inode->i_ctime = *time; if (flags & S_MTIME) inode->i_mtime = *time; - mark_inode_dirty_sync(inode); + + if (!(inode->i_sb->s_flags & MS_LAZYTIME) || (flags & S_VERSION)) + iflags |= I_DIRTY_SYNC; + __mark_inode_dirty(inode, iflags); return 0; } +EXPORT_SYMBOL(generic_update_time); + +/* + * This does the actual work of updating an inodes time or version. Must have + * had called mnt_want_write() before calling this. + */ +static int update_time(struct inode *inode, struct timespec *time, int flags) +{ + int (*update_time)(struct inode *, struct timespec *, int); + + update_time = inode->i_op->update_time ? inode->i_op->update_time : + generic_update_time; + + return update_time(inode, time, flags); +} /** * touch_atime - update the access time diff --git a/fs/jfs/file.c b/fs/jfs/file.c index 33aa0cc1f8b8..10815f8dfd8b 100644 --- a/fs/jfs/file.c +++ b/fs/jfs/file.c @@ -39,7 +39,7 @@ int jfs_fsync(struct file *file, loff_t start, loff_t end, int datasync) return rc; mutex_lock(&inode->i_mutex); - if (!(inode->i_state & I_DIRTY) || + if (!(inode->i_state & I_DIRTY_ALL) || (datasync && !(inode->i_state & I_DIRTY_DATASYNC))) { /* Make sure committed changes hit the disk */ jfs_flush_journal(JFS_SBI(inode->i_sb)->log, 1); diff --git a/fs/libfs.c b/fs/libfs.c index 005843ce5dbd..b2ffdb045be4 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -948,7 +948,7 @@ int __generic_file_fsync(struct file *file, loff_t start, loff_t end, mutex_lock(&inode->i_mutex); ret = sync_mapping_buffers(inode->i_mapping); - if (!(inode->i_state & I_DIRTY)) + if (!(inode->i_state & I_DIRTY_ALL)) goto out; if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) goto out; diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c index 0f96f71ab32b..8db932da4009 100644 --- a/fs/proc_namespace.c +++ b/fs/proc_namespace.c @@ -44,6 +44,7 @@ static int show_sb_opts(struct seq_file *m, struct super_block *sb) { MS_SYNCHRONOUS, ",sync" }, { MS_DIRSYNC, ",dirsync" }, { MS_MANDLOCK, ",mand" }, + { MS_LAZYTIME, ",lazytime" }, { 0, NULL } }; const struct proc_fs_info *fs_infop; diff --git a/fs/sync.c b/fs/sync.c index 01d9f18a70b5..fbc98ee62044 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -177,8 +177,16 @@ SYSCALL_DEFINE1(syncfs, int, fd) */ int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync) { + struct inode *inode = file->f_mapping->host; + if (!file->f_op->fsync) return -EINVAL; + if (!datasync && (inode->i_state & I_DIRTY_TIME)) { + spin_lock(&inode->i_lock); + inode->i_state &= ~I_DIRTY_TIME; + spin_unlock(&inode->i_lock); + mark_inode_dirty_sync(inode); + } return file->f_op->fsync(file, start, end, datasync); } EXPORT_SYMBOL(vfs_fsync_range); diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 5da6012b7a14..4cdf7336f64a 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -55,6 +55,7 @@ struct bdi_writeback { struct list_head b_dirty; /* dirty inodes */ struct list_head b_io; /* parked for writeback */ struct list_head b_more_io; /* parked for more writeback */ + struct list_head b_dirty_time; /* time stamps are dirty */ spinlock_t list_lock; /* protects the b_* lists */ }; diff --git a/include/linux/fs.h b/include/linux/fs.h index 42efe13077b6..cd027ce2c705 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1746,8 +1746,12 @@ struct super_operations { #define __I_DIO_WAKEUP 9 #define I_DIO_WAKEUP (1 << I_DIO_WAKEUP) #define I_LINKABLE (1 << 10) +#define I_DIRTY_TIME (1 << 11) +#define __I_DIRTY_TIME_EXPIRED 12 +#define I_DIRTY_TIME_EXPIRED (1 << __I_DIRTY_TIME_EXPIRED) #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES) +#define I_DIRTY_ALL (I_DIRTY | I_DIRTY_TIME) extern void __mark_inode_dirty(struct inode *, int); static inline void mark_inode_dirty(struct inode *inode) @@ -1910,6 +1914,7 @@ extern int current_umask(void); extern void ihold(struct inode * inode); extern void iput(struct inode *); +extern int generic_update_time(struct inode *, struct timespec *, int); static inline struct inode *file_inode(const struct file *f) { diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index cee02d65ab3f..5ecb4c234625 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -18,6 +18,8 @@ {I_FREEING, "I_FREEING"}, \ {I_CLEAR, "I_CLEAR"}, \ {I_SYNC, "I_SYNC"}, \ + {I_DIRTY_TIME, "I_DIRTY_TIME"}, \ + {I_DIRTY_TIME_EXPIRED, "I_DIRTY_TIME_EXPIRED"}, \ {I_REFERENCED, "I_REFERENCED"} \ ) @@ -68,6 +70,7 @@ DECLARE_EVENT_CLASS(writeback_dirty_inode_template, TP_STRUCT__entry ( __array(char, name, 32) __field(unsigned long, ino) + __field(unsigned long, state) __field(unsigned long, flags) ), @@ -78,16 +81,25 @@ DECLARE_EVENT_CLASS(writeback_dirty_inode_template, strncpy(__entry->name, bdi->dev ? dev_name(bdi->dev) : "(unknown)", 32); __entry->ino = inode->i_ino; + __entry->state = inode->i_state; __entry->flags = flags; ), - TP_printk("bdi %s: ino=%lu flags=%s", + TP_printk("bdi %s: ino=%lu state=%s flags=%s", __entry->name, __entry->ino, + show_inode_state(__entry->state), show_inode_state(__entry->flags) ) ); +DEFINE_EVENT(writeback_dirty_inode_template, writeback_mark_inode_dirty, + + TP_PROTO(struct inode *inode, int flags), + + TP_ARGS(inode, flags) +); + DEFINE_EVENT(writeback_dirty_inode_template, writeback_dirty_inode_start, TP_PROTO(struct inode *inode, int flags), @@ -598,6 +610,52 @@ DEFINE_EVENT(writeback_single_inode_template, writeback_single_inode, TP_ARGS(inode, wbc, nr_to_write) ); +DECLARE_EVENT_CLASS(writeback_lazytime_template, + TP_PROTO(struct inode *inode), + + TP_ARGS(inode), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field(unsigned long, ino ) + __field(unsigned long, state ) + __field( __u16, mode ) + __field(unsigned long, dirtied_when ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->state = inode->i_state; + __entry->mode = inode->i_mode; + __entry->dirtied_when = inode->dirtied_when; + ), + + TP_printk("dev %d,%d ino %lu dirtied %lu state %s mode 0%o", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, __entry->dirtied_when, + show_inode_state(__entry->state), __entry->mode) +); + +DEFINE_EVENT(writeback_lazytime_template, writeback_lazytime, + TP_PROTO(struct inode *inode), + + TP_ARGS(inode) +); + +DEFINE_EVENT(writeback_lazytime_template, writeback_lazytime_iput, + TP_PROTO(struct inode *inode), + + TP_ARGS(inode) +); + +DEFINE_EVENT(writeback_lazytime_template, writeback_dirty_inode_enqueue, + + TP_PROTO(struct inode *inode), + + TP_ARGS(inode) +); + #endif /* _TRACE_WRITEBACK_H */ /* This part must be outside protection */ diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index 3735fa0a6784..9b964a5920af 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -90,6 +90,7 @@ struct inodes_stat_t { #define MS_KERNMOUNT (1<<22) /* this is a kern_mount call */ #define MS_I_VERSION (1<<23) /* Update inode I_version field */ #define MS_STRICTATIME (1<<24) /* Always perform atime updates */ +#define MS_LAZYTIME (1<<25) /* Update the on-disk [acm]times lazily */ /* These sb flags are internal to the kernel */ #define MS_NOSEC (1<<28) @@ -100,7 +101,8 @@ struct inodes_stat_t { /* * Superblock flags that can be altered by MS_REMOUNT */ -#define MS_RMT_MASK (MS_RDONLY|MS_SYNCHRONOUS|MS_MANDLOCK|MS_I_VERSION) +#define MS_RMT_MASK (MS_RDONLY|MS_SYNCHRONOUS|MS_MANDLOCK|MS_I_VERSION|\ + MS_LAZYTIME) /* * Old magic mount flag and mask diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 0ae0df55000b..915feea94c66 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -69,10 +69,10 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) unsigned long background_thresh; unsigned long dirty_thresh; unsigned long bdi_thresh; - unsigned long nr_dirty, nr_io, nr_more_io; + unsigned long nr_dirty, nr_io, nr_more_io, nr_dirty_time; struct inode *inode; - nr_dirty = nr_io = nr_more_io = 0; + nr_dirty = nr_io = nr_more_io = nr_dirty_time = 0; spin_lock(&wb->list_lock); list_for_each_entry(inode, &wb->b_dirty, i_wb_list) nr_dirty++; @@ -80,6 +80,9 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) nr_io++; list_for_each_entry(inode, &wb->b_more_io, i_wb_list) nr_more_io++; + list_for_each_entry(inode, &wb->b_dirty_time, i_wb_list) + if (inode->i_state & I_DIRTY_TIME) + nr_dirty_time++; spin_unlock(&wb->list_lock); global_dirty_limits(&background_thresh, &dirty_thresh); @@ -98,6 +101,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) "b_dirty: %10lu\n" "b_io: %10lu\n" "b_more_io: %10lu\n" + "b_dirty_time: %10lu\n" "bdi_list: %10u\n" "state: %10lx\n", (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)), @@ -111,6 +115,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) nr_dirty, nr_io, nr_more_io, + nr_dirty_time, !list_empty(&bdi->bdi_list), bdi->state); #undef K @@ -418,6 +423,7 @@ static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi) INIT_LIST_HEAD(&wb->b_dirty); INIT_LIST_HEAD(&wb->b_io); INIT_LIST_HEAD(&wb->b_more_io); + INIT_LIST_HEAD(&wb->b_dirty_time); spin_lock_init(&wb->list_lock); INIT_DELAYED_WORK(&wb->dwork, bdi_writeback_workfn); } -- cgit v1.2.3