summary refs log tree commit diff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/binfmt_elf.c4
-rw-r--r--fs/btrfs/extent-tree.c6
-rw-r--r--fs/btrfs/extent_map.c13
-rw-r--r--fs/btrfs/extent_map.h1
-rw-r--r--fs/btrfs/file-item.c4
-rw-r--r--fs/btrfs/file.c10
-rw-r--r--fs/btrfs/free-space-cache.c20
-rw-r--r--fs/btrfs/inode.c137
-rw-r--r--fs/btrfs/ioctl.c129
-rw-r--r--fs/btrfs/qgroup.c20
-rw-r--r--fs/btrfs/send.c4
-rw-r--r--fs/btrfs/super.c2
-rw-r--r--fs/btrfs/transaction.c19
-rw-r--r--fs/btrfs/tree-log.c10
-rw-r--r--fs/btrfs/volumes.c23
-rw-r--r--fs/cifs/cifs_dfs_ref.c2
-rw-r--r--fs/cifs/cifsfs.c5
-rw-r--r--fs/cifs/connect.c2
-rw-r--r--fs/ecryptfs/crypto.c9
-rw-r--r--fs/ecryptfs/dentry.c2
-rw-r--r--fs/ecryptfs/file.c2
-rw-r--r--fs/ecryptfs/inode.c8
-rw-r--r--fs/ecryptfs/keystore.c2
-rw-r--r--fs/ext2/inode.c12
-rw-r--r--fs/ext2/super.c2
-rw-r--r--fs/ext2/xattr.c4
-rw-r--r--fs/ext3/inode.c16
-rw-r--r--fs/ext3/namei.c1
-rw-r--r--fs/ext3/resize.c12
-rw-r--r--fs/ext3/super.c51
-rw-r--r--fs/ext3/xattr.c4
-rw-r--r--fs/ext4/balloc.c4
-rw-r--r--fs/ext4/dir.c1
-rw-r--r--fs/ext4/ext4.h45
-rw-r--r--fs/ext4/ext4_extents.h6
-rw-r--r--fs/ext4/extents.c407
-rw-r--r--fs/ext4/extents_status.c588
-rw-r--r--fs/ext4/extents_status.h50
-rw-r--r--fs/ext4/file.c14
-rw-r--r--fs/ext4/indirect.c249
-rw-r--r--fs/ext4/inline.c2
-rw-r--r--fs/ext4/inode.c481
-rw-r--r--fs/ext4/ioctl.c9
-rw-r--r--fs/ext4/mmp.c4
-rw-r--r--fs/ext4/move_extent.c3
-rw-r--r--fs/ext4/namei.c20
-rw-r--r--fs/ext4/page-io.c85
-rw-r--r--fs/ext4/resize.c26
-rw-r--r--fs/ext4/super.c107
-rw-r--r--fs/ext4/xattr.c5
-rw-r--r--fs/f2fs/checkpoint.c52
-rw-r--r--fs/f2fs/f2fs.h6
-rw-r--r--fs/f2fs/file.c14
-rw-r--r--fs/f2fs/gc.c9
-rw-r--r--fs/f2fs/inode.c53
-rw-r--r--fs/f2fs/node.c12
-rw-r--r--fs/f2fs/recovery.c2
-rw-r--r--fs/f2fs/segment.c8
-rw-r--r--fs/f2fs/super.c32
-rw-r--r--fs/gfs2/lock_dlm.c7
-rw-r--r--fs/jbd2/commit.c13
-rw-r--r--fs/jbd2/journal.c17
-rw-r--r--fs/jbd2/transaction.c7
-rw-r--r--fs/nfs/namespace.c20
-rw-r--r--fs/nfs/nfs4client.c62
-rw-r--r--fs/nfs/nfs4state.c22
-rw-r--r--fs/nfs/super.c22
-rw-r--r--fs/ocfs2/dir.c1
-rw-r--r--fs/pstore/platform.c35
-rw-r--r--fs/udf/super.c4
-rw-r--r--fs/xfs/xfs_aops.c2
-rw-r--r--fs/xfs/xfs_bmap.c6
-rw-r--r--fs/xfs/xfs_buf.c20
-rw-r--r--fs/xfs/xfs_buf_item.c12
-rw-r--r--fs/xfs/xfs_dfrag.c4
-rw-r--r--fs/xfs/xfs_iomap.c9
-rw-r--r--fs/xfs/xfs_mount.c2
-rw-r--r--fs/xfs/xfs_trace.h1
78 files changed, 1839 insertions, 1257 deletions
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 0c42cdbabecf..cfc22c9d75bc 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -321,6 +321,8 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
return 0;
}
+#ifndef elf_map
+
static unsigned long elf_map(struct file *filep, unsigned long addr,
struct elf_phdr *eppnt, int prot, int type,
unsigned long total_size)
@@ -355,6 +357,8 @@ static unsigned long elf_map(struct file *filep, unsigned long addr,
return(map_addr);
}
+#endif /* !elf_map */
+
static unsigned long total_mapping_size(struct elf_phdr *cmds, int nr)
{
int i, first_idx = -1, last_idx = -1;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 521e9d4424f6..a8b8adc05070 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3997,7 +3997,7 @@ again:
* We make the other tasks wait for the flush only when we can flush
* all things.
*/
- if (ret && flush == BTRFS_RESERVE_FLUSH_ALL) {
+ if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
flushing = true;
space_info->flush = 1;
}
@@ -5560,7 +5560,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
int empty_cluster = 2 * 1024 * 1024;
struct btrfs_space_info *space_info;
int loop = 0;
- int index = 0;
+ int index = __get_raid_index(data);
int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ?
RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC;
bool found_uncached_bg = false;
@@ -6788,11 +6788,13 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
&wc->flags[level]);
if (ret < 0) {
btrfs_tree_unlock_rw(eb, path->locks[level]);
+ path->locks[level] = 0;
return ret;
}
BUG_ON(wc->refs[level] == 0);
if (wc->refs[level] == 1) {
btrfs_tree_unlock_rw(eb, path->locks[level]);
+ path->locks[level] = 0;
return 1;
}
}
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index f169d6b11d7f..2e8cae63d247 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -171,6 +171,10 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next)
if (test_bit(EXTENT_FLAG_COMPRESSED, &prev->flags))
return 0;
+ if (test_bit(EXTENT_FLAG_LOGGING, &prev->flags) ||
+ test_bit(EXTENT_FLAG_LOGGING, &next->flags))
+ return 0;
+
if (extent_map_end(prev) == next->start &&
prev->flags == next->flags &&
prev->bdev == next->bdev &&
@@ -255,7 +259,8 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len,
if (!em)
goto out;
- list_move(&em->list, &tree->modified_extents);
+ if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags))
+ list_move(&em->list, &tree->modified_extents);
em->generation = gen;
clear_bit(EXTENT_FLAG_PINNED, &em->flags);
em->mod_start = em->start;
@@ -280,6 +285,12 @@ out:
}
+void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em)
+{
+ clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
+ try_merge_map(tree, em);
+}
+
/**
* add_extent_mapping - add new extent map to the extent tree
* @tree: tree to insert new map in
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 922943ce29e8..c6598c89cff8 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -69,6 +69,7 @@ void free_extent_map(struct extent_map *em);
int __init extent_map_init(void);
void extent_map_exit(void);
int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, u64 gen);
+void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em);
struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
u64 start, u64 len);
#endif
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index bd38cef42358..94aa53b38721 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -460,8 +460,8 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
if (!contig)
offset = page_offset(bvec->bv_page) + bvec->bv_offset;
- if (!contig && (offset >= ordered->file_offset + ordered->len ||
- offset < ordered->file_offset)) {
+ if (offset >= ordered->file_offset + ordered->len ||
+ offset < ordered->file_offset) {
unsigned long bytes_left;
sums->len = this_sum_bytes;
this_sum_bytes = 0;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 77061bf43edb..f76b1fd160d4 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -2241,6 +2241,7 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)
if (lockend <= lockstart)
lockend = lockstart + root->sectorsize;
+ lockend--;
len = lockend - lockstart + 1;
len = max_t(u64, len, root->sectorsize);
@@ -2307,9 +2308,12 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)
}
}
- *offset = start;
- free_extent_map(em);
- break;
+ if (!test_bit(EXTENT_FLAG_PREALLOC,
+ &em->flags)) {
+ *offset = start;
+ free_extent_map(em);
+ break;
+ }
}
}
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 59ea2e4349c9..0be7a8742a43 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -1862,11 +1862,13 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
{
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
struct btrfs_free_space *info;
- int ret = 0;
+ int ret;
+ bool re_search = false;
spin_lock(&ctl->tree_lock);
again:
+ ret = 0;
if (!bytes)
goto out_lock;
@@ -1879,17 +1881,17 @@ again:
info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
1, 0);
if (!info) {
- /* the tree logging code might be calling us before we
- * have fully loaded the free space rbtree for this
- * block group. So it is possible the entry won't
- * be in the rbtree yet at all. The caching code
- * will make sure not to put it in the rbtree if
- * the logging code has pinned it.
+ /*
+ * If we found a partial bit of our free space in a
+ * bitmap but then couldn't find the other part this may
+ * be a problem, so WARN about it.
*/
+ WARN_ON(re_search);
goto out_lock;
}
}
+ re_search = false;
if (!info->bitmap) {
unlink_free_space(ctl, info);
if (offset == info->offset) {
@@ -1935,8 +1937,10 @@ again:
}
ret = remove_from_bitmap(ctl, info, &offset, &bytes);
- if (ret == -EAGAIN)
+ if (ret == -EAGAIN) {
+ re_search = true;
goto again;
+ }
BUG_ON(ret); /* logic error */
out_lock:
spin_unlock(&ctl->tree_lock);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 16d9e8e191e6..cc93b23ca352 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -88,7 +88,7 @@ static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
[S_IFLNK >> S_SHIFT] = BTRFS_FT_SYMLINK,
};
-static int btrfs_setsize(struct inode *inode, loff_t newsize);
+static int btrfs_setsize(struct inode *inode, struct iattr *attr);
static int btrfs_truncate(struct inode *inode);
static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
static noinline int cow_file_range(struct inode *inode,
@@ -2478,6 +2478,18 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
continue;
}
nr_truncate++;
+
+ /* 1 for the orphan item deletion. */
+ trans = btrfs_start_transaction(root, 1);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out;
+ }
+ ret = btrfs_orphan_add(trans, inode);
+ btrfs_end_transaction(trans, root);
+ if (ret)
+ goto out;
+
ret = btrfs_truncate(inode);
} else {
nr_unlink++;
@@ -3665,6 +3677,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
block_end - cur_offset, 0);
if (IS_ERR(em)) {
err = PTR_ERR(em);
+ em = NULL;
break;
}
last_byte = min(extent_map_end(em), block_end);
@@ -3748,16 +3761,27 @@ next:
return err;
}
-static int btrfs_setsize(struct inode *inode, loff_t newsize)
+static int btrfs_setsize(struct inode *inode, struct iattr *attr)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_trans_handle *trans;
loff_t oldsize = i_size_read(inode);
+ loff_t newsize = attr->ia_size;
+ int mask = attr->ia_valid;
int ret;
if (newsize == oldsize)
return 0;
+ /*
+ * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a
+ * special case where we need to update the times despite not having
+ * these flags set. For all other operations the VFS set these flags
+ * explicitly if it wants a timestamp update.
+ */
+ if (newsize != oldsize && (!(mask & (ATTR_CTIME | ATTR_MTIME))))
+ inode->i_ctime = inode->i_mtime = current_fs_time(inode->i_sb);
+
if (newsize > oldsize) {
truncate_pagecache(inode, oldsize, newsize);
ret = btrfs_cont_expand(inode, oldsize, newsize);
@@ -3783,9 +3807,34 @@ static int btrfs_setsize(struct inode *inode, loff_t newsize)
set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
&BTRFS_I(inode)->runtime_flags);
+ /*
+ * 1 for the orphan item we're going to add
+ * 1 for the orphan item deletion.
+ */
+ trans = btrfs_start_transaction(root, 2);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ /*
+ * We need to do this in case we fail at _any_ point during the
+ * actual truncate. Once we do the truncate_setsize we could
+ * invalidate pages which forces any outstanding ordered io to
+ * be instantly completed which will give us extents that need
+ * to be truncated. If we fail to get an orphan inode down we
+ * could have left over extents that were never meant to live,
+ * so we need to guarantee from this point on that everything
+ * will be consistent.
+ */
+ ret = btrfs_orphan_add(trans, inode);
+ btrfs_end_transaction(trans, root);
+ if (ret)
+ return ret;
+
/* we don't support swapfiles, so vmtruncate shouldn't fail */
truncate_setsize(inode, newsize);
ret = btrfs_truncate(inode);
+ if (ret && inode->i_nlink)
+ btrfs_orphan_del(NULL, inode);
}
return ret;
@@ -3805,7 +3854,7 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
return err;
if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
- err = btrfs_setsize(inode, attr->ia_size);
+ err = btrfs_setsize(inode, attr);
if (err)
return err;
}
@@ -5572,10 +5621,13 @@ struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *pag
return em;
if (em) {
/*
- * if our em maps to a hole, there might
- * actually be delalloc bytes behind it
+ * if our em maps to
+ * - a hole or
+ * - a pre-alloc extent,
+ * there might actually be delalloc bytes behind it.
*/
- if (em->block_start != EXTENT_MAP_HOLE)
+ if (em->block_start != EXTENT_MAP_HOLE &&
+ !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
return em;
else
hole_em = em;
@@ -5657,6 +5709,8 @@ struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *pag
*/
em->block_start = hole_em->block_start;
em->block_len = hole_len;
+ if (test_bit(EXTENT_FLAG_PREALLOC, &hole_em->flags))
+ set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
} else {
em->start = range_start;
em->len = found;
@@ -6915,11 +6969,9 @@ static int btrfs_truncate(struct inode *inode)
/*
* 1 for the truncate slack space
- * 1 for the orphan item we're going to add
- * 1 for the orphan item deletion
* 1 for updating the inode.
*/
- trans = btrfs_start_transaction(root, 4);
+ trans = btrfs_start_transaction(root, 2);
if (IS_ERR(trans)) {
err = PTR_ERR(trans);
goto out;
@@ -6930,12 +6982,6 @@ static int btrfs_truncate(struct inode *inode)
min_size);
BUG_ON(ret);
- ret = btrfs_orphan_add(trans, inode);
- if (ret) {
- btrfs_end_transaction(trans, root);
- goto out;
- }
-
/*
* setattr is responsible for setting the ordered_data_close flag,
* but that is only tested during the last file release. That
@@ -7004,12 +7050,6 @@ static int btrfs_truncate(struct inode *inode)
ret = btrfs_orphan_del(trans, inode);
if (ret)
err = ret;
- } else if (ret && inode->i_nlink > 0) {
- /*
- * Failed to do the truncate, remove us from the in memory
- * orphan list.
- */
- ret = btrfs_orphan_del(NULL, inode);
}
if (trans) {
@@ -7531,41 +7571,61 @@ void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work)
*/
int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
{
- struct list_head *head = &root->fs_info->delalloc_inodes;
struct btrfs_inode *binode;
struct inode *inode;
struct btrfs_delalloc_work *work, *next;
struct list_head works;
+ struct list_head splice;
int ret = 0;
if (root->fs_info->sb->s_flags & MS_RDONLY)
return -EROFS;
INIT_LIST_HEAD(&works);
-
+ INIT_LIST_HEAD(&splice);
+again:
spin_lock(&root->fs_info->delalloc_lock);
- while (!list_empty(head)) {
- binode = list_entry(head->next, struct btrfs_inode,
+ list_splice_init(&root->fs_info->delalloc_inodes, &splice);
+ while (!list_empty(&splice)) {
+ binode = list_entry(splice.next, struct btrfs_inode,
delalloc_inodes);
+
+ list_del_init(&binode->delalloc_inodes);
+
inode = igrab(&binode->vfs_inode);
if (!inode)
- list_del_init(&binode->delalloc_inodes);
+ continue;
+
+ list_add_tail(&binode->delalloc_inodes,
+ &root->fs_info->delalloc_inodes);
spin_unlock(&root->fs_info->delalloc_lock);
- if (inode) {
- work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
- if (!work) {
- ret = -ENOMEM;
- goto out;
- }
- list_add_tail(&work->list, &works);
- btrfs_queue_worker(&root->fs_info->flush_workers,
- &work->work);
+
+ work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
+ if (unlikely(!work)) {
+ ret = -ENOMEM;
+ goto out;
}
+ list_add_tail(&work->list, &works);
+ btrfs_queue_worker(&root->fs_info->flush_workers,
+ &work->work);
+
cond_resched();
spin_lock(&root->fs_info->delalloc_lock);
}
spin_unlock(&root->fs_info->delalloc_lock);
+ list_for_each_entry_safe(work, next, &works, list) {
+ list_del_init(&work->list);
+ btrfs_wait_and_free_delalloc_work(work);
+ }
+
+ spin_lock(&root->fs_info->delalloc_lock);
+ if (!list_empty(&root->fs_info->delalloc_inodes)) {
+ spin_unlock(&root->fs_info->delalloc_lock);
+ goto again;
+ }
+ spin_unlock(&root->fs_info->delalloc_lock);
+
/* the filemap_flush will queue IO into the worker threads, but
* we have to make sure the IO is actually started and that
* ordered extents get created before we return
@@ -7578,11 +7638,18 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
atomic_read(&root->fs_info->async_delalloc_pages) == 0));
}
atomic_dec(&root->fs_info->async_submit_draining);
+ return 0;
out:
list_for_each_entry_safe(work, next, &works, list) {
list_del_init(&work->list);
btrfs_wait_and_free_delalloc_work(work);
}
+
+ if (!list_empty_careful(&splice)) {
+ spin_lock(&root->fs_info->delalloc_lock);
+ list_splice_tail(&splice, &root->fs_info->delalloc_inodes);
+ spin_unlock(&root->fs_info->delalloc_lock);
+ }
return ret;
}
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 4b4516770f05..5b22d45d3c6a 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1339,7 +1339,8 @@ static noinline int btrfs_ioctl_resize(struct file *file,
if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
1)) {
pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
- return -EINPROGRESS;
+ mnt_drop_write_file(file);
+ return -EINVAL;
}
mutex_lock(&root->fs_info->volume_mutex);
@@ -1362,6 +1363,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
printk(KERN_INFO "btrfs: resizing devid %llu\n",
(unsigned long long)devid);
}
+
device = btrfs_find_device(root->fs_info, devid, NULL, NULL);
if (!device) {
printk(KERN_INFO "btrfs: resizer unable to find device %llu\n",
@@ -1369,9 +1371,10 @@ static noinline int btrfs_ioctl_resize(struct file *file,
ret = -EINVAL;
goto out_free;
}
- if (device->fs_devices && device->fs_devices->seeding) {
+
+ if (!device->writeable) {
printk(KERN_INFO "btrfs: resizer unable to apply on "
- "seeding device %llu\n",
+ "readonly device %llu\n",
(unsigned long long)devid);
ret = -EINVAL;
goto out_free;
@@ -1443,8 +1446,8 @@ out_free:
kfree(vol_args);
out:
mutex_unlock(&root->fs_info->volume_mutex);
- mnt_drop_write_file(file);
atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
+ mnt_drop_write_file(file);
return ret;
}
@@ -2095,13 +2098,13 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
err = inode_permission(inode, MAY_WRITE | MAY_EXEC);
if (err)
goto out_dput;
-
- /* check if subvolume may be deleted by a non-root user */
- err = btrfs_may_delete(dir, dentry, 1);
- if (err)
- goto out_dput;
}
+ /* check if subvolume may be deleted by a user */
+ err = btrfs_may_delete(dir, dentry, 1);
+ if (err)
+ goto out_dput;
+
if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) {
err = -EINVAL;
goto out_dput;
@@ -2183,19 +2186,20 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
struct btrfs_ioctl_defrag_range_args *range;
int ret;
- if (btrfs_root_readonly(root))
- return -EROFS;
+ ret = mnt_want_write_file(file);
+ if (ret)
+ return ret;
if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
1)) {
pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
- return -EINPROGRESS;
+ mnt_drop_write_file(file);
+ return -EINVAL;
}
- ret = mnt_want_write_file(file);
- if (ret) {
- atomic_set(&root->fs_info->mutually_exclusive_operation_running,
- 0);
- return ret;
+
+ if (btrfs_root_readonly(root)) {
+ ret = -EROFS;
+ goto out;
}
switch (inode->i_mode & S_IFMT) {
@@ -2247,8 +2251,8 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
ret = -EINVAL;
}
out:
- mnt_drop_write_file(file);
atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
+ mnt_drop_write_file(file);
return ret;
}
@@ -2263,7 +2267,7 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
1)) {
pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
- return -EINPROGRESS;
+ return -EINVAL;
}
mutex_lock(&root->fs_info->volume_mutex);
@@ -2300,7 +2304,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
1)) {
pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
mnt_drop_write_file(file);
- return -EINPROGRESS;
+ return -EINVAL;
}
mutex_lock(&root->fs_info->volume_mutex);
@@ -2316,8 +2320,8 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
kfree(vol_args);
out:
mutex_unlock(&root->fs_info->volume_mutex);
- mnt_drop_write_file(file);
atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
+ mnt_drop_write_file(file);
return ret;
}
@@ -3437,8 +3441,8 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg)
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_ioctl_balance_args *bargs;
struct btrfs_balance_control *bctl;
+ bool need_unlock; /* for mut. excl. ops lock */
int ret;
- int need_to_clear_lock = 0;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -3447,14 +3451,61 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg)
if (ret)
return ret;
- mutex_lock(&fs_info->volume_mutex);
+again:
+ if (!atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)) {
+ mutex_lock(&fs_info->volume_mutex);
+ mutex_lock(&fs_info->balance_mutex);
+ need_unlock = true;
+ goto locked;
+ }
+
+ /*
+ * mut. excl. ops lock is locked. Three possibilities:
+ * (1) some other op is running
+ * (2) balance is running
+ * (3) balance is paused -- special case (think resume)
+ */
mutex_lock(&fs_info->balance_mutex);
+ if (fs_info->balance_ctl) {
+ /* this is either (2) or (3) */
+ if (!atomic_read(&fs_info->balance_running)) {
+ mutex_unlock(&fs_info->balance_mutex);
+ if (!mutex_trylock(&fs_info->volume_mutex))
+ goto again;
+ mutex_lock(&fs_info->balance_mutex);
+
+ if (fs_info->balance_ctl &&
+ !atomic_read(&fs_info->balance_running)) {
+ /* this is (3) */
+ need_unlock = false;
+ goto locked;
+ }
+
+ mutex_unlock(&fs_info->balance_mutex);
+ mutex_unlock(&fs_info->volume_mutex);
+ goto again;
+ } else {
+ /* this is (2) */
+ mutex_unlock(&fs_info->balance_mutex);
+ ret = -EINPROGRESS;
+ goto out;
+ }
+ } else {
+ /* this is (1) */
+ mutex_unlock(&fs_info->balance_mutex);
+ pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
+locked:
+ BUG_ON(!atomic_read(&fs_info->mutually_exclusive_operation_running));
if (arg) {
bargs = memdup_user(arg, sizeof(*bargs));
if (IS_ERR(bargs)) {
ret = PTR_ERR(bargs);
- goto out;
+ goto out_unlock;
}
if (bargs->flags & BTRFS_BALANCE_RESUME) {
@@ -3474,13 +3525,10 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg)
bargs = NULL;
}
- if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
- 1)) {
- pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
+ if (fs_info->balance_ctl) {
ret = -EINPROGRESS;
goto out_bargs;
}
- need_to_clear_lock = 1;
bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
if (!bctl) {
@@ -3501,11 +3549,17 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg)
}
do_balance:
- ret = btrfs_balance(bctl, bargs);
/*
- * bctl is freed in __cancel_balance or in free_fs_info if
- * restriper was paused all the way until unmount
+ * Ownership of bctl and mutually_exclusive_operation_running
+ * goes to btrfs_balance. bctl is freed in __cancel_balance,
+ * or, if restriper was paused all the way until unmount, in
+ * free_fs_info. mutually_exclusive_operation_running is
+ * cleared in __cancel_balance.
*/
+ need_unlock = false;
+
+ ret = btrfs_balance(bctl, bargs);
+
if (arg) {
if (copy_to_user(arg, bargs, sizeof(*bargs)))
ret = -EFAULT;
@@ -3513,12 +3567,12 @@ do_balance:
out_bargs:
kfree(bargs);
-out:
- if (need_to_clear_lock)
- atomic_set(&root->fs_info->mutually_exclusive_operation_running,
- 0);
+out_unlock:
mutex_unlock(&fs_info->balance_mutex);
mutex_unlock(&fs_info->volume_mutex);
+ if (need_unlock)
+ atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
+out:
mnt_drop_write_file(file);
return ret;
}
@@ -3698,6 +3752,11 @@ static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg)
goto drop_write;
}
+ if (!sa->qgroupid) {
+ ret = -EINVAL;
+ goto out;
+ }
+
trans = btrfs_join_transaction(root);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index fe9d02c45f8e..a5c856234323 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -379,6 +379,13 @@ next1:
ret = add_relation_rb(fs_info, found_key.objectid,
found_key.offset);
+ if (ret == -ENOENT) {
+ printk(KERN_WARNING
+ "btrfs: orphan qgroup relation 0x%llx->0x%llx\n",
+ (unsigned long long)found_key.objectid,
+ (unsigned long long)found_key.offset);
+ ret = 0; /* ignore the error */
+ }
if (ret)
goto out;
next2:
@@ -956,17 +963,28 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 qgroupid)
{
struct btrfs_root *quota_root;
+ struct btrfs_qgroup *qgroup;
int ret = 0;
quota_root = fs_info->quota_root;
if (!quota_root)
return -EINVAL;
+ /* check if there are no relations to this qgroup */
+ spin_lock(&fs_info->qgroup_lock);
+ qgroup = find_qgroup_rb(fs_info, qgroupid);
+ if (qgroup) {
+ if (!list_empty(&qgroup->groups) || !list_empty(&qgroup->members)) {
+ spin_unlock(&fs_info->qgroup_lock);
+ return -EBUSY;
+ }
+ }
+ spin_unlock(&fs_info->qgroup_lock);
+
ret = del_qgroup_item(trans, quota_root, qgroupid);
spin_lock(&fs_info->qgroup_lock);
del_qgroup_rb(quota_root->fs_info, qgroupid);
-
spin_unlock(&fs_info->qgroup_lock);
return ret;
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 54454542ad40..321b7fb4e441 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -1814,8 +1814,10 @@ static int name_cache_insert(struct send_ctx *sctx,
(unsigned long)nce->ino);
if (!nce_head) {
nce_head = kmalloc(sizeof(*nce_head), GFP_NOFS);
- if (!nce_head)
+ if (!nce_head) {
+ kfree(nce);
return -ENOMEM;
+ }
INIT_LIST_HEAD(nce_head);
ret = radix_tree_insert(&sctx->name_cache, nce->ino, nce_head);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 99545df1b86c..d8982e9601d3 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -267,7 +267,7 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
function, line, errstr);
return;
}
- trans->transaction->aborted = errno;
+ ACCESS_ONCE(trans->transaction->aborted) = errno;
__btrfs_std_error(root->fs_info, function, line, errno, NULL);
}
/*
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 87fac9a21ea5..f15494699f3b 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1468,7 +1468,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
goto cleanup_transaction;
}
- if (cur_trans->aborted) {
+ /* Stop the commit early if ->aborted is set */
+ if (unlikely(ACCESS_ONCE(cur_trans->aborted))) {
ret = cur_trans->aborted;
goto cleanup_transaction;
}
@@ -1574,6 +1575,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
wait_event(cur_trans->writer_wait,
atomic_read(&cur_trans->num_writers) == 1);
+ /* ->aborted might be set after the previous check, so check it */
+ if (unlikely(ACCESS_ONCE(cur_trans->aborted))) {
+ ret = cur_trans->aborted;
+ goto cleanup_transaction;
+ }
/*
* the reloc mutex makes sure that we stop
* the balancing code from coming in and moving
@@ -1657,6 +1663,17 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
goto cleanup_transaction;
}
+ /*
+ * The tasks which save the space cache and inode cache may also
+ * update ->aborted, check it.
+ */
+ if (unlikely(ACCESS_ONCE(cur_trans->aborted))) {
+ ret = cur_trans->aborted;
+ mutex_unlock(&root->fs_info->tree_log_mutex);
+ mutex_unlock(&root->fs_info->reloc_mutex);
+ goto cleanup_transaction;
+ }
+
btrfs_prepare_extent_commit(trans, root);
cur_trans = root->fs_info->running_transaction;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 83186c7e45d4..9027bb1e7466 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -3357,6 +3357,11 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
if (skip_csum)
return 0;
+ if (em->compress_type) {
+ csum_offset = 0;
+ csum_len = block_len;
+ }
+
/* block start is already adjusted for the file extent offset. */
ret = btrfs_lookup_csums_range(log->fs_info->csum_root,
em->block_start + csum_offset,
@@ -3410,13 +3415,13 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
em = list_entry(extents.next, struct extent_map, list);
list_del_init(&em->list);
- clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
/*
* If we had an error we just need to delete everybody from our
* private list.
*/
if (ret) {
+ clear_em_logging(tree, em);
free_extent_map(em);
continue;
}
@@ -3424,8 +3429,9 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
write_unlock(&tree->lock);
ret = log_one_extent(trans, inode, root, em, path);
- free_extent_map(em);
write_lock(&tree->lock);
+ clear_em_logging(tree, em);
+ free_extent_map(em);
}
WARN_ON(!list_empty(&extents));
write_unlock(&tree->lock);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 5cce6aa74012..15f6efdf6463 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1431,7 +1431,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
}
} else {
ret = btrfs_get_bdev_and_sb(device_path,
- FMODE_READ | FMODE_EXCL,
+ FMODE_WRITE | FMODE_EXCL,
root->fs_info->bdev_holder, 0,
&bdev, &bh);
if (ret)
@@ -2614,7 +2614,14 @@ static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
cache = btrfs_lookup_block_group(fs_info, chunk_offset);
chunk_used = btrfs_block_group_used(&cache->item);
- user_thresh = div_factor_fine(cache->key.offset, bargs->usage);
+ if (bargs->usage == 0)
+ user_thresh = 0;
+ else if (bargs->usage > 100)
+ user_thresh = cache->key.offset;
+ else
+ user_thresh = div_factor_fine(cache->key.offset,
+ bargs->usage);
+
if (chunk_used < user_thresh)
ret = 0;
@@ -2959,6 +2966,8 @@ static void __cancel_balance(struct btrfs_fs_info *fs_info)
unset_balance_control(fs_info);
ret = del_balance_item(fs_info->tree_root);
BUG_ON(ret);
+
+ atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
}
void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
@@ -3138,8 +3147,10 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
out:
if (bctl->flags & BTRFS_BALANCE_RESUME)
__cancel_balance(fs_info);
- else
+ else {
kfree(bctl);
+ atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
+ }
return ret;
}
@@ -3156,7 +3167,6 @@ static int balance_kthread(void *data)
ret = btrfs_balance(fs_info->balance_ctl, NULL);
}
- atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
mutex_unlock(&fs_info->balance_mutex);
mutex_unlock(&fs_info->volume_mutex);
@@ -3179,7 +3189,6 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
return 0;
}
- WARN_ON(atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1));
tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
if (IS_ERR(tsk))
return PTR_ERR(tsk);
@@ -3233,6 +3242,8 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
btrfs_balance_sys(leaf, item, &disk_bargs);
btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
+ WARN_ON(atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1));
+
mutex_lock(&fs_info->volume_mutex);
mutex_lock(&fs_info->balance_mutex);
@@ -3496,7 +3507,7 @@ struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
{ 1, 1, 2, 2, 2, 2 /* raid1 */ },
{ 1, 2, 1, 1, 1, 2 /* dup */ },
{ 1, 1, 0, 2, 1, 1 /* raid0 */ },
- { 1, 1, 0, 1, 1, 1 /* single */ },
+ { 1, 1, 1, 1, 1, 1 /* single */ },
};
static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index ce5cbd717bfc..210fce2df308 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -226,6 +226,8 @@ compose_mount_options_out:
compose_mount_options_err:
kfree(mountdata);
mountdata = ERR_PTR(rc);
+ kfree(*devname);
+ *devname = NULL;
goto compose_mount_options_out;
}
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index de7f9168a118..e32833980fdb 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -558,6 +558,11 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb)
dentry = ERR_PTR(-ENOENT);
break;
}
+ if (!S_ISDIR(dir->i_mode)) {
+ dput(dentry);
+ dentry = ERR_PTR(-ENOTDIR);
+ break;
+ }
/* skip separators */
while (*s == sep)
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 17c3643e5950..12b3da39733b 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1917,7 +1917,7 @@ srcip_matches(struct sockaddr *srcaddr, struct sockaddr *rhs)
}
case AF_INET6: {
struct sockaddr_in6 *saddr6 = (struct sockaddr_in6 *)srcaddr;
- struct sockaddr_in6 *vaddr6 = (struct sockaddr_in6 *)&rhs;
+ struct sockaddr_in6 *vaddr6 = (struct sockaddr_in6 *)rhs;
return ipv6_addr_equal(&saddr6->sin6_addr, &vaddr6->sin6_addr);
}
default:
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index a7b0c2dfb3db..d5c25db4398f 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -301,17 +301,14 @@ int virt_to_scatterlist(const void *addr, int size, struct scatterlist *sg,
while (size > 0 && i < sg_size) {
pg = virt_to_page(addr);
offset = offset_in_page(addr);
- if (sg)
- sg_set_page(&sg[i], pg, 0, offset);
+ sg_set_page(&sg[i], pg, 0, offset);
remainder_of_page = PAGE_CACHE_SIZE - offset;
if (size >= remainder_of_page) {
- if (sg)
- sg[i].length = remainder_of_page;
+ sg[i].length = remainder_of_page;
addr += remainder_of_page;
size -= remainder_of_page;
} else {
- if (sg)
- sg[i].length = size;
+ sg[i].length = size;
addr += size;
size = 0;
}
diff --git a/fs/ecryptfs/dentry.c b/fs/ecryptfs/dentry.c
index 1b5d9af937df..bf12ba5dd223 100644
--- a/fs/ecryptfs/dentry.c
+++ b/fs/ecryptfs/dentry.c
@@ -45,14 +45,12 @@
static int ecryptfs_d_revalidate(struct dentry *dentry, unsigned int flags)
{
struct dentry *lower_dentry;
- struct vfsmount *lower_mnt;
int rc = 1;
if (flags & LOOKUP_RCU)
return -ECHILD;
lower_dentry = ecryptfs_dentry_to_lower(dentry);
- lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry);
if (!lower_dentry->d_op || !lower_dentry->d_op->d_revalidate)
goto out;
rc = lower_dentry->d_op->d_revalidate(lower_dentry, flags);
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index d45ba4568128..bfa52d2ef460 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -199,7 +199,6 @@ static int ecryptfs_open(struct inode *inode, struct file *file)
struct dentry *ecryptfs_dentry = file->f_path.dentry;
/* Private value of ecryptfs_dentry allocated in
* ecryptfs_lookup() */
- struct dentry *lower_dentry;
struct ecryptfs_file_info *file_info;
mount_crypt_stat = &ecryptfs_superblock_to_private(
@@ -222,7 +221,6 @@ static int ecryptfs_open(struct inode *inode, struct file *file)
rc = -ENOMEM;
goto out;
}
- lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry);
crypt_stat = &ecryptfs_inode_to_private(inode)->crypt_stat;
mutex_lock(&crypt_stat->cs_mutex);
if (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED)) {
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index cc7709e7c508..ddd961ba2cf9 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -999,8 +999,8 @@ out:
return rc;
}
-int ecryptfs_getattr_link(struct vfsmount *mnt, struct dentry *dentry,
- struct kstat *stat)
+static int ecryptfs_getattr_link(struct vfsmount *mnt, struct dentry *dentry,
+ struct kstat *stat)
{
struct ecryptfs_mount_crypt_stat *mount_crypt_stat;
int rc = 0;
@@ -1021,8 +1021,8 @@ int ecryptfs_getattr_link(struct vfsmount *mnt, struct dentry *dentry,
return rc;
}
-int ecryptfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
- struct kstat *stat)
+static int ecryptfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
+ struct kstat *stat)
{
struct kstat lower_stat;
int rc;
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index 2333203a120b..6154cde3a052 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -1150,7 +1150,7 @@ decrypt_pki_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok,
struct ecryptfs_message *msg = NULL;
char *auth_tok_sig;
char *payload;
- size_t payload_len;
+ size_t payload_len = 0;
int rc;
rc = ecryptfs_get_auth_tok_sig(&auth_tok_sig, auth_tok);
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 6363ac66fafa..c3881e56662e 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -495,6 +495,10 @@ static int ext2_alloc_branch(struct inode *inode,
* parent to disk.
*/
bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
+ if (unlikely(!bh)) {
+ err = -ENOMEM;
+ goto failed;
+ }
branch[n].bh = bh;
lock_buffer(bh);
memset(bh->b_data, 0, blocksize);
@@ -523,6 +527,14 @@ static int ext2_alloc_branch(struct inode *inode,
}
*blks = num;
return err;
+
+failed:
+ for (i = 1; i < n; i++)
+ bforget(branch[i].bh);
+ for (i = 0; i < indirect_blks; i++)
+ ext2_free_blocks(inode, new_blocks[i], 1);
+ ext2_free_blocks(inode, new_blocks[i], num);
+ return err;
}
/**
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index fa04d023177e..7f68c8114026 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -1500,7 +1500,7 @@ static ssize_t ext2_quota_write(struct super_block *sb, int type,
bh = sb_bread(sb, tmp_bh.b_blocknr);
else
bh = sb_getblk(sb, tmp_bh.b_blocknr);
- if (!bh) {
+ if (unlikely(!bh)) {
err = -EIO;
goto out;
}
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index b6754dbbce3c..2d7557db3ae8 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -662,10 +662,10 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
ea_idebug(inode, "creating block %d", block);
new_bh = sb_getblk(sb, block);
- if (!new_bh) {
+ if (unlikely(!new_bh)) {
ext2_free_blocks(inode, block, 1);
mark_inode_dirty(inode);
- error = -EIO;
+ error = -ENOMEM;
goto cleanup;
}
lock_buffer(new_bh);
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index b176d4253544..d512c4bc4ad7 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -676,6 +676,10 @@ static int ext3_alloc_branch(handle_t *handle, struct inode *inode,
* parent to disk.
*/
bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
+ if (unlikely(!bh)) {
+ err = -ENOMEM;
+ goto failed;
+ }
branch[n].bh = bh;
lock_buffer(bh);
BUFFER_TRACE(bh, "call get_create_access");
@@ -717,7 +721,7 @@ failed:
BUFFER_TRACE(branch[i].bh, "call journal_forget");
ext3_journal_forget(handle, branch[i].bh);
}
- for (i = 0; i <indirect_blks; i++)
+ for (i = 0; i < indirect_blks; i++)
ext3_free_blocks(handle, inode, new_blocks[i], 1);
ext3_free_blocks(handle, inode, new_blocks[i], num);
@@ -1078,8 +1082,8 @@ struct buffer_head *ext3_getblk(handle_t *handle, struct inode *inode,
if (!err && buffer_mapped(&dummy)) {
struct buffer_head *bh;
bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
- if (!bh) {
- *errp = -EIO;
+ if (unlikely(!bh)) {
+ *errp = -ENOMEM;
goto err;
}
if (buffer_new(&dummy)) {
@@ -2729,12 +2733,12 @@ static int __ext3_get_inode_loc(struct inode *inode,
return -EIO;
bh = sb_getblk(inode->i_sb, block);
- if (!bh) {
+ if (unlikely(!bh)) {
ext3_error (inode->i_sb, "ext3_get_inode_loc",
"unable to read inode block - "
"inode=%lu, block="E3FSBLK,
inode->i_ino, block);
- return -EIO;
+ return -ENOMEM;
}
if (!buffer_uptodate(bh)) {
lock_buffer(bh);
@@ -2783,7 +2787,7 @@ static int __ext3_get_inode_loc(struct inode *inode,
bitmap_bh = sb_getblk(inode->i_sb,
le32_to_cpu(desc->bg_inode_bitmap));
- if (!bitmap_bh)
+ if (unlikely(!bitmap_bh))
goto make_io;
/*
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 890b8947c546..88f64eb1b6fa 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -36,7 +36,6 @@
#define NAMEI_RA_CHUNKS 2
#define NAMEI_RA_BLOCKS 4
#define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
-#define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b))
static struct buffer_head *ext3_append(handle_t *handle,
struct inode *inode,
diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c
index 0f814f3450de..27105655502c 100644
--- a/fs/ext3/resize.c
+++ b/fs/ext3/resize.c
@@ -116,8 +116,8 @@ static struct buffer_head *bclean(handle_t *handle, struct super_block *sb,
int err;
bh = sb_getblk(sb, blk);
- if (!bh)
- return ERR_PTR(-EIO);
+ if (unlikely(!bh))
+ return ERR_PTR(-ENOMEM);
if ((err = ext3_journal_get_write_access(handle, bh))) {
brelse(bh);
bh = ERR_PTR(err);
@@ -234,8 +234,8 @@ static int setup_new_group_blocks(struct super_block *sb,
goto exit_bh;
gdb = sb_getblk(sb, block);
- if (!gdb) {
- err = -EIO;
+ if (unlikely(!gdb)) {
+ err = -ENOMEM;
goto exit_bh;
}
if ((err = ext3_journal_get_write_access(handle, gdb))) {
@@ -722,8 +722,8 @@ static void update_backups(struct super_block *sb,
break;
bh = sb_getblk(sb, group * bpg + blk_off);
- if (!bh) {
- err = -EIO;
+ if (unlikely(!bh)) {
+ err = -ENOMEM;
break;
}
ext3_debug("update metadata backup %#04lx\n",
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 6e50223b3299..0926fe46ae3e 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -916,21 +916,24 @@ static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)
"Not enough memory for storing quotafile name");
return 0;
}
- if (sbi->s_qf_names[qtype] &&
- strcmp(sbi->s_qf_names[qtype], qname)) {
- ext3_msg(sb, KERN_ERR,
- "%s quota file already specified", QTYPE2NAME(qtype));
+ if (sbi->s_qf_names[qtype]) {
+ int same = !strcmp(sbi->s_qf_names[qtype], qname);
+
kfree(qname);
- return 0;
+ if (!same) {
+ ext3_msg(sb, KERN_ERR,
+ "%s quota file already specified",
+ QTYPE2NAME(qtype));
+ }
+ return same;
}
- sbi->s_qf_names[qtype] = qname;
- if (strchr(sbi->s_qf_names[qtype], '/')) {
+ if (strchr(qname, '/')) {
ext3_msg(sb, KERN_ERR,
"quotafile must be on filesystem root");
- kfree(sbi->s_qf_names[qtype]);
- sbi->s_qf_names[qtype] = NULL;
+ kfree(qname);
return 0;
}
+ sbi->s_qf_names[qtype] = qname;
set_opt(sbi->s_mount_opt, QUOTA);
return 1;
}
@@ -945,11 +948,10 @@ static int clear_qf_name(struct super_block *sb, int qtype) {
" when quota turned on");
return 0;
}
- /*
- * The space will be released later when all options are confirmed
- * to be correct
- */
- sbi->s_qf_names[qtype] = NULL;
+ if (sbi->s_qf_names[qtype]) {
+ kfree(sbi->s_qf_names[qtype]);
+ sbi->s_qf_names[qtype] = NULL;
+ }
return 1;
}
#endif
@@ -2605,7 +2607,18 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
#ifdef CONFIG_QUOTA
old_opts.s_jquota_fmt = sbi->s_jquota_fmt;
for (i = 0; i < MAXQUOTAS; i++)
- old_opts.s_qf_names[i] = sbi->s_qf_names[i];
+ if (sbi->s_qf_names[i]) {
+ old_opts.s_qf_names[i] = kstrdup(sbi->s_qf_names[i],
+ GFP_KERNEL);
+ if (!old_opts.s_qf_names[i]) {
+ int j;
+
+ for (j = 0; j < i; j++)
+ kfree(old_opts.s_qf_names[j]);
+ return -ENOMEM;
+ }
+ } else
+ old_opts.s_qf_names[i] = NULL;
#endif
/*
@@ -2698,9 +2711,7 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
#ifdef CONFIG_QUOTA
/* Release old quota file names */
for (i = 0; i < MAXQUOTAS; i++)
- if (old_opts.s_qf_names[i] &&
- old_opts.s_qf_names[i] != sbi->s_qf_names[i])
- kfree(old_opts.s_qf_names[i]);
+ kfree(old_opts.s_qf_names[i]);
#endif
if (enable_quota)
dquot_resume(sb, -1);
@@ -2714,9 +2725,7 @@ restore_opts:
#ifdef CONFIG_QUOTA
sbi->s_jquota_fmt = old_opts.s_jquota_fmt;
for (i = 0; i < MAXQUOTAS; i++) {
- if (sbi->s_qf_names[i] &&
- old_opts.s_qf_names[i] != sbi->s_qf_names[i])
- kfree(sbi->s_qf_names[i]);
+ kfree(sbi->s_qf_names[i]);
sbi->s_qf_names[i] = old_opts.s_qf_names[i];
}
#endif
diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c
index d22ebb7a4f55..b1fc96383e08 100644
--- a/fs/ext3/xattr.c
+++ b/fs/ext3/xattr.c
@@ -813,10 +813,10 @@ inserted:
ea_idebug(inode, "creating block %d", block);
new_bh = sb_getblk(sb, block);
- if (!new_bh) {
+ if (unlikely(!new_bh)) {
getblk_failed:
ext3_free_blocks(handle, inode, block, 1);
- error = -EIO;
+ error = -ENOMEM;
goto cleanup;
}
lock_buffer(new_bh);
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index cf1821784a16..33938c120c85 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -358,7 +358,7 @@ void ext4_validate_block_bitmap(struct super_block *sb,
}
/**
- * ext4_read_block_bitmap()
+ * ext4_read_block_bitmap_nowait()
* @sb: super block
* @block_group: given block group
*
@@ -457,6 +457,8 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
struct buffer_head *bh;
bh = ext4_read_block_bitmap_nowait(sb, block_group);
+ if (!bh)
+ return NULL;
if (ext4_wait_block_bitmap(sb, block_group, bh)) {
put_bh(bh);
return NULL;
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 80a28b297279..3882fbc5e215 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -185,6 +185,7 @@ static int ext4_readdir(struct file *filp,
"at offset %llu",
(unsigned long long)filp->f_pos);
filp->f_pos += sb->s_blocksize - offset;
+ brelse(bh);
continue;
}
set_buffer_verified(bh);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 8462eb3c33aa..829cba9bae60 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -155,17 +155,9 @@ struct ext4_allocation_request {
#define EXT4_MAP_UNWRITTEN (1 << BH_Unwritten)
#define EXT4_MAP_BOUNDARY (1 << BH_Boundary)
#define EXT4_MAP_UNINIT (1 << BH_Uninit)
-/* Sometimes (in the bigalloc case, from ext4_da_get_block_prep) the caller of
- * ext4_map_blocks wants to know whether or not the underlying cluster has
- * already been accounted for. EXT4_MAP_FROM_CLUSTER conveys to the caller that
- * the requested mapping was from previously mapped (or delayed allocated)
- * cluster. We use BH_AllocFromCluster only for this flag. BH_AllocFromCluster
- * should never appear on buffer_head's state flags.
- */
-#define EXT4_MAP_FROM_CLUSTER (1 << BH_AllocFromCluster)
#define EXT4_MAP_FLAGS (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\
EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY |\
- EXT4_MAP_UNINIT | EXT4_MAP_FROM_CLUSTER)
+ EXT4_MAP_UNINIT)
struct ext4_map_blocks {
ext4_fsblk_t m_pblk;
@@ -194,8 +186,7 @@ struct mpage_da_data {
*/
#define EXT4_IO_END_UNWRITTEN 0x0001
#define EXT4_IO_END_ERROR 0x0002
-#define EXT4_IO_END_QUEUED 0x0004
-#define EXT4_IO_END_DIRECT 0x0008
+#define EXT4_IO_END_DIRECT 0x0004
struct ext4_io_page {
struct page *p_page;
@@ -215,10 +206,8 @@ typedef struct ext4_io_end {
struct list_head list; /* per-file finished IO list */
struct inode *inode; /* file being written to */
unsigned int flag; /* unwritten or not */
- struct page *page; /* for writepage() path */
loff_t offset; /* offset in the file */
ssize_t size; /* size of the extent */
- struct work_struct work; /* data work queue */
struct kiocb *iocb; /* iocb struct for AIO */
int result; /* error value for AIO */
int num_io_pages; /* for writepages() */
@@ -810,17 +799,6 @@ do { \
#endif /* defined(__KERNEL__) || defined(__linux__) */
-/*
- * storage for cached extent
- * If ec_len == 0, then the cache is invalid.
- * If ec_start == 0, then the cache represents a gap (null mapping)
- */
-struct ext4_ext_cache {
- ext4_fsblk_t ec_start;
- ext4_lblk_t ec_block;
- __u32 ec_len; /* must be 32bit to return holes */
-};
-
#include "extents_status.h"
/*
@@ -887,7 +865,6 @@ struct ext4_inode_info {
struct inode vfs_inode;
struct jbd2_inode *jinode;
- struct ext4_ext_cache i_cached_extent;
/*
* File creation time. Its function is same as that of
* struct timespec i_{a,c,m}time in the generic inode.
@@ -901,6 +878,8 @@ struct ext4_inode_info {
/* extents status tree */
struct ext4_es_tree i_es_tree;
rwlock_t i_es_lock;
+ struct list_head i_es_lru;
+ unsigned int i_es_lru_nr; /* protected by i_es_lock */
/* ialloc */
ext4_group_t i_last_alloc_group;
@@ -930,6 +909,7 @@ struct ext4_inode_info {
spinlock_t i_completed_io_lock;
atomic_t i_ioend_count; /* Number of outstanding io_end structs */
atomic_t i_unwritten; /* Nr. of inflight conversions pending */
+ struct work_struct i_unwritten_work; /* deferred extent conversion */
spinlock_t i_block_reservation_lock;
@@ -985,7 +965,6 @@ struct ext4_inode_info {
#define EXT4_MOUNT_DIOREAD_NOLOCK 0x400000 /* Enable support for dio read nolocking */
#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */
#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */
-#define EXT4_MOUNT_MBLK_IO_SUBMIT 0x4000000 /* multi-block io submits */
#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */
#define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */
#define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */
@@ -1316,6 +1295,11 @@ struct ext4_sb_info {
/* Precomputed FS UUID checksum for seeding other checksums */
__u32 s_csum_seed;
+
+ /* Reclaim extents from extent status tree */
+ struct shrinker s_es_shrinker;
+ struct list_head s_es_lru;
+ spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp;
};
static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
@@ -2103,6 +2087,7 @@ extern ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock);
extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk);
extern void ext4_ind_truncate(struct inode *inode);
+extern int ext4_ind_punch_hole(struct file *file, loff_t offset, loff_t length);
/* ioctl.c */
extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
@@ -2227,6 +2212,8 @@ extern int ext4_group_desc_csum_verify(struct super_block *sb, __u32 group,
struct ext4_group_desc *gdp);
extern void ext4_group_desc_csum_set(struct super_block *sb, __u32 group,
struct ext4_group_desc *gdp);
+extern int ext4_register_li_request(struct super_block *sb,
+ ext4_group_t first_not_zeroed);
static inline int ext4_has_group_desc_csum(struct super_block *sb)
{
@@ -2537,6 +2524,7 @@ extern void ext4_exit_pageio(void);
extern void ext4_ioend_wait(struct inode *);
extern void ext4_free_io_end(ext4_io_end_t *io);
extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags);
+extern void ext4_end_io_work(struct work_struct *work);
extern void ext4_io_submit(struct ext4_io_submit *io);
extern int ext4_bio_write_page(struct ext4_io_submit *io,
struct page *page,
@@ -2553,11 +2541,6 @@ extern int ext4_mmp_csum_verify(struct super_block *sb,
enum ext4_state_bits {
BH_Uninit /* blocks are allocated but uninitialized on disk */
= BH_JBDPrivateStart,
- BH_AllocFromCluster, /* allocated blocks were part of already
- * allocated cluster. Note that this flag will
- * never, ever appear in a buffer_head's state
- * flag. See EXT4_MAP_FROM_CLUSTER to see where
- * this is used. */
};
BUFFER_FNS(Uninit, uninit)
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 487fda12bc00..8643ff5bbeb7 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -193,12 +193,6 @@ static inline unsigned short ext_depth(struct inode *inode)
return le16_to_cpu(ext_inode_hdr(inode)->eh_depth);
}
-static inline void
-ext4_ext_invalidate_cache(struct inode *inode)
-{
- EXT4_I(inode)->i_cached_extent.ec_len = 0;
-}
-
static inline void ext4_ext_mark_uninitialized(struct ext4_extent *ext)
{
/* We can not have an uninitialized extent of zero length! */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 5ae1674ec12f..8733addc0069 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -54,9 +54,6 @@
#define EXT4_EXT_MARK_UNINIT1 0x2 /* mark first half uninitialized */
#define EXT4_EXT_MARK_UNINIT2 0x4 /* mark second half uninitialized */
-#define EXT4_EXT_DATA_VALID1 0x8 /* first half contains valid data */
-#define EXT4_EXT_DATA_VALID2 0x10 /* second half contains valid data */
-
static __le32 ext4_extent_block_csum(struct inode *inode,
struct ext4_extent_header *eh)
{
@@ -112,7 +109,7 @@ static int ext4_split_extent_at(handle_t *handle,
int flags);
static int ext4_find_delayed_extent(struct inode *inode,
- struct ext4_ext_cache *newex);
+ struct extent_status *newes);
static int ext4_ext_truncate_extend_restart(handle_t *handle,
struct inode *inode,
@@ -714,7 +711,6 @@ int ext4_ext_tree_init(handle_t *handle, struct inode *inode)
eh->eh_magic = EXT4_EXT_MAGIC;
eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0));
ext4_mark_inode_dirty(handle, inode);
- ext4_ext_invalidate_cache(inode);
return 0;
}
@@ -725,6 +721,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
struct ext4_extent_header *eh;
struct buffer_head *bh;
short int depth, i, ppos = 0, alloc = 0;
+ int ret;
eh = ext_inode_hdr(inode);
depth = ext_depth(inode);
@@ -752,12 +749,15 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
path[ppos].p_ext = NULL;
bh = sb_getblk(inode->i_sb, path[ppos].p_block);
- if (unlikely(!bh))
+ if (unlikely(!bh)) {
+ ret = -ENOMEM;
goto err;
+ }
if (!bh_uptodate_or_lock(bh)) {
trace_ext4_ext_load_extent(inode, block,
path[ppos].p_block);
- if (bh_submit_read(bh) < 0) {
+ ret = bh_submit_read(bh);
+ if (ret < 0) {
put_bh(bh);
goto err;
}
@@ -768,13 +768,15 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
put_bh(bh);
EXT4_ERROR_INODE(inode,
"ppos %d > depth %d", ppos, depth);
+ ret = -EIO;
goto err;
}
path[ppos].p_bh = bh;
path[ppos].p_hdr = eh;
i--;
- if (ext4_ext_check_block(inode, eh, i, bh))
+ ret = ext4_ext_check_block(inode, eh, i, bh);
+ if (ret < 0)
goto err;
}
@@ -796,7 +798,7 @@ err:
ext4_ext_drop_refs(path);
if (alloc)
kfree(path);
- return ERR_PTR(-EIO);
+ return ERR_PTR(ret);
}
/*
@@ -950,8 +952,8 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
goto cleanup;
}
bh = sb_getblk(inode->i_sb, newblock);
- if (!bh) {
- err = -EIO;
+ if (unlikely(!bh)) {
+ err = -ENOMEM;
goto cleanup;
}
lock_buffer(bh);
@@ -1023,8 +1025,8 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
oldblock = newblock;
newblock = ablocks[--a];
bh = sb_getblk(inode->i_sb, newblock);
- if (!bh) {
- err = -EIO;
+ if (unlikely(!bh)) {
+ err = -ENOMEM;
goto cleanup;
}
lock_buffer(bh);
@@ -1136,11 +1138,8 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
return err;
bh = sb_getblk(inode->i_sb, newblock);
- if (!bh) {
- err = -EIO;
- ext4_std_error(inode->i_sb, err);
- return err;
- }
+ if (unlikely(!bh))
+ return -ENOMEM;
lock_buffer(bh);
err = ext4_journal_get_create_access(handle, bh);
@@ -1579,20 +1578,17 @@ int
ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
struct ext4_extent *ex2)
{
- unsigned short ext1_ee_len, ext2_ee_len, max_len;
+ unsigned ext1_ee_len, ext2_ee_len;
/*
- * Make sure that either both extents are uninitialized, or
- * both are _not_.
+ * Make sure that both extents are initialized. We don't merge
+ * uninitialized extents so that we can be sure that end_io code has
+ * the extent that was written properly split out and conversion to
+ * initialized is trivial.
*/
- if (ext4_ext_is_uninitialized(ex1) ^ ext4_ext_is_uninitialized(ex2))
+ if (ext4_ext_is_uninitialized(ex1) || ext4_ext_is_uninitialized(ex2))
return 0;
- if (ext4_ext_is_uninitialized(ex1))
- max_len = EXT_UNINIT_MAX_LEN;
- else
- max_len = EXT_INIT_MAX_LEN;
-
ext1_ee_len = ext4_ext_get_actual_len(ex1);
ext2_ee_len = ext4_ext_get_actual_len(ex2);
@@ -1605,7 +1601,7 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
* as an RO_COMPAT feature, refuse to merge to extents if
* this can result in the top bit of ee_len being set.
*/
- if (ext1_ee_len + ext2_ee_len > max_len)
+ if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN)
return 0;
#ifdef AGGRESSIVE_TEST
if (ext1_ee_len >= 4)
@@ -1960,7 +1956,6 @@ cleanup:
ext4_ext_drop_refs(npath);
kfree(npath);
}
- ext4_ext_invalidate_cache(inode);
return err;
}
@@ -1969,8 +1964,8 @@ static int ext4_fill_fiemap_extents(struct inode *inode,
struct fiemap_extent_info *fieinfo)
{
struct ext4_ext_path *path = NULL;
- struct ext4_ext_cache newex;
struct ext4_extent *ex;
+ struct extent_status es;
ext4_lblk_t next, next_del, start = 0, end = 0;
ext4_lblk_t last = block + num;
int exists, depth = 0, err = 0;
@@ -2044,37 +2039,37 @@ static int ext4_fill_fiemap_extents(struct inode *inode,
BUG_ON(end <= start);
if (!exists) {
- newex.ec_block = start;
- newex.ec_len = end - start;
- newex.ec_start = 0;
+ es.es_lblk = start;
+ es.es_len = end - start;
+ es.es_pblk = 0;
} else {
- newex.ec_block = le32_to_cpu(ex->ee_block);
- newex.ec_len = ext4_ext_get_actual_len(ex);
- newex.ec_start = ext4_ext_pblock(ex);
+ es.es_lblk = le32_to_cpu(ex->ee_block);
+ es.es_len = ext4_ext_get_actual_len(ex);
+ es.es_pblk = ext4_ext_pblock(ex);
if (ext4_ext_is_uninitialized(ex))
flags |= FIEMAP_EXTENT_UNWRITTEN;
}
/*
- * Find delayed extent and update newex accordingly. We call
- * it even in !exists case to find out whether newex is the
+ * Find delayed extent and update es accordingly. We call
+ * it even in !exists case to find out whether es is the
* last existing extent or not.
*/
- next_del = ext4_find_delayed_extent(inode, &newex);
+ next_del = ext4_find_delayed_extent(inode, &es);
if (!exists && next_del) {
exists = 1;
flags |= FIEMAP_EXTENT_DELALLOC;
}
up_read(&EXT4_I(inode)->i_data_sem);
- if (unlikely(newex.ec_len == 0)) {
- EXT4_ERROR_INODE(inode, "newex.ec_len == 0");
+ if (unlikely(es.es_len == 0)) {
+ EXT4_ERROR_INODE(inode, "es.es_len == 0");
err = -EIO;
break;
}
/* This is possible iff next == next_del == EXT_MAX_BLOCKS */
- if (next == next_del) {
+ if (next == next_del && next_del == EXT_MAX_BLOCKS) {
flags |= FIEMAP_EXTENT_LAST;
if (unlikely(next_del != EXT_MAX_BLOCKS ||
next != EXT_MAX_BLOCKS)) {
@@ -2089,9 +2084,9 @@ static int ext4_fill_fiemap_extents(struct inode *inode,
if (exists) {
err = fiemap_fill_next_extent(fieinfo,
- (__u64)newex.ec_block << blksize_bits,
- (__u64)newex.ec_start << blksize_bits,
- (__u64)newex.ec_len << blksize_bits,
+ (__u64)es.es_lblk << blksize_bits,
+ (__u64)es.es_pblk << blksize_bits,
+ (__u64)es.es_len << blksize_bits,
flags);
if (err < 0)
break;
@@ -2101,7 +2096,7 @@ static int ext4_fill_fiemap_extents(struct inode *inode,
}
}
- block = newex.ec_block + newex.ec_len;
+ block = es.es_lblk + es.es_len;
}
if (path) {
@@ -2112,115 +2107,6 @@ static int ext4_fill_fiemap_extents(struct inode *inode,
return err;
}
-static void
-ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block,
- __u32 len, ext4_fsblk_t start)
-{
- struct ext4_ext_cache *cex;
- BUG_ON(len == 0);
- spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
- trace_ext4_ext_put_in_cache(inode, block, len, start);
- cex = &EXT4_I(inode)->i_cached_extent;
- cex->ec_block = block;
- cex->ec_len = len;
- cex->ec_start = start;
- spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
-}
-
-/*
- * ext4_ext_put_gap_in_cache:
- * calculate boundaries of the gap that the requested block fits into
- * and cache this gap
- */
-static void
-ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
- ext4_lblk_t block)
-{
- int depth = ext_depth(inode);
- unsigned long len;
- ext4_lblk_t lblock;
- struct ext4_extent *ex;
-
- ex = path[depth].p_ext;
- if (ex == NULL) {
- /* there is no extent yet, so gap is [0;-] */
- lblock = 0;
- len = EXT_MAX_BLOCKS;
- ext_debug("cache gap(whole file):");
- } else if (block < le32_to_cpu(ex->ee_block)) {
- lblock = block;
- len = le32_to_cpu(ex->ee_block) - block;
- ext_debug("cache gap(before): %u [%u:%u]",
- block,
- le32_to_cpu(ex->ee_block),
- ext4_ext_get_actual_len(ex));
- } else if (block >= le32_to_cpu(ex->ee_block)
- + ext4_ext_get_actual_len(ex)) {
- ext4_lblk_t next;
- lblock = le32_to_cpu(ex->ee_block)
- + ext4_ext_get_actual_len(ex);
-
- next = ext4_ext_next_allocated_block(path);
- ext_debug("cache gap(after): [%u:%u] %u",
- le32_to_cpu(ex->ee_block),
- ext4_ext_get_actual_len(ex),
- block);
- BUG_ON(next == lblock);
- len = next - lblock;
- } else {
- lblock = len = 0;
- BUG();
- }
-
- ext_debug(" -> %u:%lu\n", lblock, len);
- ext4_ext_put_in_cache(inode, lblock, len, 0);
-}
-
-/*
- * ext4_ext_in_cache()
- * Checks to see if the given block is in the cache.
- * If it is, the cached extent is stored in the given
- * cache extent pointer.
- *
- * @inode: The files inode
- * @block: The block to look for in the cache
- * @ex: Pointer where the cached extent will be stored
- * if it contains block
- *
- * Return 0 if cache is invalid; 1 if the cache is valid
- */
-static int
-ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
- struct ext4_extent *ex)
-{
- struct ext4_ext_cache *cex;
- int ret = 0;
-
- /*
- * We borrow i_block_reservation_lock to protect i_cached_extent
- */
- spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
- cex = &EXT4_I(inode)->i_cached_extent;
-
- /* has cache valid data? */
- if (cex->ec_len == 0)
- goto errout;
-
- if (in_range(block, cex->ec_block, cex->ec_len)) {
- ex->ee_block = cpu_to_le32(cex->ec_block);
- ext4_ext_store_pblock(ex, cex->ec_start);
- ex->ee_len = cpu_to_le16(cex->ec_len);
- ext_debug("%u cached by %u:%u:%llu\n",
- block,
- cex->ec_block, cex->ec_len, cex->ec_start);
- ret = 1;
- }
-errout:
- trace_ext4_ext_in_cache(inode, block, ret);
- spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
- return ret;
-}
-
/*
* ext4_ext_rm_idx:
* removes index from the index block.
@@ -2658,8 +2544,6 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
return PTR_ERR(handle);
again:
- ext4_ext_invalidate_cache(inode);
-
trace_ext4_ext_remove_space(inode, start, depth);
/*
@@ -2973,9 +2857,6 @@ static int ext4_split_extent_at(handle_t *handle,
unsigned int ee_len, depth;
int err = 0;
- BUG_ON((split_flag & (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2)) ==
- (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2));
-
ext_debug("ext4_split_extents_at: inode %lu, logical"
"block %llu\n", inode->i_ino, (unsigned long long)split);
@@ -2988,6 +2869,10 @@ static int ext4_split_extent_at(handle_t *handle,
newblock = split - ee_block + ext4_ext_pblock(ex);
BUG_ON(split < ee_block || split >= (ee_block + ee_len));
+ BUG_ON(!ext4_ext_is_uninitialized(ex) &&
+ split_flag & (EXT4_EXT_MAY_ZEROOUT |
+ EXT4_EXT_MARK_UNINIT1 |
+ EXT4_EXT_MARK_UNINIT2));
err = ext4_ext_get_access(handle, inode, path + depth);
if (err)
@@ -3034,14 +2919,7 @@ static int ext4_split_extent_at(handle_t *handle,
err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
- if (split_flag & (EXT4_EXT_DATA_VALID1|EXT4_EXT_DATA_VALID2)) {
- if (split_flag & EXT4_EXT_DATA_VALID1)
- err = ext4_ext_zeroout(inode, ex2);
- else
- err = ext4_ext_zeroout(inode, ex);
- } else
- err = ext4_ext_zeroout(inode, &orig_ex);
-
+ err = ext4_ext_zeroout(inode, &orig_ex);
if (err)
goto fix_extent_len;
/* update the extent length and mark as initialized */
@@ -3099,26 +2977,30 @@ static int ext4_split_extent(handle_t *handle,
if (uninitialized)
split_flag1 |= EXT4_EXT_MARK_UNINIT1 |
EXT4_EXT_MARK_UNINIT2;
- if (split_flag & EXT4_EXT_DATA_VALID2)
- split_flag1 |= EXT4_EXT_DATA_VALID1;
err = ext4_split_extent_at(handle, inode, path,
map->m_lblk + map->m_len, split_flag1, flags1);
if (err)
goto out;
}
-
+ /*
+ * Update path is required because previous ext4_split_extent_at() may
+ * result in split of original leaf or extent zeroout.
+ */
ext4_ext_drop_refs(path);
path = ext4_ext_find_extent(inode, map->m_lblk, path);
if (IS_ERR(path))
return PTR_ERR(path);
+ depth = ext_depth(inode);
+ ex = path[depth].p_ext;
+ uninitialized = ext4_ext_is_uninitialized(ex);
+ split_flag1 = 0;
if (map->m_lblk >= ee_block) {
- split_flag1 = split_flag & (EXT4_EXT_MAY_ZEROOUT |
- EXT4_EXT_DATA_VALID2);
- if (uninitialized)
- split_flag1 |= EXT4_EXT_MARK_UNINIT1;
- if (split_flag & EXT4_EXT_MARK_UNINIT2)
- split_flag1 |= EXT4_EXT_MARK_UNINIT2;
+ if (uninitialized) {
+ split_flag1 = EXT4_EXT_MARK_UNINIT1;
+ split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT |
+ EXT4_EXT_MARK_UNINIT2);
+ }
err = ext4_split_extent_at(handle, inode, path,
map->m_lblk, split_flag1, flags);
if (err)
@@ -3393,8 +3275,7 @@ static int ext4_split_unwritten_extents(handle_t *handle,
split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
split_flag |= EXT4_EXT_MARK_UNINIT2;
- if (flags & EXT4_GET_BLOCKS_CONVERT)
- split_flag |= EXT4_EXT_DATA_VALID2;
+
flags |= EXT4_GET_BLOCKS_PRE_IO;
return ext4_split_extent(handle, inode, path, map, split_flag, flags);
}
@@ -3419,20 +3300,15 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle,
"block %llu, max_blocks %u\n", inode->i_ino,
(unsigned long long)ee_block, ee_len);
- /* If extent is larger than requested then split is required */
+ /* Extent is larger than requested? */
if (ee_block != map->m_lblk || ee_len > map->m_len) {
- err = ext4_split_unwritten_extents(handle, inode, map, path,
- EXT4_GET_BLOCKS_CONVERT);
- if (err < 0)
- goto out;
- ext4_ext_drop_refs(path);
- path = ext4_ext_find_extent(inode, map->m_lblk, path);
- if (IS_ERR(path)) {
- err = PTR_ERR(path);
- goto out;
- }
- depth = ext_depth(inode);
- ex = path[depth].p_ext;
+ EXT4_ERROR_INODE(inode, "Written extent modified before IO"
+ " finished: extent logical block %llu, len %u; IO"
+ " logical block %llu, len %u\n",
+ (unsigned long long)ee_block, ee_len,
+ (unsigned long long)map->m_lblk, map->m_len);
+ err = -EIO;
+ goto out;
}
err = ext4_ext_get_access(handle, inode, path + depth);
@@ -3525,13 +3401,14 @@ static int ext4_find_delalloc_range(struct inode *inode,
{
struct extent_status es;
- es.start = lblk_start;
- ext4_es_find_extent(inode, &es);
- if (es.len == 0)
+ es.es_lblk = lblk_start;
+ (void)ext4_es_find_extent(inode, &es);
+ if (es.es_len == 0)
return 0; /* there is no delay extent in this tree */
- else if (es.start <= lblk_start && lblk_start < es.start + es.len)
+ else if (es.es_lblk <= lblk_start &&
+ lblk_start < es.es_lblk + es.es_len)
return 1;
- else if (lblk_start <= es.start && es.start <= lblk_end)
+ else if (lblk_start <= es.es_lblk && es.es_lblk <= lblk_end)
return 1;
else
return 0;
@@ -3893,40 +3770,12 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
ext4_io_end_t *io = ext4_inode_aio(inode);
ext4_lblk_t cluster_offset;
int set_unwritten = 0;
+ int from_cluster = 0;
ext_debug("blocks %u/%u requested for inode %lu\n",
map->m_lblk, map->m_len, inode->i_ino);
trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
- /* check in cache */
- if (ext4_ext_in_cache(inode, map->m_lblk, &newex)) {
- if (!newex.ee_start_lo && !newex.ee_start_hi) {
- if ((sbi->s_cluster_ratio > 1) &&
- ext4_find_delalloc_cluster(inode, map->m_lblk))
- map->m_flags |= EXT4_MAP_FROM_CLUSTER;
-
- if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
- /*
- * block isn't allocated yet and
- * user doesn't want to allocate it
- */
- goto out2;
- }
- /* we should allocate requested block */
- } else {
- /* block is already allocated */
- if (sbi->s_cluster_ratio > 1)
- map->m_flags |= EXT4_MAP_FROM_CLUSTER;
- newblock = map->m_lblk
- - le32_to_cpu(newex.ee_block)
- + ext4_ext_pblock(&newex);
- /* number of remaining blocks in the extent */
- allocated = ext4_ext_get_actual_len(&newex) -
- (map->m_lblk - le32_to_cpu(newex.ee_block));
- goto out;
- }
- }
-
/* find extent for this block */
path = ext4_ext_find_extent(inode, map->m_lblk, NULL);
if (IS_ERR(path)) {
@@ -3973,15 +3822,9 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk,
ee_block, ee_len, newblock);
- /*
- * Do not put uninitialized extent
- * in the cache
- */
- if (!ext4_ext_is_uninitialized(ex)) {
- ext4_ext_put_in_cache(inode, ee_block,
- ee_len, ee_start);
+ if (!ext4_ext_is_uninitialized(ex))
goto out;
- }
+
allocated = ext4_ext_handle_uninitialized_extents(
handle, inode, map, path, flags,
allocated, newblock);
@@ -3989,27 +3832,16 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
}
}
- if ((sbi->s_cluster_ratio > 1) &&
- ext4_find_delalloc_cluster(inode, map->m_lblk))
- map->m_flags |= EXT4_MAP_FROM_CLUSTER;
-
/*
* requested block isn't allocated yet;
* we couldn't try to create block if create flag is zero
*/
- if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
- /*
- * put just found gap into cache to speed up
- * subsequent requests
- */
- ext4_ext_put_gap_in_cache(inode, path, map->m_lblk);
+ if ((flags & EXT4_GET_BLOCKS_CREATE) == 0)
goto out2;
- }
/*
* Okay, we need to do block allocation.
*/
- map->m_flags &= ~EXT4_MAP_FROM_CLUSTER;
newex.ee_block = cpu_to_le32(map->m_lblk);
cluster_offset = map->m_lblk & (sbi->s_cluster_ratio-1);
@@ -4021,7 +3853,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
get_implied_cluster_alloc(inode->i_sb, map, ex, path)) {
ar.len = allocated = map->m_len;
newblock = map->m_pblk;
- map->m_flags |= EXT4_MAP_FROM_CLUSTER;
+ from_cluster = 1;
goto got_allocated_blocks;
}
@@ -4042,7 +3874,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
get_implied_cluster_alloc(inode->i_sb, map, ex2, path)) {
ar.len = allocated = map->m_len;
newblock = map->m_pblk;
- map->m_flags |= EXT4_MAP_FROM_CLUSTER;
+ from_cluster = 1;
goto got_allocated_blocks;
}
@@ -4167,7 +3999,7 @@ got_allocated_blocks:
*/
reserved_clusters = get_reserved_cluster_alloc(inode,
map->m_lblk, allocated);
- if (map->m_flags & EXT4_MAP_FROM_CLUSTER) {
+ if (from_cluster) {
if (reserved_clusters) {
/*
* We have clusters reserved for this range.
@@ -4241,10 +4073,9 @@ got_allocated_blocks:
* Cache the extent and update transaction to commit on fdatasync only
* when it is _not_ an uninitialized extent.
*/
- if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) {
- ext4_ext_put_in_cache(inode, map->m_lblk, allocated, newblock);
+ if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0)
ext4_update_inode_fsync_trans(handle, inode, 1);
- } else
+ else
ext4_update_inode_fsync_trans(handle, inode, 0);
out:
if (allocated > map->m_len)
@@ -4303,7 +4134,6 @@ void ext4_ext_truncate(struct inode *inode)
goto out_stop;
down_write(&EXT4_I(inode)->i_data_sem);
- ext4_ext_invalidate_cache(inode);
ext4_discard_preallocations(inode);
@@ -4397,13 +4227,6 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
struct ext4_map_blocks map;
unsigned int credits, blkbits = inode->i_blkbits;
- /*
- * currently supporting (pre)allocate mode for extent-based
- * files _only_
- */
- if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
- return -EOPNOTSUPP;
-
/* Return error if mode is not supported */
if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
return -EOPNOTSUPP;
@@ -4415,6 +4238,13 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
if (ret)
return ret;
+ /*
+ * currently supporting (pre)allocate mode for extent-based
+ * files _only_
+ */
+ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
+ return -EOPNOTSUPP;
+
trace_ext4_fallocate_enter(inode, offset, len, mode);
map.m_lblk = offset >> blkbits;
/*
@@ -4445,8 +4275,6 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
if (len <= EXT_UNINIT_MAX_LEN << blkbits)
flags |= EXT4_GET_BLOCKS_NO_NORMALIZE;
- /* Prevent race condition between unwritten */
- ext4_flush_unwritten_io(inode);
retry:
while (ret >= 0 && ret < max_blocks) {
map.m_lblk = map.m_lblk + ret;
@@ -4459,11 +4287,11 @@ retry:
ret = ext4_map_blocks(handle, inode, &map, flags);
if (ret <= 0) {
#ifdef EXT4FS_DEBUG
- WARN_ON(ret <= 0);
- printk(KERN_ERR "%s: ext4_ext_map_blocks "
- "returned error inode#%lu, block=%u, "
- "max_blocks=%u", __func__,
- inode->i_ino, map.m_lblk, max_blocks);
+ ext4_warning(inode->i_sb,
+ "inode #%lu: block %u: len %u: "
+ "ext4_ext_map_blocks returned %d",
+ inode->i_ino, map.m_lblk,
+ map.m_len, ret);
#endif
ext4_mark_inode_dirty(handle, inode);
ret2 = ext4_journal_stop(handle);
@@ -4536,14 +4364,12 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
}
ret = ext4_map_blocks(handle, inode, &map,
EXT4_GET_BLOCKS_IO_CONVERT_EXT);
- if (ret <= 0) {
- WARN_ON(ret <= 0);
- ext4_msg(inode->i_sb, KERN_ERR,
- "%s:%d: inode #%lu: block %u: len %u: "
- "ext4_ext_map_blocks returned %d",
- __func__, __LINE__, inode->i_ino, map.m_lblk,
- map.m_len, ret);
- }
+ if (ret <= 0)
+ ext4_warning(inode->i_sb,
+ "inode #%lu: block %u: len %u: "
+ "ext4_ext_map_blocks returned %d",
+ inode->i_ino, map.m_lblk,
+ map.m_len, ret);
ext4_mark_inode_dirty(handle, inode);
ret2 = ext4_journal_stop(handle);
if (ret <= 0 || ret2 )
@@ -4553,40 +4379,43 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
}
/*
- * If newex is not existing extent (newex->ec_start equals zero) find
- * delayed extent at start of newex and update newex accordingly and
+ * If newes is not an existing extent (newes->es_pblk equals zero), find the
+ * delayed extent at the start of newes, update newes accordingly, and
* return start of the next delayed extent.
*
- * If newex is existing extent (newex->ec_start is not equal zero)
+ * If newes is an existing extent (newes->es_pblk is not equal to zero),
* return start of next delayed extent or EXT_MAX_BLOCKS if no delayed
- * extent found. Leave newex unmodified.
+ * extent found. Leave newes unmodified.
*/
static int ext4_find_delayed_extent(struct inode *inode,
- struct ext4_ext_cache *newex)
+ struct extent_status *newes)
{
struct extent_status es;
ext4_lblk_t next_del;
- es.start = newex->ec_block;
+ es.es_lblk = newes->es_lblk;
next_del = ext4_es_find_extent(inode, &es);
- if (newex->ec_start == 0) {
+ if (newes->es_pblk == 0) {
/*
- * No extent in extent-tree contains block @newex->ec_start,
+ * No extent in extent-tree contains block @newes->es_lblk,
* then the block may stay in 1)a hole or 2)delayed-extent.
*/
- if (es.len == 0)
+ if (es.es_len == 0)
/* A hole found. */
return 0;
- if (es.start > newex->ec_block) {
+ if (!ext4_es_is_delayed(&es))
+ return 0;
+
+ if (es.es_lblk > newes->es_lblk) {
/* A hole found. */
- newex->ec_len = min(es.start - newex->ec_block,
- newex->ec_len);
+ newes->es_len = min(es.es_lblk - newes->es_lblk,
+ newes->es_len);
return 0;
}
- newex->ec_len = es.start + es.len - newex->ec_block;
+ newes->es_len = es.es_lblk + es.es_len - newes->es_lblk;
}
return next_del;
@@ -4786,14 +4615,12 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
goto out;
down_write(&EXT4_I(inode)->i_data_sem);
- ext4_ext_invalidate_cache(inode);
ext4_discard_preallocations(inode);
err = ext4_es_remove_extent(inode, first_block,
stop_block - first_block);
err = ext4_ext_remove_space(inode, first_block, stop_block - 1);
- ext4_ext_invalidate_cache(inode);
ext4_discard_preallocations(inode);
if (IS_SYNC(inode))
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 564d981a2fcc..0e95ec31d9f5 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -23,40 +23,53 @@
* (e.g. Reservation space warning), and provide extent-level locking.
* Delay extent tree is the first step to achieve this goal. It is
* original built by Yongqiang Yang. At that time it is called delay
- * extent tree, whose goal is only track delay extent in memory to
+ * extent tree, whose goal is only to track delayed extents in memory to
* simplify the implementation of fiemap and bigalloc, and introduce
* lseek SEEK_DATA/SEEK_HOLE support. That is why it is still called
- * delay extent tree at the following comment. But for better
- * understand what it does, it has been rename to extent status tree.
+ * delay extent tree at the first commit. But to better reflect
+ * what it does, it has been renamed to extent status tree.
*
- * Currently the first step has been done. All delay extents are
- * tracked in the tree. It maintains the delay extent when a delay
- * allocation is issued, and the delay extent is written out or
+ * Step1:
+ * Currently the first step has been done. All delayed extents are
+ * tracked in the tree. It maintains the delayed extent when a delayed
+ * allocation is issued, and the delayed extent is written out or
* invalidated. Therefore the implementation of fiemap and bigalloc
* are simplified, and SEEK_DATA/SEEK_HOLE are introduced.
*
* The following comment describes the implemenmtation of extent
* status tree and future works.
+ *
+ * Step2:
+ * In this step all extent status is tracked by extent status tree.
+ * Thus, we can first try to lookup a block mapping in this tree before
+ * finding it in extent tree. Hence, single extent cache can be removed
+ * because extent status tree can do a better job. Extents in status
+ * tree are loaded on-demand. Therefore, the extent status tree may not
+ * contain all of the extents in a file. Meanwhile we add
+ * nr_cached_objects and free_cached_objects callback functions to
+ * reclaim extents from extent status tree. These functions let us
+ * reclaim written/unwritten extents from the tree under heavy memory
+ * pressure. Delayed extents will not be reclaimed because fiemap,
+ * bigalloc, and seek_data/hole need them.
*/
/*
- * extents status tree implementation for ext4.
+ * Extent status tree implementation for ext4.
*
*
* ==========================================================================
- * Extents status encompass delayed extents and extent locks
+ * Extent status tree tracks all extent status.
*
- * 1. Why delayed extent implementation ?
+ * 1. Why do we need to implement the extent status tree?
*
- * Without delayed extent, ext4 identifies a delayed extent by looking
+ * Without extent status tree, ext4 identifies a delayed extent by looking
* up page cache, this has several deficiencies - complicated, buggy,
* and inefficient code.
*
- * FIEMAP, SEEK_HOLE/DATA, bigalloc, punch hole and writeout all need
- * to know if a block or a range of blocks are belonged to a delayed
- * extent.
+ * FIEMAP, SEEK_HOLE/DATA, bigalloc, and writeout all need to know if a
+ * block or a range of blocks belongs to a delayed extent.
*
- * Let us have a look at how they do without delayed extents implementation.
+ * Let us have a look at how they do without extent status tree.
* -- FIEMAP
* FIEMAP looks up page cache to identify delayed allocations from holes.
*
@@ -68,47 +81,48 @@
* already under delayed allocation or not to determine whether
* quota reserving is needed for the cluster.
*
- * -- punch hole
- * punch hole looks up page cache to identify a delayed extent.
- *
* -- writeout
* Writeout looks up whole page cache to see if a buffer is
* mapped, If there are not very many delayed buffers, then it is
* time comsuming.
*
- * With delayed extents implementation, FIEMAP, SEEK_HOLE/DATA,
+ * With extent status tree implementation, FIEMAP, SEEK_HOLE/DATA,
* bigalloc and writeout can figure out if a block or a range of
* blocks is under delayed allocation(belonged to a delayed extent) or
- * not by searching the delayed extent tree.
+ * not by searching the extent status tree.
*
*
* ==========================================================================
- * 2. ext4 delayed extents impelmentation
+ * 2. Ext4 extent status tree implementation
+ *
+ * -- extent
+ * An extent is a range of blocks which are contiguous logically and
+ * physically. Unlike extent in extent tree, this extent in ext4 is
+ * an in-memory struct, there is no corresponding on-disk data. There
+ * is no limit on length of extent, so an extent can contain as many
+ * blocks as are contiguous logically and physically.
*
- * -- delayed extent
- * A delayed extent is a range of blocks which are contiguous
- * logically and under delayed allocation. Unlike extent in
- * ext4, delayed extent in ext4 is a in-memory struct, there is
- * no corresponding on-disk data. There is no limit on length of
- * delayed extent, so a delayed extent can contain as many blocks
- * as they are contiguous logically.
+ * -- extent status tree
+ * Every inode has an extent status tree and all allocation blocks
+ * are added to the tree with different status. The extents in the
+ * tree are ordered by logical block number.
*
- * -- delayed extent tree
- * Every inode has a delayed extent tree and all under delayed
- * allocation blocks are added to the tree as delayed extents.
- * Delayed extents in the tree are ordered by logical block no.
+ * -- operations on an extent status tree
+ * There are three important operations on an extent status tree: find
+ * next extent, adding an extent (a range of blocks) and removing an extent.
*
- * -- operations on a delayed extent tree
- * There are three operations on a delayed extent tree: find next
- * delayed extent, adding a space(a range of blocks) and removing
- * a space.
+ * -- race on an extent status tree
+ * Extent status tree is protected by inode->i_es_lock.
*
- * -- race on a delayed extent tree
- * Delayed extent tree is protected inode->i_es_lock.
+ * -- memory consumption
+ * A fragmented extent tree will make the extent status tree cost too
+ * much memory. Hence, we will reclaim written/unwritten extents from
+ * the tree under heavy memory pressure.
*
*
* ==========================================================================
- * 3. performance analysis
+ * 3. Performance analysis
+ *
* -- overhead
* 1. There is a cache extent for write access, so if writes are
* not very random, adding space operaions are in O(1) time.
@@ -120,15 +134,21 @@
*
* ==========================================================================
* 4. TODO list
- * -- Track all extent status
*
- * -- Improve get block process
+ * -- Refactor delayed space reservation
*
* -- Extent-level locking
*/
static struct kmem_cache *ext4_es_cachep;
+static int __es_insert_extent(struct inode *inode, struct extent_status *newes);
+static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t end);
+static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
+ int nr_to_scan);
+static int ext4_es_reclaim_extents_count(struct super_block *sb);
+
int __init ext4_init_es(void)
{
ext4_es_cachep = KMEM_CACHE(extent_status, SLAB_RECLAIM_ACCOUNT);
@@ -161,7 +181,8 @@ static void ext4_es_print_tree(struct inode *inode)
while (node) {
struct extent_status *es;
es = rb_entry(node, struct extent_status, rb_node);
- printk(KERN_DEBUG " [%u/%u)", es->start, es->len);
+ printk(KERN_DEBUG " [%u/%u) %llu %u",
+ es->es_lblk, es->es_len, es->es_pblk, es->es_status);
node = rb_next(node);
}
printk(KERN_DEBUG "\n");
@@ -170,10 +191,10 @@ static void ext4_es_print_tree(struct inode *inode)
#define ext4_es_print_tree(inode)
#endif
-static inline ext4_lblk_t extent_status_end(struct extent_status *es)
+static inline ext4_lblk_t ext4_es_end(struct extent_status *es)
{
- BUG_ON(es->start + es->len < es->start);
- return es->start + es->len - 1;
+ BUG_ON(es->es_lblk + es->es_len < es->es_lblk);
+ return es->es_lblk + es->es_len - 1;
}
/*
@@ -181,25 +202,25 @@ static inline ext4_lblk_t extent_status_end(struct extent_status *es)
* it can't be found, try to find next extent.
*/
static struct extent_status *__es_tree_search(struct rb_root *root,
- ext4_lblk_t offset)
+ ext4_lblk_t lblk)
{
struct rb_node *node = root->rb_node;
struct extent_status *es = NULL;
while (node) {
es = rb_entry(node, struct extent_status, rb_node);
- if (offset < es->start)
+ if (lblk < es->es_lblk)
node = node->rb_left;
- else if (offset > extent_status_end(es))
+ else if (lblk > ext4_es_end(es))
node = node->rb_right;
else
return es;
}
- if (es && offset < es->start)
+ if (es && lblk < es->es_lblk)
return es;
- if (es && offset > extent_status_end(es)) {
+ if (es && lblk > ext4_es_end(es)) {
node = rb_next(&es->rb_node);
return node ? rb_entry(node, struct extent_status, rb_node) :
NULL;
@@ -209,14 +230,14 @@ static struct extent_status *__es_tree_search(struct rb_root *root,
}
/*
- * ext4_es_find_extent: find the 1st delayed extent covering @es->start
- * if it exists, otherwise, the next extent after @es->start.
+ * ext4_es_find_extent: find the 1st delayed extent covering @es->es_lblk
+ * if it exists, otherwise, the next extent after @es->es_lblk.
*
* @inode: the inode which owns delayed extents
* @es: delayed extent that we found
*
* Returns the first block of the next extent after es, otherwise
- * EXT_MAX_BLOCKS if no delay extent is found.
+ * EXT_MAX_BLOCKS if no extent is found.
* Delayed extent is returned via @es.
*/
ext4_lblk_t ext4_es_find_extent(struct inode *inode, struct extent_status *es)
@@ -226,62 +247,107 @@ ext4_lblk_t ext4_es_find_extent(struct inode *inode, struct extent_status *es)
struct rb_node *node;
ext4_lblk_t ret = EXT_MAX_BLOCKS;
- trace_ext4_es_find_extent_enter(inode, es->start);
+ trace_ext4_es_find_extent_enter(inode, es->es_lblk);
read_lock(&EXT4_I(inode)->i_es_lock);
tree = &EXT4_I(inode)->i_es_tree;
- /* find delay extent in cache firstly */
+ /* find extent in cache firstly */
if (tree->cache_es) {
es1 = tree->cache_es;
- if (in_range(es->start, es1->start, es1->len)) {
- es_debug("%u cached by [%u/%u)\n",
- es->start, es1->start, es1->len);
+ if (in_range(es->es_lblk, es1->es_lblk, es1->es_len)) {
+ es_debug("%u cached by [%u/%u) %llu %u\n",
+ es->es_lblk, es1->es_lblk, es1->es_len,
+ (unsigned long long)es1->es_pblk,
+ es1->es_status);
goto out;
}
}
- es->len = 0;
- es1 = __es_tree_search(&tree->root, es->start);
+ es->es_len = 0;
+ es1 = __es_tree_search(&tree->root, es->es_lblk);
out:
if (es1) {
tree->cache_es = es1;
- es->start = es1->start;
- es->len = es1->len;
+ es->es_lblk = es1->es_lblk;
+ es->es_len = es1->es_len;
+ es->es_pblk = es1->es_pblk;
+ es->es_status = es1->es_status;
node = rb_next(&es1->rb_node);
if (node) {
es1 = rb_entry(node, struct extent_status, rb_node);
- ret = es1->start;
+ ret = es1->es_lblk;
}
}
read_unlock(&EXT4_I(inode)->i_es_lock);
+ ext4_es_lru_add(inode);
trace_ext4_es_find_extent_exit(inode, es, ret);
return ret;
}
static struct extent_status *
-ext4_es_alloc_extent(ext4_lblk_t start, ext4_lblk_t len)
+ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len,
+ ext4_fsblk_t pblk, int status)
{
struct extent_status *es;
es = kmem_cache_alloc(ext4_es_cachep, GFP_ATOMIC);
if (es == NULL)
return NULL;
- es->start = start;
- es->len = len;
+ es->es_lblk = lblk;
+ es->es_len = len;
+ es->es_pblk = pblk;
+ es->es_status = status;
+
+ /*
+ * We don't count delayed extents because we never try to reclaim them
+ */
+ if (!ext4_es_is_delayed(es))
+ EXT4_I(inode)->i_es_lru_nr++;
+
return es;
}
-static void ext4_es_free_extent(struct extent_status *es)
+static void ext4_es_free_extent(struct inode *inode, struct extent_status *es)
{
+ /* Decrease the lru counter when this es is not delayed */
+ if (!ext4_es_is_delayed(es)) {
+ BUG_ON(EXT4_I(inode)->i_es_lru_nr == 0);
+ EXT4_I(inode)->i_es_lru_nr--;
+ }
+
kmem_cache_free(ext4_es_cachep, es);
}
+/*
+ * Check whether or not two extents can be merged
+ * Condition:
+ * - logical block number is contiguous
+ * - physical block number is contiguous (not checked for delayed extents)
+ * - status is equal
+ */
+static int ext4_es_can_be_merged(struct extent_status *es1,
+ struct extent_status *es2)
+{
+ if (es1->es_lblk + es1->es_len != es2->es_lblk)
+ return 0;
+
+ if (es1->es_status != es2->es_status)
+ return 0;
+
+ if (!ext4_es_is_delayed(es1) &&
+ (es1->es_pblk + es1->es_len != es2->es_pblk))
+ return 0;
+
+ return 1;
+}
+
static struct extent_status *
-ext4_es_try_to_merge_left(struct ext4_es_tree *tree, struct extent_status *es)
+ext4_es_try_to_merge_left(struct inode *inode, struct extent_status *es)
{
+ struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
struct extent_status *es1;
struct rb_node *node;
@@ -290,10 +356,10 @@ ext4_es_try_to_merge_left(struct ext4_es_tree *tree, struct extent_status *es)
return es;
es1 = rb_entry(node, struct extent_status, rb_node);
- if (es->start == extent_status_end(es1) + 1) {
- es1->len += es->len;
+ if (ext4_es_can_be_merged(es1, es)) {
+ es1->es_len += es->es_len;
rb_erase(&es->rb_node, &tree->root);
- ext4_es_free_extent(es);
+ ext4_es_free_extent(inode, es);
es = es1;
}
@@ -301,8 +367,9 @@ ext4_es_try_to_merge_left(struct ext4_es_tree *tree, struct extent_status *es)
}
static struct extent_status *
-ext4_es_try_to_merge_right(struct ext4_es_tree *tree, struct extent_status *es)
+ext4_es_try_to_merge_right(struct inode *inode, struct extent_status *es)
{
+ struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
struct extent_status *es1;
struct rb_node *node;
@@ -311,69 +378,51 @@ ext4_es_try_to_merge_right(struct ext4_es_tree *tree, struct extent_status *es)
return es;
es1 = rb_entry(node, struct extent_status, rb_node);
- if (es1->start == extent_status_end(es) + 1) {
- es->len += es1->len;
+ if (ext4_es_can_be_merged(es, es1)) {
+ es->es_len += es1->es_len;
rb_erase(node, &tree->root);
- ext4_es_free_extent(es1);
+ ext4_es_free_extent(inode, es1);
}
return es;
}
-static int __es_insert_extent(struct ext4_es_tree *tree, ext4_lblk_t offset,
- ext4_lblk_t len)
+static int __es_insert_extent(struct inode *inode, struct extent_status *newes)
{
+ struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
struct rb_node **p = &tree->root.rb_node;
struct rb_node *parent = NULL;
struct extent_status *es;
- ext4_lblk_t end = offset + len - 1;
-
- BUG_ON(end < offset);
- es = tree->cache_es;
- if (es && offset == (extent_status_end(es) + 1)) {
- es_debug("cached by [%u/%u)\n", es->start, es->len);
- es->len += len;
- es = ext4_es_try_to_merge_right(tree, es);
- goto out;
- } else if (es && es->start == end + 1) {
- es_debug("cached by [%u/%u)\n", es->start, es->len);
- es->start = offset;
- es->len += len;
- es = ext4_es_try_to_merge_left(tree, es);
- goto out;
- } else if (es && es->start <= offset &&
- end <= extent_status_end(es)) {
- es_debug("cached by [%u/%u)\n", es->start, es->len);
- goto out;
- }
while (*p) {
parent = *p;
es = rb_entry(parent, struct extent_status, rb_node);
- if (offset < es->start) {
- if (es->start == end + 1) {
- es->start = offset;
- es->len += len;
- es = ext4_es_try_to_merge_left(tree, es);
+ if (newes->es_lblk < es->es_lblk) {
+ if (ext4_es_can_be_merged(newes, es)) {
+ es->es_lblk = newes->es_lblk;
+ es->es_len += newes->es_len;
+ es->es_pblk = ext4_es_get_pblock(es,
+ newes->es_pblk);
+ es = ext4_es_try_to_merge_left(inode, es);
goto out;
}
p = &(*p)->rb_left;
- } else if (offset > extent_status_end(es)) {
- if (offset == extent_status_end(es) + 1) {
- es->len += len;
- es = ext4_es_try_to_merge_right(tree, es);
+ } else if (newes->es_lblk > ext4_es_end(es)) {
+ if (ext4_es_can_be_merged(es, newes)) {
+ es->es_len += newes->es_len;
+ es = ext4_es_try_to_merge_right(inode, es);
goto out;
}
p = &(*p)->rb_right;
} else {
- if (extent_status_end(es) <= end)
- es->len = offset - es->start + len;
- goto out;
+ BUG_ON(1);
+ return -EINVAL;
}
}
- es = ext4_es_alloc_extent(offset, len);
+ es = ext4_es_alloc_extent(inode, newes->es_lblk, newes->es_len,
+ newes->es_pblk, newes->es_status);
if (!es)
return -ENOMEM;
rb_link_node(&es->rb_node, parent, p);
@@ -385,85 +434,154 @@ out:
}
/*
- * ext4_es_insert_extent() adds a space to a delayed extent tree.
- * Caller holds inode->i_es_lock.
+ * ext4_es_insert_extent() adds a space to an extent status tree.
*
* ext4_es_insert_extent is called by ext4_da_write_begin and
* ext4_es_remove_extent.
*
* Return 0 on success, error code on failure.
*/
-int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t offset,
- ext4_lblk_t len)
+int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t len, ext4_fsblk_t pblk, int status)
{
- struct ext4_es_tree *tree;
+ struct extent_status newes;
+ ext4_lblk_t end = lblk + len - 1;
int err = 0;
- trace_ext4_es_insert_extent(inode, offset, len);
- es_debug("add [%u/%u) to extent status tree of inode %lu\n",
- offset, len, inode->i_ino);
+ es_debug("add [%u/%u) %llu %d to extent status tree of inode %lu\n",
+ lblk, len, pblk, status, inode->i_ino);
+
+ BUG_ON(end < lblk);
+
+ newes.es_lblk = lblk;
+ newes.es_len = len;
+ newes.es_pblk = pblk;
+ newes.es_status = status;
+ trace_ext4_es_insert_extent(inode, &newes);
write_lock(&EXT4_I(inode)->i_es_lock);
- tree = &EXT4_I(inode)->i_es_tree;
- err = __es_insert_extent(tree, offset, len);
+ err = __es_remove_extent(inode, lblk, end);
+ if (err != 0)
+ goto error;
+ err = __es_insert_extent(inode, &newes);
+
+error:
write_unlock(&EXT4_I(inode)->i_es_lock);
+ ext4_es_lru_add(inode);
ext4_es_print_tree(inode);
return err;
}
/*
- * ext4_es_remove_extent() removes a space from a delayed extent tree.
- * Caller holds inode->i_es_lock.
+ * ext4_es_lookup_extent() looks up an extent in extent status tree.
*
- * Return 0 on success, error code on failure.
+ * ext4_es_lookup_extent is called by ext4_map_blocks/ext4_da_map_blocks.
+ *
+ * Return: 1 if found, 0 if not
*/
-int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t offset,
- ext4_lblk_t len)
+int ext4_es_lookup_extent(struct inode *inode, struct extent_status *es)
{
- struct rb_node *node;
struct ext4_es_tree *tree;
+ struct extent_status *es1;
+ struct rb_node *node;
+ int found = 0;
+
+ trace_ext4_es_lookup_extent_enter(inode, es->es_lblk);
+ es_debug("lookup extent in block %u\n", es->es_lblk);
+
+ tree = &EXT4_I(inode)->i_es_tree;
+ read_lock(&EXT4_I(inode)->i_es_lock);
+
+ /* find extent in cache firstly */
+ if (tree->cache_es) {
+ es1 = tree->cache_es;
+ if (in_range(es->es_lblk, es1->es_lblk, es1->es_len)) {
+ es_debug("%u cached by [%u/%u)\n",
+ es->es_lblk, es1->es_lblk, es1->es_len);
+ found = 1;
+ goto out;
+ }
+ }
+
+ es->es_len = 0;
+ node = tree->root.rb_node;
+ while (node) {
+ es1 = rb_entry(node, struct extent_status, rb_node);
+ if (es->es_lblk < es1->es_lblk)
+ node = node->rb_left;
+ else if (es->es_lblk > ext4_es_end(es1))
+ node = node->rb_right;
+ else {
+ found = 1;
+ break;
+ }
+ }
+
+out:
+ if (found) {
+ es->es_lblk = es1->es_lblk;
+ es->es_len = es1->es_len;
+ es->es_pblk = es1->es_pblk;
+ es->es_status = es1->es_status;
+ }
+
+ read_unlock(&EXT4_I(inode)->i_es_lock);
+
+ ext4_es_lru_add(inode);
+ trace_ext4_es_lookup_extent_exit(inode, es, found);
+ return found;
+}
+
+static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t end)
+{
+ struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
+ struct rb_node *node;
struct extent_status *es;
struct extent_status orig_es;
- ext4_lblk_t len1, len2, end;
+ ext4_lblk_t len1, len2;
int err = 0;
- trace_ext4_es_remove_extent(inode, offset, len);
- es_debug("remove [%u/%u) from extent status tree of inode %lu\n",
- offset, len, inode->i_ino);
-
- end = offset + len - 1;
- BUG_ON(end < offset);
- write_lock(&EXT4_I(inode)->i_es_lock);
- tree = &EXT4_I(inode)->i_es_tree;
- es = __es_tree_search(&tree->root, offset);
+ es = __es_tree_search(&tree->root, lblk);
if (!es)
goto out;
- if (es->start > end)
+ if (es->es_lblk > end)
goto out;
/* Simply invalidate cache_es. */
tree->cache_es = NULL;
- orig_es.start = es->start;
- orig_es.len = es->len;
- len1 = offset > es->start ? offset - es->start : 0;
- len2 = extent_status_end(es) > end ?
- extent_status_end(es) - end : 0;
+ orig_es.es_lblk = es->es_lblk;
+ orig_es.es_len = es->es_len;
+ orig_es.es_pblk = es->es_pblk;
+ orig_es.es_status = es->es_status;
+
+ len1 = lblk > es->es_lblk ? lblk - es->es_lblk : 0;
+ len2 = ext4_es_end(es) > end ? ext4_es_end(es) - end : 0;
if (len1 > 0)
- es->len = len1;
+ es->es_len = len1;
if (len2 > 0) {
if (len1 > 0) {
- err = __es_insert_extent(tree, end + 1, len2);
+ struct extent_status newes;
+
+ newes.es_lblk = end + 1;
+ newes.es_len = len2;
+ newes.es_pblk = ext4_es_get_pblock(&orig_es,
+ orig_es.es_pblk + orig_es.es_len - len2);
+ newes.es_status = orig_es.es_status;
+ err = __es_insert_extent(inode, &newes);
if (err) {
- es->start = orig_es.start;
- es->len = orig_es.len;
+ es->es_lblk = orig_es.es_lblk;
+ es->es_len = orig_es.es_len;
goto out;
}
} else {
- es->start = end + 1;
- es->len = len2;
+ es->es_lblk = end + 1;
+ es->es_len = len2;
+ es->es_pblk = ext4_es_get_pblock(es,
+ orig_es.es_pblk + orig_es.es_len - len2);
}
goto out;
}
@@ -476,10 +594,10 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t offset,
es = NULL;
}
- while (es && extent_status_end(es) <= end) {
+ while (es && ext4_es_end(es) <= end) {
node = rb_next(&es->rb_node);
rb_erase(&es->rb_node, &tree->root);
- ext4_es_free_extent(es);
+ ext4_es_free_extent(inode, es);
if (!node) {
es = NULL;
break;
@@ -487,14 +605,178 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t offset,
es = rb_entry(node, struct extent_status, rb_node);
}
- if (es && es->start < end + 1) {
- len1 = extent_status_end(es) - end;
- es->start = end + 1;
- es->len = len1;
+ if (es && es->es_lblk < end + 1) {
+ ext4_lblk_t orig_len = es->es_len;
+
+ len1 = ext4_es_end(es) - end;
+ es->es_lblk = end + 1;
+ es->es_len = len1;
+ es->es_pblk = ext4_es_get_pblock(es,
+ es->es_pblk + orig_len - len1);
}
out:
+ return err;
+}
+
+/*
+ * ext4_es_remove_extent() removes a space from an extent status tree.
+ *
+ * Return 0 on success, error code on failure.
+ */
+int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t len)
+{
+ ext4_lblk_t end;
+ int err = 0;
+
+ trace_ext4_es_remove_extent(inode, lblk, len);
+ es_debug("remove [%u/%u) from extent status tree of inode %lu\n",
+ lblk, len, inode->i_ino);
+
+ end = lblk + len - 1;
+ BUG_ON(end < lblk);
+
+ write_lock(&EXT4_I(inode)->i_es_lock);
+ err = __es_remove_extent(inode, lblk, end);
write_unlock(&EXT4_I(inode)->i_es_lock);
ext4_es_print_tree(inode);
return err;
}
+
+static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
+{
+ struct ext4_sb_info *sbi = container_of(shrink,
+ struct ext4_sb_info, s_es_shrinker);
+ struct ext4_inode_info *ei;
+ struct list_head *cur, *tmp, scanned;
+ int nr_to_scan = sc->nr_to_scan;
+ int ret, nr_shrunk = 0;
+
+ trace_ext4_es_shrink_enter(sbi->s_sb, nr_to_scan);
+
+ if (!nr_to_scan)
+ return ext4_es_reclaim_extents_count(sbi->s_sb);
+
+ INIT_LIST_HEAD(&scanned);
+
+ spin_lock(&sbi->s_es_lru_lock);
+ list_for_each_safe(cur, tmp, &sbi->s_es_lru) {
+ list_move_tail(cur, &scanned);
+
+ ei = list_entry(cur, struct ext4_inode_info, i_es_lru);
+
+ read_lock(&ei->i_es_lock);
+ if (ei->i_es_lru_nr == 0) {
+ read_unlock(&ei->i_es_lock);
+ continue;
+ }
+ read_unlock(&ei->i_es_lock);
+
+ write_lock(&ei->i_es_lock);
+ ret = __es_try_to_reclaim_extents(ei, nr_to_scan);
+ write_unlock(&ei->i_es_lock);
+
+ nr_shrunk += ret;
+ nr_to_scan -= ret;
+ if (nr_to_scan == 0)
+ break;
+ }
+ list_splice_tail(&scanned, &sbi->s_es_lru);
+ spin_unlock(&sbi->s_es_lru_lock);
+ trace_ext4_es_shrink_exit(sbi->s_sb, nr_shrunk);
+
+ return ext4_es_reclaim_extents_count(sbi->s_sb);
+}
+
+void ext4_es_register_shrinker(struct super_block *sb)
+{
+ struct ext4_sb_info *sbi;
+
+ sbi = EXT4_SB(sb);
+ INIT_LIST_HEAD(&sbi->s_es_lru);
+ spin_lock_init(&sbi->s_es_lru_lock);
+ sbi->s_es_shrinker.shrink = ext4_es_shrink;
+ sbi->s_es_shrinker.seeks = DEFAULT_SEEKS;
+ register_shrinker(&sbi->s_es_shrinker);
+}
+
+void ext4_es_unregister_shrinker(struct super_block *sb)
+{
+ unregister_shrinker(&EXT4_SB(sb)->s_es_shrinker);
+}
+
+void ext4_es_lru_add(struct inode *inode)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+
+ spin_lock(&sbi->s_es_lru_lock);
+ if (list_empty(&ei->i_es_lru))
+ list_add_tail(&ei->i_es_lru, &sbi->s_es_lru);
+ else
+ list_move_tail(&ei->i_es_lru, &sbi->s_es_lru);
+ spin_unlock(&sbi->s_es_lru_lock);
+}
+
+void ext4_es_lru_del(struct inode *inode)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+
+ spin_lock(&sbi->s_es_lru_lock);
+ if (!list_empty(&ei->i_es_lru))
+ list_del_init(&ei->i_es_lru);
+ spin_unlock(&sbi->s_es_lru_lock);
+}
+
+static int ext4_es_reclaim_extents_count(struct super_block *sb)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_inode_info *ei;
+ struct list_head *cur;
+ int nr_cached = 0;
+
+ spin_lock(&sbi->s_es_lru_lock);
+ list_for_each(cur, &sbi->s_es_lru) {
+ ei = list_entry(cur, struct ext4_inode_info, i_es_lru);
+ read_lock(&ei->i_es_lock);
+ nr_cached += ei->i_es_lru_nr;
+ read_unlock(&ei->i_es_lock);
+ }
+ spin_unlock(&sbi->s_es_lru_lock);
+ trace_ext4_es_reclaim_extents_count(sb, nr_cached);
+ return nr_cached;
+}
+
+static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
+ int nr_to_scan)
+{
+ struct inode *inode = &ei->vfs_inode;
+ struct ext4_es_tree *tree = &ei->i_es_tree;
+ struct rb_node *node;
+ struct extent_status *es;
+ int nr_shrunk = 0;
+
+ if (ei->i_es_lru_nr == 0)
+ return 0;
+
+ node = rb_first(&tree->root);
+ while (node != NULL) {
+ es = rb_entry(node, struct extent_status, rb_node);
+ node = rb_next(&es->rb_node);
+ /*
+ * We can't reclaim delayed extents from the status tree because
+ * fiemap, bigalloc, and seek_data/hole need to use them.
+ */
+ if (!ext4_es_is_delayed(es)) {
+ rb_erase(&es->rb_node, &tree->root);
+ ext4_es_free_extent(inode, es);
+ nr_shrunk++;
+ if (--nr_to_scan == 0)
+ break;
+ }
+ }
+ tree->cache_es = NULL;
+ return nr_shrunk;
+}
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
index 077f82db092a..d199a51c1842 100644
--- a/fs/ext4/extents_status.h
+++ b/fs/ext4/extents_status.h
@@ -20,10 +20,22 @@
#define es_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__)
#endif
+enum {
+ EXTENT_STATUS_WRITTEN = 0, /* written extent */
+ EXTENT_STATUS_UNWRITTEN = 1, /* unwritten extent */
+ EXTENT_STATUS_DELAYED = 2, /* delayed extent */
+};
+
+/*
+ * To save memory, es_status is stashed into es_pblk because a physical
+ * block number only needs 48 bits and es_status only needs 2 bits.
+ */
struct extent_status {
struct rb_node rb_node;
- ext4_lblk_t start; /* first block extent covers */
- ext4_lblk_t len; /* length of extent in block */
+ ext4_lblk_t es_lblk; /* first logical block extent covers */
+ ext4_lblk_t es_len; /* length of extent in block */
+ ext4_fsblk_t es_pblk : 62; /* first physical block */
+ ext4_fsblk_t es_status : 2; /* record the status of extent */
};
struct ext4_es_tree {
@@ -35,11 +47,39 @@ extern int __init ext4_init_es(void);
extern void ext4_exit_es(void);
extern void ext4_es_init_tree(struct ext4_es_tree *tree);
-extern int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t start,
- ext4_lblk_t len);
-extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t start,
+extern int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t len, ext4_fsblk_t pblk,
+ int status);
+extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t len);
extern ext4_lblk_t ext4_es_find_extent(struct inode *inode,
struct extent_status *es);
+extern int ext4_es_lookup_extent(struct inode *inode, struct extent_status *es);
+
+static inline int ext4_es_is_written(struct extent_status *es)
+{
+ return (es->es_status == EXTENT_STATUS_WRITTEN);
+}
+
+static inline int ext4_es_is_unwritten(struct extent_status *es)
+{
+ return (es->es_status == EXTENT_STATUS_UNWRITTEN);
+}
+
+static inline int ext4_es_is_delayed(struct extent_status *es)
+{
+ return (es->es_status == EXTENT_STATUS_DELAYED);
+}
+
+static inline ext4_fsblk_t ext4_es_get_pblock(struct extent_status *es,
+ ext4_fsblk_t pb)
+{
+ return (ext4_es_is_delayed(es) ? ~0 : pb);
+}
+
+extern void ext4_es_register_shrinker(struct super_block *sb);
+extern void ext4_es_unregister_shrinker(struct super_block *sb);
+extern void ext4_es_lru_add(struct inode *inode);
+extern void ext4_es_lru_del(struct inode *inode);
#endif /* _EXT4_EXTENTS_STATUS_H */
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 405565a62277..afaf9f15303e 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -464,10 +464,10 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
* If there is a delay extent at this offset,
* it will be as a data.
*/
- es.start = last;
+ es.es_lblk = last;
(void)ext4_es_find_extent(inode, &es);
- if (last >= es.start &&
- last < es.start + es.len) {
+ if (ext4_es_is_delayed(&es) &&
+ last >= es.es_lblk && last < es.es_lblk + es.es_len) {
if (last != start)
dataoff = last << blkbits;
break;
@@ -549,11 +549,11 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
* If there is a delay extent at this offset,
* we will skip this extent.
*/
- es.start = last;
+ es.es_lblk = last;
(void)ext4_es_find_extent(inode, &es);
- if (last >= es.start &&
- last < es.start + es.len) {
- last = es.start + es.len;
+ if (ext4_es_is_delayed(&es) &&
+ last >= es.es_lblk && last < es.es_lblk + es.es_len) {
+ last = es.es_lblk + es.es_len;
holeoff = last << blkbits;
continue;
}
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 20862f96e8ae..bdd20231e66c 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -146,6 +146,7 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth,
struct super_block *sb = inode->i_sb;
Indirect *p = chain;
struct buffer_head *bh;
+ int ret = -EIO;
*err = 0;
/* i_data is not going away, no lock needed */
@@ -154,8 +155,10 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth,
goto no_block;
while (--depth) {
bh = sb_getblk(sb, le32_to_cpu(p->key));
- if (unlikely(!bh))
+ if (unlikely(!bh)) {
+ ret = -ENOMEM;
goto failure;
+ }
if (!bh_uptodate_or_lock(bh)) {
if (bh_submit_read(bh) < 0) {
@@ -177,7 +180,7 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth,
return NULL;
failure:
- *err = -EIO;
+ *err = ret;
no_block:
return p;
}
@@ -471,7 +474,7 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
*/
bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
if (unlikely(!bh)) {
- err = -EIO;
+ err = -ENOMEM;
goto failed;
}
@@ -1515,3 +1518,243 @@ out_stop:
trace_ext4_truncate_exit(inode);
}
+static int free_hole_blocks(handle_t *handle, struct inode *inode,
+ struct buffer_head *parent_bh, __le32 *i_data,
+ int level, ext4_lblk_t first,
+ ext4_lblk_t count, int max)
+{
+ struct buffer_head *bh = NULL;
+ int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
+ int ret = 0;
+ int i, inc;
+ ext4_lblk_t offset;
+ __le32 blk;
+
+ inc = 1 << ((EXT4_BLOCK_SIZE_BITS(inode->i_sb) - 2) * level);
+ for (i = 0, offset = 0; i < max; i++, i_data++, offset += inc) {
+ if (offset >= count + first)
+ break;
+ if (*i_data == 0 || (offset + inc) <= first)
+ continue;
+ blk = *i_data;
+ if (level > 0) {
+ ext4_lblk_t first2;
+ bh = sb_bread(inode->i_sb, blk);
+ if (!bh) {
+ EXT4_ERROR_INODE_BLOCK(inode, blk,
+ "Read failure");
+ return -EIO;
+ }
+ first2 = (first > offset) ? first - offset : 0;
+ ret = free_hole_blocks(handle, inode, bh,
+ (__le32 *)bh->b_data, level - 1,
+ first2, count - offset,
+ inode->i_sb->s_blocksize >> 2);
+ if (ret) {
+ brelse(bh);
+ goto err;
+ }
+ }
+ if (level == 0 ||
+ (bh && all_zeroes((__le32 *)bh->b_data,
+ (__le32 *)bh->b_data + addr_per_block))) {
+ ext4_free_data(handle, inode, parent_bh, &blk, &blk+1);
+ *i_data = 0;
+ }
+ brelse(bh);
+ bh = NULL;
+ }
+
+err:
+ return ret;
+}
+
+static int ext4_free_hole_blocks(handle_t *handle, struct inode *inode,
+ ext4_lblk_t first, ext4_lblk_t stop)
+{
+ int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
+ int level, ret = 0;
+ int num = EXT4_NDIR_BLOCKS;
+ ext4_lblk_t count, max = EXT4_NDIR_BLOCKS;
+ __le32 *i_data = EXT4_I(inode)->i_data;
+
+ count = stop - first;
+ for (level = 0; level < 4; level++, max *= addr_per_block) {
+ if (first < max) {
+ ret = free_hole_blocks(handle, inode, NULL, i_data,
+ level, first, count, num);
+ if (ret)
+ goto err;
+ if (count > max - first)
+ count -= max - first;
+ else
+ break;
+ first = 0;
+ } else {
+ first -= max;
+ }
+ i_data += num;
+ if (level == 0) {
+ num = 1;
+ max = 1;
+ }
+ }
+
+err:
+ return ret;
+}
+
+int ext4_ind_punch_hole(struct file *file, loff_t offset, loff_t length)
+{
+ struct inode *inode = file->f_path.dentry->d_inode;
+ struct super_block *sb = inode->i_sb;
+ ext4_lblk_t first_block, stop_block;
+ struct address_space *mapping = inode->i_mapping;
+ handle_t *handle = NULL;
+ loff_t first_page, last_page, page_len;
+ loff_t first_page_offset, last_page_offset;
+ int err = 0;
+
+ /*
+ * Write out all dirty pages to avoid race conditions
+ * Then release them.
+ */
+ if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
+ err = filemap_write_and_wait_range(mapping,
+ offset, offset + length - 1);
+ if (err)
+ return err;
+ }
+
+ mutex_lock(&inode->i_mutex);
+ /* It's not possible to punch a hole in an append-only file */
+ if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) {
+ err = -EPERM;
+ goto out_mutex;
+ }
+ if (IS_SWAPFILE(inode)) {
+ err = -ETXTBSY;
+ goto out_mutex;
+ }
+
+ /* No need to punch hole beyond i_size */
+ if (offset >= inode->i_size)
+ goto out_mutex;
+
+ /*
+ * If the hole extends beyond i_size, set the hole
+ * to end after the page that contains i_size
+ */
+ if (offset + length > inode->i_size) {
+ length = inode->i_size +
+ PAGE_CACHE_SIZE - (inode->i_size & (PAGE_CACHE_SIZE - 1)) -
+ offset;
+ }
+
+ first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ last_page = (offset + length) >> PAGE_CACHE_SHIFT;
+
+ first_page_offset = first_page << PAGE_CACHE_SHIFT;
+ last_page_offset = last_page << PAGE_CACHE_SHIFT;
+
+ /* Now release the pages */
+ if (last_page_offset > first_page_offset) {
+ truncate_pagecache_range(inode, first_page_offset,
+ last_page_offset - 1);
+ }
+
+ /* Wait for all existing dio work; newcomers will block on i_mutex */
+ inode_dio_wait(inode);
+
+ handle = start_transaction(inode);
+ if (IS_ERR(handle))
+ goto out_mutex;
+
+ /*
+ * Now we need to zero out the non-page-aligned data in the
+ * pages at the start and tail of the hole, and unmap the buffer
+ * heads for the block aligned regions of the page that were
+ * completely zeroed.
+ */
+ if (first_page > last_page) {
+ /*
+ * If the file space being truncated is contained within a page
+ * just zero out and unmap the middle of that page
+ */
+ err = ext4_discard_partial_page_buffers(handle,
+ mapping, offset, length, 0);
+ if (err)
+ goto out;
+ } else {
+ /*
+ * Zero out and unmap the partial page that contains
+ * the start of the hole
+ */
+ page_len = first_page_offset - offset;
+ if (page_len > 0) {
+ err = ext4_discard_partial_page_buffers(handle, mapping,
+ offset, page_len, 0);
+ if (err)
+ goto out;
+ }
+
+ /*
+ * Zero out and unmap the partial page that contains
+ * the end of the hole
+ */
+ page_len = offset + length - last_page_offset;
+ if (page_len > 0) {
+ err = ext4_discard_partial_page_buffers(handle, mapping,
+ last_page_offset, page_len, 0);
+ if (err)
+ goto out;
+ }
+ }
+
+ /*
+ * If i_size is contained in the last page, we need to
+ * unmap and zero the partial page after i_size
+ */
+ if (inode->i_size >> PAGE_CACHE_SHIFT == last_page &&
+ inode->i_size % PAGE_CACHE_SIZE != 0) {
+ page_len = PAGE_CACHE_SIZE -
+ (inode->i_size & (PAGE_CACHE_SIZE - 1));
+ if (page_len > 0) {
+ err = ext4_discard_partial_page_buffers(handle,
+ mapping, inode->i_size, page_len, 0);
+ if (err)
+ goto out;
+ }
+ }
+
+ first_block = (offset + sb->s_blocksize - 1) >>
+ EXT4_BLOCK_SIZE_BITS(sb);
+ stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb);
+
+ if (first_block >= stop_block)
+ goto out;
+
+ down_write(&EXT4_I(inode)->i_data_sem);
+ ext4_discard_preallocations(inode);
+
+ err = ext4_es_remove_extent(inode, first_block,
+ stop_block - first_block);
+ err = ext4_free_hole_blocks(handle, inode, first_block, stop_block);
+
+ ext4_discard_preallocations(inode);
+
+ if (IS_SYNC(inode))
+ ext4_handle_sync(handle);
+
+ up_write(&EXT4_I(inode)->i_data_sem);
+
+out:
+ inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+ ext4_mark_inode_dirty(handle, inode);
+ ext4_journal_stop(handle);
+
+out_mutex:
+ mutex_unlock(&inode->i_mutex);
+
+ return err;
+}
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 387c47c6cda9..93a3408fc89b 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -1188,7 +1188,7 @@ static int ext4_convert_inline_data_nolock(handle_t *handle,
data_bh = sb_getblk(inode->i_sb, map.m_pblk);
if (!data_bh) {
- error = -EIO;
+ error = -ENOMEM;
goto out_restore;
}
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index cbfe13bf5b2a..fb1907ddfbde 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -132,10 +132,6 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode,
}
static void ext4_invalidatepage(struct page *page, unsigned long offset);
-static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create);
-static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode);
-static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
static int __ext4_journalled_writepage(struct page *page, unsigned int len);
static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);
static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle,
@@ -508,12 +504,33 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
int ext4_map_blocks(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map, int flags)
{
+ struct extent_status es;
int retval;
map->m_flags = 0;
ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u,"
"logical block %lu\n", inode->i_ino, flags, map->m_len,
(unsigned long) map->m_lblk);
+
+ /* Look up the extent status tree first */
+ es.es_lblk = map->m_lblk;
+ if (ext4_es_lookup_extent(inode, &es)) {
+ if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) {
+ map->m_pblk = es.es_pblk + map->m_lblk - es.es_lblk;
+ map->m_flags |= ext4_es_is_written(&es) ?
+ EXT4_MAP_MAPPED : EXT4_MAP_UNWRITTEN;
+ retval = es.es_len - (map->m_lblk - es.es_lblk);
+ if (retval > map->m_len)
+ retval = map->m_len;
+ map->m_len = retval;
+ } else if (ext4_es_is_delayed(&es)) {
+ retval = 0;
+ } else {
+ BUG_ON(1);
+ }
+ goto found;
+ }
+
/*
* Try to see if we can get the block without requesting a new
* file system block.
@@ -527,20 +544,21 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
retval = ext4_ind_map_blocks(handle, inode, map, flags &
EXT4_GET_BLOCKS_KEEP_SIZE);
}
+ if (retval > 0) {
+ int ret, status;
+ status = map->m_flags & EXT4_MAP_UNWRITTEN ?
+ EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
+ ret = ext4_es_insert_extent(inode, map->m_lblk,
+ map->m_len, map->m_pblk, status);
+ if (ret < 0)
+ retval = ret;
+ }
if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
up_read((&EXT4_I(inode)->i_data_sem));
+found:
if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
- int ret;
- if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
- /* delayed alloc may be allocated by fallocate and
- * coverted to initialized by directIO.
- * we need to handle delayed extent here.
- */
- down_write((&EXT4_I(inode)->i_data_sem));
- goto delayed_mapped;
- }
- ret = check_block_validity(inode, map);
+ int ret = check_block_validity(inode, map);
if (ret != 0)
return ret;
}
@@ -615,18 +633,27 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE))
ext4_da_update_reserve_space(inode, retval, 1);
}
- if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
+ if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
- if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
- int ret;
-delayed_mapped:
- /* delayed allocation blocks has been allocated */
- ret = ext4_es_remove_extent(inode, map->m_lblk,
- map->m_len);
- if (ret < 0)
- retval = ret;
- }
+ if (retval > 0) {
+ int ret, status;
+
+ if (flags & EXT4_GET_BLOCKS_PRE_IO)
+ status = EXTENT_STATUS_UNWRITTEN;
+ else if (flags & EXT4_GET_BLOCKS_CONVERT)
+ status = EXTENT_STATUS_WRITTEN;
+ else if (flags & EXT4_GET_BLOCKS_UNINIT_EXT)
+ status = EXTENT_STATUS_UNWRITTEN;
+ else if (flags & EXT4_GET_BLOCKS_CREATE)
+ status = EXTENT_STATUS_WRITTEN;
+ else
+ BUG_ON(1);
+
+ ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
+ map->m_pblk, status);
+ if (ret < 0)
+ retval = ret;
}
up_write((&EXT4_I(inode)->i_data_sem));
@@ -713,8 +740,8 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
return NULL;
bh = sb_getblk(inode->i_sb, map.m_pblk);
- if (!bh) {
- *errp = -EIO;
+ if (unlikely(!bh)) {
+ *errp = -ENOMEM;
return NULL;
}
if (map.m_flags & EXT4_MAP_NEW) {
@@ -808,11 +835,10 @@ int ext4_walk_page_buffers(handle_t *handle,
* and the commit_write(). So doing the jbd2_journal_start at the start of
* prepare_write() is the right place.
*
- * Also, this function can nest inside ext4_writepage() ->
- * block_write_full_page(). In that case, we *know* that ext4_writepage()
- * has generated enough buffer credits to do the whole page. So we won't
- * block on the journal in that case, which is good, because the caller may
- * be PF_MEMALLOC.
+ * Also, this function can nest inside ext4_writepage(). In that case, we
+ * *know* that ext4_writepage() has generated enough buffer credits to do the
+ * whole page. So we won't block on the journal in that case, which is good,
+ * because the caller may be PF_MEMALLOC.
*
* By accident, ext4 can be reentered when a transaction is open via
* quota file writes. If we were to commit the transaction while thus
@@ -1357,7 +1383,6 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
loff_t size = i_size_read(inode);
unsigned int len, block_start;
struct buffer_head *bh, *page_bufs = NULL;
- int journal_data = ext4_should_journal_data(inode);
sector_t pblock = 0, cur_logical = 0;
struct ext4_io_submit io_submit;
@@ -1378,7 +1403,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
if (nr_pages == 0)
break;
for (i = 0; i < nr_pages; i++) {
- int commit_write = 0, skip_page = 0;
+ int skip_page = 0;
struct page *page = pvec.pages[i];
index = page->index;
@@ -1400,27 +1425,9 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
BUG_ON(!PageLocked(page));
BUG_ON(PageWriteback(page));
- /*
- * If the page does not have buffers (for
- * whatever reason), try to create them using
- * __block_write_begin. If this fails,
- * skip the page and move on.
- */
- if (!page_has_buffers(page)) {
- if (__block_write_begin(page, 0, len,
- noalloc_get_block_write)) {
- skip_page:
- unlock_page(page);
- continue;
- }
- commit_write = 1;
- }
-
bh = page_bufs = page_buffers(page);
block_start = 0;
do {
- if (!bh)
- goto skip_page;
if (map && (cur_logical >= map->m_lblk) &&
(cur_logical <= (map->m_lblk +
(map->m_len - 1)))) {
@@ -1448,33 +1455,14 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
pblock++;
} while (bh != page_bufs);
- if (skip_page)
- goto skip_page;
-
- if (commit_write)
- /* mark the buffer_heads as dirty & uptodate */
- block_commit_write(page, 0, len);
+ if (skip_page) {
+ unlock_page(page);
+ continue;
+ }
clear_page_dirty_for_io(page);
- /*
- * Delalloc doesn't support data journalling,
- * but eventually maybe we'll lift this
- * restriction.
- */
- if (unlikely(journal_data && PageChecked(page)))
- err = __ext4_journalled_writepage(page, len);
- else if (test_opt(inode->i_sb, MBLK_IO_SUBMIT))
- err = ext4_bio_write_page(&io_submit, page,
- len, mpd->wbc);
- else if (buffer_uninit(page_bufs)) {
- ext4_set_bh_endio(page_bufs, inode);
- err = block_write_full_page_endio(page,
- noalloc_get_block_write,
- mpd->wbc, ext4_end_io_buffer_write);
- } else
- err = block_write_full_page(page,
- noalloc_get_block_write, mpd->wbc);
-
+ err = ext4_bio_write_page(&io_submit, page, len,
+ mpd->wbc);
if (!err)
mpd->pages_written++;
/*
@@ -1690,16 +1678,16 @@ submit_io:
*
* @mpd->lbh - extent of blocks
* @logical - logical number of the block in the file
- * @bh - bh of the block (used to access block's state)
+ * @b_state - b_state of the buffer head added
*
* the function is used to collect contig. blocks in same state
*/
-static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
- sector_t logical, size_t b_size,
+static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, sector_t logical,
unsigned long b_state)
{
sector_t next;
- int nrblocks = mpd->b_size >> mpd->inode->i_blkbits;
+ int blkbits = mpd->inode->i_blkbits;
+ int nrblocks = mpd->b_size >> blkbits;
/*
* XXX Don't go larger than mballoc is willing to allocate
@@ -1707,11 +1695,11 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
* mpage_da_submit_io() into this function and then call
* ext4_map_blocks() multiple times in a loop
*/
- if (nrblocks >= 8*1024*1024/mpd->inode->i_sb->s_blocksize)
+ if (nrblocks >= (8*1024*1024 >> blkbits))
goto flush_it;
- /* check if thereserved journal credits might overflow */
- if (!(ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS))) {
+ /* check if the reserved journal credits might overflow */
+ if (!ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS)) {
if (nrblocks >= EXT4_MAX_TRANS_DATA) {
/*
* With non-extent format we are limited by the journal
@@ -1720,16 +1708,6 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
* nrblocks. So limit nrblocks.
*/
goto flush_it;
- } else if ((nrblocks + (b_size >> mpd->inode->i_blkbits)) >
- EXT4_MAX_TRANS_DATA) {
- /*
- * Adding the new buffer_head would make it cross the
- * allowed limit for which we have journal credit
- * reserved. So limit the new bh->b_size
- */
- b_size = (EXT4_MAX_TRANS_DATA - nrblocks) <<
- mpd->inode->i_blkbits;
- /* we will do mpage_da_submit_io in the next loop */
}
}
/*
@@ -1737,7 +1715,7 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
*/
if (mpd->b_size == 0) {
mpd->b_blocknr = logical;
- mpd->b_size = b_size;
+ mpd->b_size = 1 << blkbits;
mpd->b_state = b_state & BH_FLAGS;
return;
}
@@ -1747,7 +1725,7 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
* Can we merge the block to our big extent?
*/
if (logical == next && (b_state & BH_FLAGS) == mpd->b_state) {
- mpd->b_size += b_size;
+ mpd->b_size += 1 << blkbits;
return;
}
@@ -1775,6 +1753,7 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
struct ext4_map_blocks *map,
struct buffer_head *bh)
{
+ struct extent_status es;
int retval;
sector_t invalid_block = ~((sector_t) 0xffff);
@@ -1785,6 +1764,32 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
ext_debug("ext4_da_map_blocks(): inode %lu, max_blocks %u,"
"logical block %lu\n", inode->i_ino, map->m_len,
(unsigned long) map->m_lblk);
+
+ /* Look up the extent status tree first */
+ es.es_lblk = iblock;
+ if (ext4_es_lookup_extent(inode, &es)) {
+ map->m_pblk = es.es_pblk + iblock - es.es_lblk;
+ retval = es.es_len - (iblock - es.es_lblk);
+ if (retval > map->m_len)
+ retval = map->m_len;
+ map->m_len = retval;
+ if (ext4_es_is_written(&es)) {
+ map->m_flags |= EXT4_MAP_MAPPED;
+ } else if (ext4_es_is_unwritten(&es)) {
+ map->m_flags |= EXT4_MAP_UNWRITTEN;
+ } else if (ext4_es_is_delayed(&es)) {
+ map_bh(bh, inode->i_sb, invalid_block);
+ set_buffer_new(bh);
+ set_buffer_delay(bh);
+
+ return 0;
+ } else {
+ BUG_ON(1);
+ }
+
+ return retval;
+ }
+
/*
* Try to see if we can get the block without requesting a new
* file system block.
@@ -1798,9 +1803,6 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
* of mapping from cluster so that the reserved space
* is calculated properly.
*/
- if ((EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) &&
- ext4_find_delalloc_cluster(inode, map->m_lblk))
- map->m_flags |= EXT4_MAP_FROM_CLUSTER;
retval = 0;
} else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
retval = ext4_ext_map_blocks(NULL, inode, map, 0);
@@ -1808,31 +1810,42 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
retval = ext4_ind_map_blocks(NULL, inode, map, 0);
if (retval == 0) {
+ int ret;
/*
* XXX: __block_prepare_write() unmaps passed block,
* is it OK?
*/
/* If the block was allocated from previously allocated cluster,
* then we dont need to reserve it again. */
- if (!(map->m_flags & EXT4_MAP_FROM_CLUSTER)) {
- retval = ext4_da_reserve_space(inode, iblock);
- if (retval)
+ if ((EXT4_SB(inode->i_sb)->s_cluster_ratio == 1) ||
+ !ext4_find_delalloc_cluster(inode, map->m_lblk)) {
+ ret = ext4_da_reserve_space(inode, iblock);
+ if (ret) {
/* not enough space to reserve */
+ retval = ret;
goto out_unlock;
+ }
}
- retval = ext4_es_insert_extent(inode, map->m_lblk, map->m_len);
- if (retval)
+ ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
+ ~0, EXTENT_STATUS_DELAYED);
+ if (ret) {
+ retval = ret;
goto out_unlock;
-
- /* Clear EXT4_MAP_FROM_CLUSTER flag since its purpose is served
- * and it should not appear on the bh->b_state.
- */
- map->m_flags &= ~EXT4_MAP_FROM_CLUSTER;
+ }
map_bh(bh, inode->i_sb, invalid_block);
set_buffer_new(bh);
set_buffer_delay(bh);
+ } else if (retval > 0) {
+ int ret, status;
+
+ status = map->m_flags & EXT4_MAP_UNWRITTEN ?
+ EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
+ ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
+ map->m_pblk, status);
+ if (ret != 0)
+ retval = ret;
}
out_unlock:
@@ -1890,27 +1903,6 @@ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
return 0;
}
-/*
- * This function is used as a standard get_block_t calback function
- * when there is no desire to allocate any blocks. It is used as a
- * callback function for block_write_begin() and block_write_full_page().
- * These functions should only try to map a single block at a time.
- *
- * Since this function doesn't do block allocations even if the caller
- * requests it by passing in create=1, it is critically important that
- * any caller checks to make sure that any buffer heads are returned
- * by this function are either all already mapped or marked for
- * delayed allocation before calling block_write_full_page(). Otherwise,
- * b_blocknr could be left unitialized, and the page write functions will
- * be taken by surprise.
- */
-static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create)
-{
- BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
- return _ext4_get_block(inode, iblock, bh_result, 0);
-}
-
static int bget_one(handle_t *handle, struct buffer_head *bh)
{
get_bh(bh);
@@ -2035,11 +2027,12 @@ out:
static int ext4_writepage(struct page *page,
struct writeback_control *wbc)
{
- int ret = 0, commit_write = 0;
+ int ret = 0;
loff_t size;
unsigned int len;
struct buffer_head *page_bufs = NULL;
struct inode *inode = page->mapping->host;
+ struct ext4_io_submit io_submit;
trace_ext4_writepage(page);
size = i_size_read(inode);
@@ -2048,39 +2041,29 @@ static int ext4_writepage(struct page *page,
else
len = PAGE_CACHE_SIZE;
+ page_bufs = page_buffers(page);
/*
- * If the page does not have buffers (for whatever reason),
- * try to create them using __block_write_begin. If this
- * fails, redirty the page and move on.
+ * We cannot do block allocation or other extent handling in this
+ * function. If there are buffers needing that, we have to redirty
+ * the page. But we may reach here when we do a journal commit via
+ * journal_submit_inode_data_buffers() and in that case we must write
+ * allocated buffers to achieve data=ordered mode guarantees.
*/
- if (!page_has_buffers(page)) {
- if (__block_write_begin(page, 0, len,
- noalloc_get_block_write)) {
- redirty_page:
- redirty_page_for_writepage(wbc, page);
+ if (ext4_walk_page_buffers(NULL, page_bufs, 0, len, NULL,
+ ext4_bh_delay_or_unwritten)) {
+ redirty_page_for_writepage(wbc, page);
+ if (current->flags & PF_MEMALLOC) {
+ /*
+ * For memory cleaning there's no point in writing only
+ * some buffers. So just bail out. Warn if we came here
+ * from direct reclaim.
+ */
+ WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD))
+ == PF_MEMALLOC);
unlock_page(page);
return 0;
}
- commit_write = 1;
- }
- page_bufs = page_buffers(page);
- if (ext4_walk_page_buffers(NULL, page_bufs, 0, len, NULL,
- ext4_bh_delay_or_unwritten)) {
- /*
- * We don't want to do block allocation, so redirty
- * the page and return. We may reach here when we do
- * a journal commit via journal_submit_inode_data_buffers.
- * We can also reach here via shrink_page_list but it
- * should never be for direct reclaim so warn if that
- * happens
- */
- WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) ==
- PF_MEMALLOC);
- goto redirty_page;
}
- if (commit_write)
- /* now mark the buffer_heads as dirty and uptodate */
- block_commit_write(page, 0, len);
if (PageChecked(page) && ext4_should_journal_data(inode))
/*
@@ -2089,14 +2072,9 @@ static int ext4_writepage(struct page *page,
*/
return __ext4_journalled_writepage(page, len);
- if (buffer_uninit(page_bufs)) {
- ext4_set_bh_endio(page_bufs, inode);
- ret = block_write_full_page_endio(page, noalloc_get_block_write,
- wbc, ext4_end_io_buffer_write);
- } else
- ret = block_write_full_page(page, noalloc_get_block_write,
- wbc);
-
+ memset(&io_submit, 0, sizeof(io_submit));
+ ret = ext4_bio_write_page(&io_submit, page, len, wbc);
+ ext4_io_submit(&io_submit);
return ret;
}
@@ -2228,51 +2206,38 @@ static int write_cache_pages_da(handle_t *handle,
logical = (sector_t) page->index <<
(PAGE_CACHE_SHIFT - inode->i_blkbits);
- if (!page_has_buffers(page)) {
- mpage_add_bh_to_extent(mpd, logical,
- PAGE_CACHE_SIZE,
- (1 << BH_Dirty) | (1 << BH_Uptodate));
- if (mpd->io_done)
- goto ret_extent_tail;
- } else {
+ /* Add all dirty buffers to mpd */
+ head = page_buffers(page);
+ bh = head;
+ do {
+ BUG_ON(buffer_locked(bh));
/*
- * Page with regular buffer heads,
- * just add all dirty ones
+ * We need to try to allocate unmapped blocks
+ * in the same page. Otherwise we won't make
+ * progress with the page in ext4_writepage
*/
- head = page_buffers(page);
- bh = head;
- do {
- BUG_ON(buffer_locked(bh));
+ if (ext4_bh_delay_or_unwritten(NULL, bh)) {
+ mpage_add_bh_to_extent(mpd, logical,
+ bh->b_state);
+ if (mpd->io_done)
+ goto ret_extent_tail;
+ } else if (buffer_dirty(bh) &&
+ buffer_mapped(bh)) {
/*
- * We need to try to allocate
- * unmapped blocks in the same page.
- * Otherwise we won't make progress
- * with the page in ext4_writepage
+ * mapped dirty buffer. We need to
+ * update the b_state because we look
+ * at b_state in mpage_da_map_blocks.
+ * We don't update b_size because if we
+ * find an unmapped buffer_head later
+ * we need to use the b_state flag of
+ * that buffer_head.
*/
- if (ext4_bh_delay_or_unwritten(NULL, bh)) {
- mpage_add_bh_to_extent(mpd, logical,
- bh->b_size,
- bh->b_state);
- if (mpd->io_done)
- goto ret_extent_tail;
- } else if (buffer_dirty(bh) && (buffer_mapped(bh))) {
- /*
- * mapped dirty buffer. We need
- * to update the b_state
- * because we look at b_state
- * in mpage_da_map_blocks. We
- * don't update b_size because
- * if we find an unmapped
- * buffer_head later we need to
- * use the b_state flag of that
- * buffer_head.
- */
- if (mpd->b_size == 0)
- mpd->b_state = bh->b_state & BH_FLAGS;
- }
- logical++;
- } while ((bh = bh->b_this_page) != head);
- }
+ if (mpd->b_size == 0)
+ mpd->b_state =
+ bh->b_state & BH_FLAGS;
+ }
+ logical++;
+ } while ((bh = bh->b_this_page) != head);
if (nr_to_write > 0) {
nr_to_write--;
@@ -2858,36 +2823,10 @@ ext4_readpages(struct file *file, struct address_space *mapping,
return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
}
-static void ext4_invalidatepage_free_endio(struct page *page, unsigned long offset)
-{
- struct buffer_head *head, *bh;
- unsigned int curr_off = 0;
-
- if (!page_has_buffers(page))
- return;
- head = bh = page_buffers(page);
- do {
- if (offset <= curr_off && test_clear_buffer_uninit(bh)
- && bh->b_private) {
- ext4_free_io_end(bh->b_private);
- bh->b_private = NULL;
- bh->b_end_io = NULL;
- }
- curr_off = curr_off + bh->b_size;
- bh = bh->b_this_page;
- } while (bh != head);
-}
-
static void ext4_invalidatepage(struct page *page, unsigned long offset)
{
trace_ext4_invalidatepage(page, offset);
- /*
- * free any io_end structure allocated for buffers to be discarded
- */
- if (ext4_should_dioread_nolock(page->mapping->host))
- ext4_invalidatepage_free_endio(page, offset);
-
/* No journalling happens on data buffers when this function is used */
WARN_ON(page_has_buffers(page) && buffer_jbd(page_buffers(page)));
@@ -2977,9 +2916,9 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
ext4_free_io_end(io_end);
out:
+ inode_dio_done(inode);
if (is_async)
aio_complete(iocb, ret, 0);
- inode_dio_done(inode);
return;
}
@@ -2993,65 +2932,6 @@ out:
ext4_add_complete_io(io_end);
}
-static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
-{
- ext4_io_end_t *io_end = bh->b_private;
- struct inode *inode;
-
- if (!test_clear_buffer_uninit(bh) || !io_end)
- goto out;
-
- if (!(io_end->inode->i_sb->s_flags & MS_ACTIVE)) {
- ext4_msg(io_end->inode->i_sb, KERN_INFO,
- "sb umounted, discard end_io request for inode %lu",
- io_end->inode->i_ino);
- ext4_free_io_end(io_end);
- goto out;
- }
-
- /*
- * It may be over-defensive here to check EXT4_IO_END_UNWRITTEN now,
- * but being more careful is always safe for the future change.
- */
- inode = io_end->inode;
- ext4_set_io_unwritten_flag(inode, io_end);
- ext4_add_complete_io(io_end);
-out:
- bh->b_private = NULL;
- bh->b_end_io = NULL;
- clear_buffer_uninit(bh);
- end_buffer_async_write(bh, uptodate);
-}
-
-static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode)
-{
- ext4_io_end_t *io_end;
- struct page *page = bh->b_page;
- loff_t offset = (sector_t)page->index << PAGE_CACHE_SHIFT;
- size_t size = bh->b_size;
-
-retry:
- io_end = ext4_init_io_end(inode, GFP_ATOMIC);
- if (!io_end) {
- pr_warn_ratelimited("%s: allocation fail\n", __func__);
- schedule();
- goto retry;
- }
- io_end->offset = offset;
- io_end->size = size;
- /*
- * We need to hold a reference to the page to make sure it
- * doesn't get evicted before ext4_end_io_work() has a chance
- * to convert the extent from written to unwritten.
- */
- io_end->page = page;
- get_page(io_end->page);
-
- bh->b_private = io_end;
- bh->b_end_io = ext4_end_io_buffer_write;
- return 0;
-}
-
/*
* For ext4 extent files, ext4 will do direct-io write to holes,
* preallocated extents, and those write extend the file, no need to
@@ -3557,16 +3437,16 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
if (!S_ISREG(inode->i_mode))
return -EOPNOTSUPP;
- if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
- /* TODO: Add support for non extent hole punching */
- return -EOPNOTSUPP;
- }
+ if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ return ext4_ind_punch_hole(file, offset, length);
if (EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) {
/* TODO: Add support for bigalloc file systems */
return -EOPNOTSUPP;
}
+ trace_ext4_punch_hole(inode, offset, length);
+
return ext4_ext_punch_hole(file, offset, length);
}
@@ -3660,11 +3540,8 @@ static int __ext4_get_inode_loc(struct inode *inode,
iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb);
bh = sb_getblk(sb, block);
- if (!bh) {
- EXT4_ERROR_INODE_BLOCK(inode, block,
- "unable to read itable block");
- return -EIO;
- }
+ if (unlikely(!bh))
+ return -ENOMEM;
if (!buffer_uptodate(bh)) {
lock_buffer(bh);
@@ -3696,7 +3573,7 @@ static int __ext4_get_inode_loc(struct inode *inode,
/* Is the inode bitmap in cache? */
bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp));
- if (!bitmap_bh)
+ if (unlikely(!bitmap_bh))
goto make_io;
/*
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 5747f52f7c72..4784ac244fc6 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -313,6 +313,9 @@ mext_out:
if (err == 0)
err = err2;
mnt_drop_write_file(filp);
+ if (!err && ext4_has_group_desc_csum(sb) &&
+ test_opt(sb, INIT_INODE_TABLE))
+ err = ext4_register_li_request(sb, input.group);
group_add_out:
ext4_resize_end(sb);
return err;
@@ -358,6 +361,7 @@ group_add_out:
ext4_fsblk_t n_blocks_count;
struct super_block *sb = inode->i_sb;
int err = 0, err2 = 0;
+ ext4_group_t o_group = EXT4_SB(sb)->s_groups_count;
if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
@@ -388,6 +392,11 @@ group_add_out:
if (err == 0)
err = err2;
mnt_drop_write_file(filp);
+ if (!err && (o_group > EXT4_SB(sb)->s_groups_count) &&
+ ext4_has_group_desc_csum(sb) &&
+ test_opt(sb, INIT_INODE_TABLE))
+ err = ext4_register_li_request(sb, o_group);
+
resizefs_out:
ext4_resize_end(sb);
return err;
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index fe7c63f4717e..f9b551561d2c 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -80,6 +80,8 @@ static int read_mmp_block(struct super_block *sb, struct buffer_head **bh,
* is not blocked in the elevator. */
if (!*bh)
*bh = sb_getblk(sb, mmp_block);
+ if (!*bh)
+ return -ENOMEM;
if (*bh) {
get_bh(*bh);
lock_buffer(*bh);
@@ -91,7 +93,7 @@ static int read_mmp_block(struct super_block *sb, struct buffer_head **bh,
*bh = NULL;
}
}
- if (!*bh) {
+ if (unlikely(!*bh)) {
ext4_warning(sb, "Error while reading MMP block %llu",
mmp_block);
return -EIO;
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index d9cc5ee42f53..b9222c80f4f8 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -761,9 +761,6 @@ out:
kfree(donor_path);
}
- ext4_ext_invalidate_cache(orig_inode);
- ext4_ext_invalidate_cache(donor_inode);
-
return replaced_count;
}
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index f9ed946a448e..a0f13ee87336 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -47,7 +47,6 @@
#define NAMEI_RA_CHUNKS 2
#define NAMEI_RA_BLOCKS 4
#define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
-#define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b))
static struct buffer_head *ext4_append(handle_t *handle,
struct inode *inode,
@@ -714,7 +713,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
*err = ERR_BAD_DX_DIR;
goto fail2;
}
- at = entries = ((struct dx_node *) bh->b_data)->entries;
+ entries = ((struct dx_node *) bh->b_data)->entries;
if (!buffer_verified(bh) &&
!ext4_dx_csum_verify(dir,
@@ -837,6 +836,7 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash,
!ext4_dx_csum_verify(dir,
(struct ext4_dir_entry *)bh->b_data)) {
ext4_warning(dir->i_sb, "Node failed checksum");
+ brelse(bh);
return -EIO;
}
set_buffer_verified(bh);
@@ -877,8 +877,11 @@ static int htree_dirblock_to_tree(struct file *dir_file,
}
if (!buffer_verified(bh) &&
- !ext4_dirent_csum_verify(dir, (struct ext4_dir_entry *)bh->b_data))
+ !ext4_dirent_csum_verify(dir,
+ (struct ext4_dir_entry *)bh->b_data)) {
+ brelse(bh);
return -EIO;
+ }
set_buffer_verified(bh);
de = (struct ext4_dir_entry_2 *) bh->b_data;
@@ -1699,7 +1702,6 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
const char *name = dentry->d_name.name;
int namelen = dentry->d_name.len;
unsigned int blocksize = dir->i_sb->s_blocksize;
- unsigned short reclen;
int csum_size = 0;
int err;
@@ -1707,7 +1709,6 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
csum_size = sizeof(struct ext4_dir_entry_tail);
- reclen = EXT4_DIR_REC_LEN(namelen);
if (!de) {
err = ext4_find_dest_de(dir, inode,
bh, bh->b_data, blocksize - csum_size,
@@ -1929,8 +1930,10 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
}
if (!buffer_verified(bh) &&
!ext4_dirent_csum_verify(dir,
- (struct ext4_dir_entry *)bh->b_data))
+ (struct ext4_dir_entry *)bh->b_data)) {
+ brelse(bh);
return -EIO;
+ }
set_buffer_verified(bh);
retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
if (retval != -ENOSPC) {
@@ -2106,8 +2109,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
journal_error:
ext4_std_error(dir->i_sb, err);
cleanup:
- if (bh)
- brelse(bh);
+ brelse(bh);
dx_release(frames);
return err;
}
@@ -2492,6 +2494,7 @@ static int empty_dir(struct inode *inode)
(struct ext4_dir_entry *)bh->b_data)) {
EXT4_ERROR_INODE(inode, "checksum error reading directory "
"lblock 0");
+ brelse(bh);
return -EIO;
}
set_buffer_verified(bh);
@@ -2536,6 +2539,7 @@ static int empty_dir(struct inode *inode)
(struct ext4_dir_entry *)bh->b_data)) {
EXT4_ERROR_INODE(inode, "checksum error "
"reading directory lblock 0");
+ brelse(bh);
return -EIO;
}
set_buffer_verified(bh);
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 0016fbca2a40..809b31003ecc 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -23,6 +23,7 @@
#include <linux/workqueue.h>
#include <linux/kernel.h>
#include <linux/slab.h>
+#include <linux/mm.h>
#include "ext4_jbd2.h"
#include "xattr.h"
@@ -73,8 +74,6 @@ void ext4_free_io_end(ext4_io_end_t *io)
BUG_ON(!list_empty(&io->list));
BUG_ON(io->flag & EXT4_IO_END_UNWRITTEN);
- if (io->page)
- put_page(io->page);
for (i = 0; i < io->num_io_pages; i++)
put_io_page(io->pages[i]);
io->num_io_pages = 0;
@@ -103,14 +102,13 @@ static int ext4_end_io(ext4_io_end_t *io)
"(inode %lu, offset %llu, size %zd, error %d)",
inode->i_ino, offset, size, ret);
}
- if (io->iocb)
- aio_complete(io->iocb, io->result, 0);
-
- if (io->flag & EXT4_IO_END_DIRECT)
- inode_dio_done(inode);
/* Wake up anyone waiting on unwritten extent conversion */
if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten))
wake_up_all(ext4_ioend_wq(inode));
+ if (io->flag & EXT4_IO_END_DIRECT)
+ inode_dio_done(inode);
+ if (io->iocb)
+ aio_complete(io->iocb, io->result, 0);
return ret;
}
@@ -119,7 +117,6 @@ static void dump_completed_IO(struct inode *inode)
#ifdef EXT4FS_DEBUG
struct list_head *cur, *before, *after;
ext4_io_end_t *io, *io0, *io1;
- unsigned long flags;
if (list_empty(&EXT4_I(inode)->i_completed_io_list)) {
ext4_debug("inode %lu completed_io list is empty\n",
@@ -152,26 +149,20 @@ void ext4_add_complete_io(ext4_io_end_t *io_end)
wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
spin_lock_irqsave(&ei->i_completed_io_lock, flags);
- if (list_empty(&ei->i_completed_io_list)) {
- io_end->flag |= EXT4_IO_END_QUEUED;
- queue_work(wq, &io_end->work);
- }
+ if (list_empty(&ei->i_completed_io_list))
+ queue_work(wq, &ei->i_unwritten_work);
list_add_tail(&io_end->list, &ei->i_completed_io_list);
spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
}
-static int ext4_do_flush_completed_IO(struct inode *inode,
- ext4_io_end_t *work_io)
+static int ext4_do_flush_completed_IO(struct inode *inode)
{
ext4_io_end_t *io;
- struct list_head unwritten, complete, to_free;
+ struct list_head unwritten;
unsigned long flags;
struct ext4_inode_info *ei = EXT4_I(inode);
int err, ret = 0;
- INIT_LIST_HEAD(&complete);
- INIT_LIST_HEAD(&to_free);
-
spin_lock_irqsave(&ei->i_completed_io_lock, flags);
dump_completed_IO(inode);
list_replace_init(&ei->i_completed_io_list, &unwritten);
@@ -185,32 +176,7 @@ static int ext4_do_flush_completed_IO(struct inode *inode,
err = ext4_end_io(io);
if (unlikely(!ret && err))
ret = err;
-
- list_add_tail(&io->list, &complete);
- }
- spin_lock_irqsave(&ei->i_completed_io_lock, flags);
- while (!list_empty(&complete)) {
- io = list_entry(complete.next, ext4_io_end_t, list);
io->flag &= ~EXT4_IO_END_UNWRITTEN;
- /* end_io context can not be destroyed now because it still
- * used by queued worker. Worker thread will destroy it later */
- if (io->flag & EXT4_IO_END_QUEUED)
- list_del_init(&io->list);
- else
- list_move(&io->list, &to_free);
- }
- /* If we are called from worker context, it is time to clear queued
- * flag, and destroy it's end_io if it was converted already */
- if (work_io) {
- work_io->flag &= ~EXT4_IO_END_QUEUED;
- if (!(work_io->flag & EXT4_IO_END_UNWRITTEN))
- list_add_tail(&work_io->list, &to_free);
- }
- spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
-
- while (!list_empty(&to_free)) {
- io = list_entry(to_free.next, ext4_io_end_t, list);
- list_del_init(&io->list);
ext4_free_io_end(io);
}
return ret;
@@ -219,10 +185,11 @@ static int ext4_do_flush_completed_IO(struct inode *inode,
/*
* work on completed aio dio IO, to convert unwritten extents to extents
*/
-static void ext4_end_io_work(struct work_struct *work)
+void ext4_end_io_work(struct work_struct *work)
{
- ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
- ext4_do_flush_completed_IO(io->inode, io);
+ struct ext4_inode_info *ei = container_of(work, struct ext4_inode_info,
+ i_unwritten_work);
+ ext4_do_flush_completed_IO(&ei->vfs_inode);
}
int ext4_flush_unwritten_io(struct inode *inode)
@@ -230,7 +197,7 @@ int ext4_flush_unwritten_io(struct inode *inode)
int ret;
WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex) &&
!(inode->i_state & I_FREEING));
- ret = ext4_do_flush_completed_IO(inode, NULL);
+ ret = ext4_do_flush_completed_IO(inode);
ext4_unwritten_wait(inode);
return ret;
}
@@ -241,7 +208,6 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
if (io) {
atomic_inc(&EXT4_I(inode)->i_ioend_count);
io->inode = inode;
- INIT_WORK(&io->work, ext4_end_io_work);
INIT_LIST_HEAD(&io->list);
}
return io;
@@ -382,14 +348,6 @@ static int io_submit_add_bh(struct ext4_io_submit *io,
unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
}
- if (!buffer_mapped(bh) || buffer_delay(bh)) {
- if (!buffer_mapped(bh))
- clear_buffer_dirty(bh);
- if (io->io_bio)
- ext4_io_submit(io);
- return 0;
- }
-
if (io->io_bio && bh->b_blocknr != io->io_next_block) {
submit_and_retry:
ext4_io_submit(io);
@@ -436,7 +394,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
io_page = kmem_cache_alloc(io_page_cachep, GFP_NOFS);
if (!io_page) {
- set_page_dirty(page);
+ redirty_page_for_writepage(wbc, page);
unlock_page(page);
return -ENOMEM;
}
@@ -468,7 +426,15 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
set_buffer_uptodate(bh);
continue;
}
- clear_buffer_dirty(bh);
+ if (!buffer_dirty(bh) || buffer_delay(bh) ||
+ !buffer_mapped(bh) || buffer_unwritten(bh)) {
+ /* A hole? We can safely clear the dirty bit */
+ if (!buffer_mapped(bh))
+ clear_buffer_dirty(bh);
+ if (io->io_bio)
+ ext4_io_submit(io);
+ continue;
+ }
ret = io_submit_add_bh(io, io_page, inode, wbc, bh);
if (ret) {
/*
@@ -476,9 +442,10 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
* we can do but mark the page as dirty, and
* better luck next time.
*/
- set_page_dirty(page);
+ redirty_page_for_writepage(wbc, page);
break;
}
+ clear_buffer_dirty(bh);
}
unlock_page(page);
/*
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index d99387b89edd..8eefb636beb8 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -333,8 +333,8 @@ static struct buffer_head *bclean(handle_t *handle, struct super_block *sb,
int err;
bh = sb_getblk(sb, blk);
- if (!bh)
- return ERR_PTR(-EIO);
+ if (unlikely(!bh))
+ return ERR_PTR(-ENOMEM);
if ((err = ext4_journal_get_write_access(handle, bh))) {
brelse(bh);
bh = ERR_PTR(err);
@@ -410,8 +410,8 @@ static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle,
return err;
bh = sb_getblk(sb, flex_gd->groups[group].block_bitmap);
- if (!bh)
- return -EIO;
+ if (unlikely(!bh))
+ return -ENOMEM;
err = ext4_journal_get_write_access(handle, bh);
if (err)
@@ -500,8 +500,8 @@ static int setup_new_flex_group_blocks(struct super_block *sb,
goto out;
gdb = sb_getblk(sb, block);
- if (!gdb) {
- err = -EIO;
+ if (unlikely(!gdb)) {
+ err = -ENOMEM;
goto out;
}
@@ -1064,8 +1064,8 @@ static void update_backups(struct super_block *sb, int blk_off, char *data,
ext4_bg_has_super(sb, group));
bh = sb_getblk(sb, backup_block);
- if (!bh) {
- err = -EIO;
+ if (unlikely(!bh)) {
+ err = -ENOMEM;
break;
}
ext4_debug("update metadata backup %llu(+%llu)\n",
@@ -1168,7 +1168,7 @@ static int ext4_add_new_descs(handle_t *handle, struct super_block *sb,
static struct buffer_head *ext4_get_bitmap(struct super_block *sb, __u64 block)
{
struct buffer_head *bh = sb_getblk(sb, block);
- if (!bh)
+ if (unlikely(!bh))
return NULL;
if (!bh_uptodate_or_lock(bh)) {
if (bh_submit_read(bh) < 0) {
@@ -1506,10 +1506,12 @@ static int ext4_setup_next_flex_gd(struct super_block *sb,
group_data[i].blocks_count = blocks_per_group;
overhead = ext4_group_overhead_blocks(sb, group + i);
group_data[i].free_blocks_count = blocks_per_group - overhead;
- if (ext4_has_group_desc_csum(sb))
+ if (ext4_has_group_desc_csum(sb)) {
flex_gd->bg_flags[i] = EXT4_BG_BLOCK_UNINIT |
EXT4_BG_INODE_UNINIT;
- else
+ if (!test_opt(sb, INIT_INODE_TABLE))
+ flex_gd->bg_flags[i] |= EXT4_BG_INODE_ZEROED;
+ } else
flex_gd->bg_flags[i] = EXT4_BG_INODE_ZEROED;
}
@@ -1594,7 +1596,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
err = ext4_alloc_flex_bg_array(sb, input->group + 1);
if (err)
- return err;
+ goto out;
err = ext4_mb_alloc_groupinfo(sb, input->group + 1);
if (err)
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 3d4fb81bacd5..dc20e4d95df1 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -858,6 +858,7 @@ static void ext4_put_super(struct super_block *sb)
ext4_abort(sb, "Couldn't clean up the journal");
}
+ ext4_es_unregister_shrinker(sb);
del_timer(&sbi->s_err_report);
ext4_release_system_zone(sb);
ext4_mb_release(sb);
@@ -939,11 +940,12 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
return NULL;
ei->vfs_inode.i_version = 1;
- memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
INIT_LIST_HEAD(&ei->i_prealloc_list);
spin_lock_init(&ei->i_prealloc_lock);
ext4_es_init_tree(&ei->i_es_tree);
rwlock_init(&ei->i_es_lock);
+ INIT_LIST_HEAD(&ei->i_es_lru);
+ ei->i_es_lru_nr = 0;
ei->i_reserved_data_blocks = 0;
ei->i_reserved_meta_blocks = 0;
ei->i_allocated_meta_blocks = 0;
@@ -960,6 +962,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
ei->i_datasync_tid = 0;
atomic_set(&ei->i_ioend_count, 0);
atomic_set(&ei->i_unwritten, 0);
+ INIT_WORK(&ei->i_unwritten_work, ext4_end_io_work);
return &ei->vfs_inode;
}
@@ -1031,6 +1034,7 @@ void ext4_clear_inode(struct inode *inode)
dquot_drop(inode);
ext4_discard_preallocations(inode);
ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS);
+ ext4_es_lru_del(inode);
if (EXT4_I(inode)->jinode) {
jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode),
EXT4_I(inode)->jinode);
@@ -1280,8 +1284,8 @@ static const match_table_t tokens = {
{Opt_stripe, "stripe=%u"},
{Opt_delalloc, "delalloc"},
{Opt_nodelalloc, "nodelalloc"},
- {Opt_mblk_io_submit, "mblk_io_submit"},
- {Opt_nomblk_io_submit, "nomblk_io_submit"},
+ {Opt_removed, "mblk_io_submit"},
+ {Opt_removed, "nomblk_io_submit"},
{Opt_block_validity, "block_validity"},
{Opt_noblock_validity, "noblock_validity"},
{Opt_inode_readahead_blks, "inode_readahead_blks=%u"},
@@ -1337,6 +1341,7 @@ static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
char *qname;
+ int ret = -1;
if (sb_any_quota_loaded(sb) &&
!sbi->s_qf_names[qtype]) {
@@ -1351,23 +1356,26 @@ static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)
"Not enough memory for storing quotafile name");
return -1;
}
- if (sbi->s_qf_names[qtype] &&
- strcmp(sbi->s_qf_names[qtype], qname)) {
- ext4_msg(sb, KERN_ERR,
- "%s quota file already specified", QTYPE2NAME(qtype));
- kfree(qname);
- return -1;
+ if (sbi->s_qf_names[qtype]) {
+ if (strcmp(sbi->s_qf_names[qtype], qname) == 0)
+ ret = 1;
+ else
+ ext4_msg(sb, KERN_ERR,
+ "%s quota file already specified",
+ QTYPE2NAME(qtype));
+ goto errout;
}
- sbi->s_qf_names[qtype] = qname;
- if (strchr(sbi->s_qf_names[qtype], '/')) {
+ if (strchr(qname, '/')) {
ext4_msg(sb, KERN_ERR,
"quotafile must be on filesystem root");
- kfree(sbi->s_qf_names[qtype]);
- sbi->s_qf_names[qtype] = NULL;
- return -1;
+ goto errout;
}
+ sbi->s_qf_names[qtype] = qname;
set_opt(sb, QUOTA);
return 1;
+errout:
+ kfree(qname);
+ return ret;
}
static int clear_qf_name(struct super_block *sb, int qtype)
@@ -1381,10 +1389,7 @@ static int clear_qf_name(struct super_block *sb, int qtype)
" when quota turned on");
return -1;
}
- /*
- * The space will be released later when all options are confirmed
- * to be correct
- */
+ kfree(sbi->s_qf_names[qtype]);
sbi->s_qf_names[qtype] = NULL;
return 1;
}
@@ -1414,8 +1419,6 @@ static const struct mount_opts {
{Opt_bsd_df, EXT4_MOUNT_MINIX_DF, MOPT_CLEAR},
{Opt_grpid, EXT4_MOUNT_GRPID, MOPT_SET},
{Opt_nogrpid, EXT4_MOUNT_GRPID, MOPT_CLEAR},
- {Opt_mblk_io_submit, EXT4_MOUNT_MBLK_IO_SUBMIT, MOPT_SET},
- {Opt_nomblk_io_submit, EXT4_MOUNT_MBLK_IO_SUBMIT, MOPT_CLEAR},
{Opt_block_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_SET},
{Opt_noblock_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_CLEAR},
{Opt_dioread_nolock, EXT4_MOUNT_DIOREAD_NOLOCK, MOPT_SET},
@@ -2776,7 +2779,7 @@ static int ext4_run_li_request(struct ext4_li_request *elr)
break;
}
- if (group == ngroups)
+ if (group >= ngroups)
ret = 1;
if (!ret) {
@@ -3016,33 +3019,34 @@ static struct ext4_li_request *ext4_li_request_new(struct super_block *sb,
return elr;
}
-static int ext4_register_li_request(struct super_block *sb,
- ext4_group_t first_not_zeroed)
+int ext4_register_li_request(struct super_block *sb,
+ ext4_group_t first_not_zeroed)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct ext4_li_request *elr;
+ struct ext4_li_request *elr = NULL;
ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
int ret = 0;
+ mutex_lock(&ext4_li_mtx);
if (sbi->s_li_request != NULL) {
/*
* Reset timeout so it can be computed again, because
* s_li_wait_mult might have changed.
*/
sbi->s_li_request->lr_timeout = 0;
- return 0;
+ goto out;
}
if (first_not_zeroed == ngroups ||
(sb->s_flags & MS_RDONLY) ||
!test_opt(sb, INIT_INODE_TABLE))
- return 0;
+ goto out;
elr = ext4_li_request_new(sb, first_not_zeroed);
- if (!elr)
- return -ENOMEM;
-
- mutex_lock(&ext4_li_mtx);
+ if (!elr) {
+ ret = -ENOMEM;
+ goto out;
+ }
if (NULL == ext4_li_info) {
ret = ext4_li_info_new();
@@ -3379,7 +3383,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
#ifdef CONFIG_EXT4_FS_POSIX_ACL
set_opt(sb, POSIX_ACL);
#endif
- set_opt(sb, MBLK_IO_SUBMIT);
if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA)
set_opt(sb, JOURNAL_DATA);
else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED)
@@ -3772,6 +3775,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_max_writeback_mb_bump = 128;
sbi->s_extent_max_zeroout_kb = 32;
+ /* Register extent status tree shrinker */
+ ext4_es_register_shrinker(sb);
+
/*
* set up enough so that it can read an inode
*/
@@ -4008,7 +4014,7 @@ no_journal:
!(sb->s_flags & MS_RDONLY)) {
err = ext4_enable_quotas(sb);
if (err)
- goto failed_mount7;
+ goto failed_mount8;
}
#endif /* CONFIG_QUOTA */
@@ -4035,6 +4041,10 @@ cantfind_ext4:
ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem");
goto failed_mount;
+#ifdef CONFIG_QUOTA
+failed_mount8:
+ kobject_del(&sbi->s_kobj);
+#endif
failed_mount7:
ext4_unregister_li_request(sb);
failed_mount6:
@@ -4476,16 +4486,12 @@ static void ext4_clear_journal_err(struct super_block *sb,
int ext4_force_commit(struct super_block *sb)
{
journal_t *journal;
- int ret = 0;
if (sb->s_flags & MS_RDONLY)
return 0;
journal = EXT4_SB(sb)->s_journal;
- if (journal)
- ret = ext4_journal_force_commit(journal);
-
- return ret;
+ return ext4_journal_force_commit(journal);
}
static int ext4_sync_fs(struct super_block *sb, int wait)
@@ -4588,7 +4594,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
int err = 0;
#ifdef CONFIG_QUOTA
- int i;
+ int i, j;
#endif
char *orig_data = kstrdup(data, GFP_KERNEL);
@@ -4604,7 +4610,16 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
#ifdef CONFIG_QUOTA
old_opts.s_jquota_fmt = sbi->s_jquota_fmt;
for (i = 0; i < MAXQUOTAS; i++)
- old_opts.s_qf_names[i] = sbi->s_qf_names[i];
+ if (sbi->s_qf_names[i]) {
+ old_opts.s_qf_names[i] = kstrdup(sbi->s_qf_names[i],
+ GFP_KERNEL);
+ if (!old_opts.s_qf_names[i]) {
+ for (j = 0; j < i; j++)
+ kfree(old_opts.s_qf_names[j]);
+ return -ENOMEM;
+ }
+ } else
+ old_opts.s_qf_names[i] = NULL;
#endif
if (sbi->s_journal && sbi->s_journal->j_task->io_context)
journal_ioprio = sbi->s_journal->j_task->io_context->ioprio;
@@ -4737,9 +4752,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
#ifdef CONFIG_QUOTA
/* Release old quota file names */
for (i = 0; i < MAXQUOTAS; i++)
- if (old_opts.s_qf_names[i] &&
- old_opts.s_qf_names[i] != sbi->s_qf_names[i])
- kfree(old_opts.s_qf_names[i]);
+ kfree(old_opts.s_qf_names[i]);
if (enable_quota) {
if (sb_any_quota_suspended(sb))
dquot_resume(sb, -1);
@@ -4768,9 +4781,7 @@ restore_opts:
#ifdef CONFIG_QUOTA
sbi->s_jquota_fmt = old_opts.s_jquota_fmt;
for (i = 0; i < MAXQUOTAS; i++) {
- if (sbi->s_qf_names[i] &&
- old_opts.s_qf_names[i] != sbi->s_qf_names[i])
- kfree(sbi->s_qf_names[i]);
+ kfree(sbi->s_qf_names[i]);
sbi->s_qf_names[i] = old_opts.s_qf_names[i];
}
#endif
@@ -5005,9 +5016,9 @@ static int ext4_enable_quotas(struct super_block *sb)
DQUOT_USAGE_ENABLED);
if (err) {
ext4_warning(sb,
- "Failed to enable quota (type=%d) "
- "tracking. Please run e2fsck to fix.",
- type);
+ "Failed to enable quota tracking "
+ "(type=%d, err=%d). Please run "
+ "e2fsck to fix.", type, err);
return err;
}
}
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 3a91ebc2b66f..c68990c392c7 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -886,17 +886,18 @@ inserted:
(unsigned long long)block);
new_bh = sb_getblk(sb, block);
- if (!new_bh) {
+ if (unlikely(!new_bh)) {
+ error = -ENOMEM;
getblk_failed:
ext4_free_blocks(handle, inode, NULL, block, 1,
EXT4_FREE_BLOCKS_METADATA);
- error = -EIO;
goto cleanup;
}
lock_buffer(new_bh);
error = ext4_journal_get_create_access(handle, new_bh);
if (error) {
unlock_buffer(new_bh);
+ error = -EIO;
goto getblk_failed;
}
memcpy(new_bh->b_data, s->base, new_bh->b_size);
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index ff3c8439af87..d3b34d05211f 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -72,22 +72,22 @@ static int f2fs_write_meta_page(struct page *page,
{
struct inode *inode = page->mapping->host;
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
- int err;
- wait_on_page_writeback(page);
-
- err = write_meta_page(sbi, page, wbc);
- if (err) {
+ /* Should not write any meta pages if any IO error has occurred */
+ if (wbc->for_reclaim ||
+ is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ERROR_FLAG)) {
+ dec_page_count(sbi, F2FS_DIRTY_META);
wbc->pages_skipped++;
set_page_dirty(page);
+ return AOP_WRITEPAGE_ACTIVATE;
}
- dec_page_count(sbi, F2FS_DIRTY_META);
+ wait_on_page_writeback(page);
- /* In this case, we should not unlock this page */
- if (err != AOP_WRITEPAGE_ACTIVATE)
- unlock_page(page);
- return err;
+ write_meta_page(sbi, page);
+ dec_page_count(sbi, F2FS_DIRTY_META);
+ unlock_page(page);
+ return 0;
}
static int f2fs_write_meta_pages(struct address_space *mapping,
@@ -138,7 +138,10 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type,
BUG_ON(page->mapping != mapping);
BUG_ON(!PageDirty(page));
clear_page_dirty_for_io(page);
- f2fs_write_meta_page(page, &wbc);
+ if (f2fs_write_meta_page(page, &wbc)) {
+ unlock_page(page);
+ break;
+ }
if (nwritten++ >= nr_to_write)
break;
}
@@ -216,19 +219,11 @@ retry:
new->ino = ino;
/* add new_oentry into list which is sorted by inode number */
- if (orphan) {
- struct orphan_inode_entry *prev;
-
- /* get previous entry */
- prev = list_entry(orphan->list.prev, typeof(*prev), list);
- if (&prev->list != head)
- /* insert new orphan inode entry */
- list_add(&new->list, &prev->list);
- else
- list_add(&new->list, head);
- } else {
+ if (orphan)
+ list_add(&new->list, this->prev);
+ else
list_add_tail(&new->list, head);
- }
+
sbi->n_orphans++;
out:
mutex_unlock(&sbi->orphan_inode_mutex);
@@ -717,13 +712,12 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
sbi->alloc_valid_block_count = 0;
/* Here, we only have one bio having CP pack */
- if (is_set_ckpt_flags(ckpt, CP_ERROR_FLAG))
- sbi->sb->s_flags |= MS_RDONLY;
- else
- sync_meta_pages(sbi, META_FLUSH, LONG_MAX);
+ sync_meta_pages(sbi, META_FLUSH, LONG_MAX);
- clear_prefree_segments(sbi);
- F2FS_RESET_SB_DIRT(sbi);
+ if (!is_set_ckpt_flags(ckpt, CP_ERROR_FLAG)) {
+ clear_prefree_segments(sbi);
+ F2FS_RESET_SB_DIRT(sbi);
+ }
}
/*
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index c8e2d751ef9c..5022a7d7f7ca 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -141,7 +141,7 @@ struct f2fs_inode_info {
/* Use below internally in f2fs*/
unsigned long flags; /* use to pass per-file flags */
- unsigned long long data_version;/* lastes version of data for fsync */
+ unsigned long long data_version;/* latest version of data for fsync */
atomic_t dirty_dents; /* # of dirty dentry pages */
f2fs_hash_t chash; /* hash value of given file name */
unsigned int clevel; /* maximum level of given file name */
@@ -847,7 +847,6 @@ long f2fs_ioctl(struct file *, unsigned int, unsigned long);
* inode.c
*/
void f2fs_set_inode_flags(struct inode *);
-struct inode *f2fs_iget_nowait(struct super_block *, unsigned long);
struct inode *f2fs_iget(struct super_block *, unsigned long);
void update_inode(struct inode *, struct page *);
int f2fs_write_inode(struct inode *, struct writeback_control *);
@@ -929,8 +928,7 @@ void allocate_new_segments(struct f2fs_sb_info *);
struct page *get_sum_page(struct f2fs_sb_info *, unsigned int);
struct bio *f2fs_bio_alloc(struct block_device *, int);
void f2fs_submit_bio(struct f2fs_sb_info *, enum page_type, bool sync);
-int write_meta_page(struct f2fs_sb_info *, struct page *,
- struct writeback_control *);
+void write_meta_page(struct f2fs_sb_info *, struct page *);
void write_node_page(struct f2fs_sb_info *, struct page *, unsigned int,
block_t, block_t *);
void write_data_page(struct inode *, struct page *, struct dnode_of_data*,
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 3191b52aafb0..33d1736ee5f9 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -157,11 +157,11 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
if (!S_ISREG(inode->i_mode) || inode->i_nlink != 1)
need_cp = true;
- if (is_inode_flag_set(F2FS_I(inode), FI_NEED_CP))
+ else if (is_inode_flag_set(F2FS_I(inode), FI_NEED_CP))
need_cp = true;
- if (!space_for_roll_forward(sbi))
+ else if (!space_for_roll_forward(sbi))
need_cp = true;
- if (need_to_sync_dir(sbi, inode))
+ else if (need_to_sync_dir(sbi, inode))
need_cp = true;
if (need_cp) {
@@ -298,8 +298,6 @@ void f2fs_truncate(struct inode *inode)
inode->i_mtime = inode->i_ctime = CURRENT_TIME;
mark_inode_dirty(inode);
}
-
- f2fs_balance_fs(F2FS_SB(inode->i_sb));
}
static int f2fs_getattr(struct vfsmount *mnt,
@@ -356,6 +354,7 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
attr->ia_size != i_size_read(inode)) {
truncate_setsize(inode, attr->ia_size);
f2fs_truncate(inode);
+ f2fs_balance_fs(F2FS_SB(inode->i_sb));
}
__setattr_copy(inode, attr);
@@ -387,12 +386,17 @@ const struct inode_operations f2fs_file_inode_operations = {
static void fill_zero(struct inode *inode, pgoff_t index,
loff_t start, loff_t len)
{
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
struct page *page;
if (!len)
return;
+ f2fs_balance_fs(sbi);
+
+ mutex_lock_op(sbi, DATA_NEW);
page = get_new_data_page(inode, index, false);
+ mutex_unlock_op(sbi, DATA_NEW);
if (!IS_ERR(page)) {
wait_on_page_writeback(page);
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index c386910dacc5..375e69e2c6f1 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -44,6 +44,11 @@ static int gc_thread_func(void *data)
if (kthread_should_stop())
break;
+ if (sbi->sb->s_writers.frozen >= SB_FREEZE_WRITE) {
+ wait_ms = GC_THREAD_MAX_SLEEP_TIME;
+ continue;
+ }
+
f2fs_balance_fs(sbi);
if (!test_opt(sbi, BG_GC))
@@ -574,7 +579,7 @@ next_step:
ofs_in_node = le16_to_cpu(entry->ofs_in_node);
if (phase == 2) {
- inode = f2fs_iget_nowait(sb, dni.ino);
+ inode = f2fs_iget(sb, dni.ino);
if (IS_ERR(inode))
continue;
@@ -667,7 +672,7 @@ gc_more:
if (!(sbi->sb->s_flags & MS_ACTIVE))
goto stop;
- if (has_not_enough_free_secs(sbi))
+ if (gc_type == BG_GC && has_not_enough_free_secs(sbi))
gc_type = FG_GC;
if (!__get_victim(sbi, &segno, gc_type, NO_CHECK_TYPE))
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 794241777322..ddae412d30c8 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -16,11 +16,6 @@
#include "f2fs.h"
#include "node.h"
-struct f2fs_iget_args {
- u64 ino;
- int on_free;
-};
-
void f2fs_set_inode_flags(struct inode *inode)
{
unsigned int flags = F2FS_I(inode)->i_flags;
@@ -40,34 +35,6 @@ void f2fs_set_inode_flags(struct inode *inode)
inode->i_flags |= S_DIRSYNC;
}
-static int f2fs_iget_test(struct inode *inode, void *data)
-{
- struct f2fs_iget_args *args = data;
-
- if (inode->i_ino != args->ino)
- return 0;
- if (inode->i_state & (I_FREEING | I_WILL_FREE)) {
- args->on_free = 1;
- return 0;
- }
- return 1;
-}
-
-struct inode *f2fs_iget_nowait(struct super_block *sb, unsigned long ino)
-{
- struct f2fs_iget_args args = {
- .ino = ino,
- .on_free = 0
- };
- struct inode *inode = ilookup5(sb, ino, f2fs_iget_test, &args);
-
- if (inode)
- return inode;
- if (!args.on_free)
- return f2fs_iget(sb, ino);
- return ERR_PTR(-ENOENT);
-}
-
static int do_read_inode(struct inode *inode)
{
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
@@ -100,6 +67,10 @@ static int do_read_inode(struct inode *inode)
inode->i_ctime.tv_nsec = le32_to_cpu(ri->i_ctime_nsec);
inode->i_mtime.tv_nsec = le32_to_cpu(ri->i_mtime_nsec);
inode->i_generation = le32_to_cpu(ri->i_generation);
+ if (ri->i_addr[0])
+ inode->i_rdev = old_decode_dev(le32_to_cpu(ri->i_addr[0]));
+ else
+ inode->i_rdev = new_decode_dev(le32_to_cpu(ri->i_addr[1]));
fi->i_current_depth = le32_to_cpu(ri->i_current_depth);
fi->i_xattr_nid = le32_to_cpu(ri->i_xattr_nid);
@@ -203,6 +174,20 @@ void update_inode(struct inode *inode, struct page *node_page)
ri->i_flags = cpu_to_le32(F2FS_I(inode)->i_flags);
ri->i_pino = cpu_to_le32(F2FS_I(inode)->i_pino);
ri->i_generation = cpu_to_le32(inode->i_generation);
+
+ if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
+ if (old_valid_dev(inode->i_rdev)) {
+ ri->i_addr[0] =
+ cpu_to_le32(old_encode_dev(inode->i_rdev));
+ ri->i_addr[1] = 0;
+ } else {
+ ri->i_addr[0] = 0;
+ ri->i_addr[1] =
+ cpu_to_le32(new_encode_dev(inode->i_rdev));
+ ri->i_addr[2] = 0;
+ }
+ }
+
set_cold_node(inode, node_page);
set_page_dirty(node_page);
}
@@ -260,6 +245,7 @@ void f2fs_evict_inode(struct inode *inode)
if (inode->i_nlink || is_bad_inode(inode))
goto no_delete;
+ sb_start_intwrite(inode->i_sb);
set_inode_flag(F2FS_I(inode), FI_NO_ALLOC);
i_size_write(inode, 0);
@@ -267,6 +253,7 @@ void f2fs_evict_inode(struct inode *inode)
f2fs_truncate(inode);
remove_inode_page(inode);
+ sb_end_intwrite(inode->i_sb);
no_delete:
clear_inode(inode);
}
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 9bda63c9c166..33fa6d506d94 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -104,7 +104,7 @@ static void ra_nat_pages(struct f2fs_sb_info *sbi, int nid)
f2fs_put_page(page, 1);
continue;
}
- page_cache_release(page);
+ f2fs_put_page(page, 0);
}
}
@@ -874,15 +874,11 @@ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid)
return;
if (read_node_page(apage, READA))
- goto unlock_out;
+ unlock_page(apage);
- page_cache_release(apage);
- return;
-
-unlock_out:
- unlock_page(apage);
release_out:
- page_cache_release(apage);
+ f2fs_put_page(apage, 0);
+ return;
}
struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid)
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index f42e4060b399..e2a3e1a8eae9 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -226,7 +226,7 @@ static void check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
f2fs_put_page(node_page, 1);
/* Deallocate previous index in the node page */
- inode = f2fs_iget_nowait(sbi->sb, ino);
+ inode = f2fs_iget(sbi->sb, ino);
if (IS_ERR(inode))
return;
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 4b0099066582..7aa270f3538a 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -600,6 +600,7 @@ static void f2fs_end_io_write(struct bio *bio, int err)
if (page->mapping)
set_bit(AS_EIO, &page->mapping->flags);
set_ckpt_flags(p->sbi->ckpt, CP_ERROR_FLAG);
+ p->sbi->sb->s_flags |= MS_RDONLY;
}
end_page_writeback(page);
dec_page_count(p->sbi, F2FS_WRITEBACK);
@@ -815,15 +816,10 @@ static void do_write_page(struct f2fs_sb_info *sbi, struct page *page,
mutex_unlock(&curseg->curseg_mutex);
}
-int write_meta_page(struct f2fs_sb_info *sbi, struct page *page,
- struct writeback_control *wbc)
+void write_meta_page(struct f2fs_sb_info *sbi, struct page *page)
{
- if (wbc->for_reclaim)
- return AOP_WRITEPAGE_ACTIVATE;
-
set_page_writeback(page);
submit_write_page(sbi, page, page->index, META);
- return 0;
}
void write_node_page(struct f2fs_sb_info *sbi, struct page *page,
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 37fad04c8669..0b18aee2ed25 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -143,6 +143,22 @@ int f2fs_sync_fs(struct super_block *sb, int sync)
return 0;
}
+static int f2fs_freeze(struct super_block *sb)
+{
+ int err;
+
+ if (sb->s_flags & MS_RDONLY)
+ return 0;
+
+ err = f2fs_sync_fs(sb, 1);
+ return err;
+}
+
+static int f2fs_unfreeze(struct super_block *sb)
+{
+ return 0;
+}
+
static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct super_block *sb = dentry->d_sb;
@@ -198,7 +214,7 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
seq_puts(seq, ",noacl");
#endif
if (test_opt(sbi, DISABLE_EXT_IDENTIFY))
- seq_puts(seq, ",disable_ext_indentify");
+ seq_puts(seq, ",disable_ext_identify");
seq_printf(seq, ",active_logs=%u", sbi->active_logs);
@@ -213,6 +229,8 @@ static struct super_operations f2fs_sops = {
.evict_inode = f2fs_evict_inode,
.put_super = f2fs_put_super,
.sync_fs = f2fs_sync_fs,
+ .freeze_fs = f2fs_freeze,
+ .unfreeze_fs = f2fs_unfreeze,
.statfs = f2fs_statfs,
};
@@ -387,10 +405,11 @@ static int sanity_check_raw_super(struct super_block *sb,
return 0;
}
-static int sanity_check_ckpt(struct f2fs_super_block *raw_super,
- struct f2fs_checkpoint *ckpt)
+static int sanity_check_ckpt(struct f2fs_sb_info *sbi)
{
unsigned int total, fsmeta;
+ struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi);
+ struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
total = le32_to_cpu(raw_super->segment_count);
fsmeta = le32_to_cpu(raw_super->segment_count_ckpt);
@@ -401,6 +420,11 @@ static int sanity_check_ckpt(struct f2fs_super_block *raw_super,
if (fsmeta >= total)
return 1;
+
+ if (is_set_ckpt_flags(ckpt, CP_ERROR_FLAG)) {
+ f2fs_msg(sbi->sb, KERN_ERR, "A bug case: need to run fsck");
+ return 1;
+ }
return 0;
}
@@ -525,7 +549,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
/* sanity checking of checkpoint */
err = -EINVAL;
- if (sanity_check_ckpt(raw_super, sbi->ckpt)) {
+ if (sanity_check_ckpt(sbi)) {
f2fs_msg(sb, KERN_ERR, "Invalid F2FS checkpoint");
goto free_cp;
}
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index b906ed17a839..9802de0f85e6 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -281,6 +281,7 @@ static void gdlm_put_lock(struct gfs2_glock *gl)
{
struct gfs2_sbd *sdp = gl->gl_sbd;
struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+ int lvb_needs_unlock = 0;
int error;
if (gl->gl_lksb.sb_lkid == 0) {
@@ -294,8 +295,12 @@ static void gdlm_put_lock(struct gfs2_glock *gl)
gfs2_update_request_times(gl);
/* don't want to skip dlm_unlock writing the lvb when lock is ex */
+
+ if (gl->gl_lksb.sb_lvbptr && (gl->gl_state == LM_ST_EXCLUSIVE))
+ lvb_needs_unlock = 1;
+
if (test_bit(SDF_SKIP_DLM_UNLOCK, &sdp->sd_flags) &&
- gl->gl_lksb.sb_lvbptr && (gl->gl_state != LM_ST_EXCLUSIVE)) {
+ !lvb_needs_unlock) {
gfs2_glock_free(gl);
return;
}
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 3091d42992f0..20dde86a2d0a 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -424,7 +424,8 @@ void jbd2_journal_commit_transaction(journal_t *journal)
J_ASSERT(journal->j_committing_transaction == NULL);
commit_transaction = journal->j_running_transaction;
- J_ASSERT(commit_transaction->t_state == T_RUNNING);
+ J_ASSERT(commit_transaction->t_state == T_REQUESTED ||
+ commit_transaction->t_state == T_RUNNING);
trace_jbd2_start_commit(journal, commit_transaction);
jbd_debug(1, "JBD2: starting commit of transaction %d\n",
@@ -435,7 +436,14 @@ void jbd2_journal_commit_transaction(journal_t *journal)
trace_jbd2_commit_locking(journal, commit_transaction);
stats.run.rs_wait = commit_transaction->t_max_wait;
+ stats.run.rs_request_delay = 0;
stats.run.rs_locked = jiffies;
+ if (commit_transaction->t_requested) {
+ stats.run.rs_request_delay =
+ jbd2_time_diff(commit_transaction->t_requested,
+ stats.run.rs_locked);
+ stats.run.rs_locked = commit_transaction->t_requested;
+ }
stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
stats.run.rs_locked);
@@ -1116,7 +1124,10 @@ restart_loop:
*/
spin_lock(&journal->j_history_lock);
journal->j_stats.ts_tid++;
+ if (commit_transaction->t_requested)
+ journal->j_stats.ts_requested++;
journal->j_stats.run.rs_wait += stats.run.rs_wait;
+ journal->j_stats.run.rs_request_delay += stats.run.rs_request_delay;
journal->j_stats.run.rs_running += stats.run.rs_running;
journal->j_stats.run.rs_locked += stats.run.rs_locked;
journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index dbf41f9452db..f6ad0c086da6 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -513,6 +513,10 @@ int __jbd2_log_space_left(journal_t *journal)
*/
int __jbd2_log_start_commit(journal_t *journal, tid_t target)
{
+ /* Return if the txn has already requested to be committed */
+ if (journal->j_commit_request == target)
+ return 0;
+
/*
* The only transaction we can possibly wait upon is the
* currently running transaction (if it exists). Otherwise,
@@ -529,6 +533,8 @@ int __jbd2_log_start_commit(journal_t *journal, tid_t target)
jbd_debug(1, "JBD2: requesting commit %d/%d\n",
journal->j_commit_request,
journal->j_commit_sequence);
+ journal->j_running_transaction->t_state = T_REQUESTED;
+ journal->j_running_transaction->t_requested = jiffies;
wake_up(&journal->j_wait_commit);
return 1;
} else if (!tid_geq(journal->j_commit_request, target))
@@ -894,13 +900,18 @@ static int jbd2_seq_info_show(struct seq_file *seq, void *v)
if (v != SEQ_START_TOKEN)
return 0;
- seq_printf(seq, "%lu transaction, each up to %u blocks\n",
- s->stats->ts_tid,
- s->journal->j_max_transaction_buffers);
+ seq_printf(seq, "%lu transactions (%lu requested), "
+ "each up to %u blocks\n",
+ s->stats->ts_tid, s->stats->ts_requested,
+ s->journal->j_max_transaction_buffers);
if (s->stats->ts_tid == 0)
return 0;
seq_printf(seq, "average: \n %ums waiting for transaction\n",
jiffies_to_msecs(s->stats->run.rs_wait / s->stats->ts_tid));
+ seq_printf(seq, " %ums request delay\n",
+ (s->stats->ts_requested == 0) ? 0 :
+ jiffies_to_msecs(s->stats->run.rs_request_delay /
+ s->stats->ts_requested));
seq_printf(seq, " %ums running transaction\n",
jiffies_to_msecs(s->stats->run.rs_running / s->stats->ts_tid));
seq_printf(seq, " %ums transaction was being locked\n",
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index df9f29760efa..7fbc68fc326b 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -100,6 +100,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
journal->j_running_transaction = transaction;
transaction->t_max_wait = 0;
transaction->t_start = jiffies;
+ transaction->t_requested = 0;
return transaction;
}
@@ -224,7 +225,8 @@ repeat:
* If the current transaction is locked down for commit, wait for the
* lock to be released.
*/
- if (transaction->t_state == T_LOCKED) {
+ if ((transaction->t_state == T_LOCKED) ||
+ (transaction->t_state == T_REQUESTED)) {
DEFINE_WAIT(wait);
prepare_to_wait(&journal->j_wait_transaction_locked,
@@ -2179,7 +2181,8 @@ void __jbd2_journal_refile_buffer(struct journal_head *jh)
else
jlist = BJ_Reserved;
__jbd2_journal_file_buffer(jh, jh->b_transaction, jlist);
- J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING);
+ J_ASSERT_JH(jh, (jh->b_transaction->t_state == T_RUNNING ||
+ jh->b_transaction->t_state == T_REQUESTED));
if (was_dirty)
set_buffer_jbddirty(bh);
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index dd057bc6b65b..fc8dc20fdeb9 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -177,11 +177,31 @@ out_nofree:
return mnt;
}
+static int
+nfs_namespace_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
+{
+ if (NFS_FH(dentry->d_inode)->size != 0)
+ return nfs_getattr(mnt, dentry, stat);
+ generic_fillattr(dentry->d_inode, stat);
+ return 0;
+}
+
+static int
+nfs_namespace_setattr(struct dentry *dentry, struct iattr *attr)
+{
+ if (NFS_FH(dentry->d_inode)->size != 0)
+ return nfs_setattr(dentry, attr);
+ return -EACCES;
+}
+
const struct inode_operations nfs_mountpoint_inode_operations = {
.getattr = nfs_getattr,
+ .setattr = nfs_setattr,
};
const struct inode_operations nfs_referral_inode_operations = {
+ .getattr = nfs_namespace_getattr,
+ .setattr = nfs_namespace_setattr,
};
static void nfs_expire_automounts(struct work_struct *work)
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index acc347268124..2e9779b58b7a 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -236,11 +236,10 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp,
error = nfs4_discover_server_trunking(clp, &old);
if (error < 0)
goto error;
+ nfs_put_client(clp);
if (clp != old) {
clp->cl_preserve_clid = true;
- nfs_put_client(clp);
clp = old;
- atomic_inc(&clp->cl_count);
}
return clp;
@@ -306,7 +305,7 @@ int nfs40_walk_client_list(struct nfs_client *new,
.clientid = new->cl_clientid,
.confirm = new->cl_confirm,
};
- int status;
+ int status = -NFS4ERR_STALE_CLIENTID;
spin_lock(&nn->nfs_client_lock);
list_for_each_entry_safe(pos, n, &nn->nfs_client_list, cl_share_link) {
@@ -332,40 +331,33 @@ int nfs40_walk_client_list(struct nfs_client *new,
if (prev)
nfs_put_client(prev);
+ prev = pos;
status = nfs4_proc_setclientid_confirm(pos, &clid, cred);
- if (status == 0) {
+ switch (status) {
+ case -NFS4ERR_STALE_CLIENTID:
+ break;
+ case 0:
nfs4_swap_callback_idents(pos, new);
- nfs_put_client(pos);
+ prev = NULL;
*result = pos;
dprintk("NFS: <-- %s using nfs_client = %p ({%d})\n",
__func__, pos, atomic_read(&pos->cl_count));
- return 0;
- }
- if (status != -NFS4ERR_STALE_CLIENTID) {
- nfs_put_client(pos);
- dprintk("NFS: <-- %s status = %d, no result\n",
- __func__, status);
- return status;
+ default:
+ goto out;
}
spin_lock(&nn->nfs_client_lock);
- prev = pos;
}
+ spin_unlock(&nn->nfs_client_lock);
- /*
- * No matching nfs_client found. This should be impossible,
- * because the new nfs_client has already been added to
- * nfs_client_list by nfs_get_client().
- *
- * Don't BUG(), since the caller is holding a mutex.
- */
+ /* No match found. The server lost our clientid */
+out:
if (prev)
nfs_put_client(prev);
- spin_unlock(&nn->nfs_client_lock);
- pr_err("NFS: %s Error: no matching nfs_client found\n", __func__);
- return -NFS4ERR_STALE_CLIENTID;
+ dprintk("NFS: <-- %s status = %d\n", __func__, status);
+ return status;
}
#ifdef CONFIG_NFS_V4_1
@@ -432,7 +424,7 @@ int nfs41_walk_client_list(struct nfs_client *new,
{
struct nfs_net *nn = net_generic(new->cl_net, nfs_net_id);
struct nfs_client *pos, *n, *prev = NULL;
- int error;
+ int status = -NFS4ERR_STALE_CLIENTID;
spin_lock(&nn->nfs_client_lock);
list_for_each_entry_safe(pos, n, &nn->nfs_client_list, cl_share_link) {
@@ -448,14 +440,17 @@ int nfs41_walk_client_list(struct nfs_client *new,
nfs_put_client(prev);
prev = pos;
- error = nfs_wait_client_init_complete(pos);
- if (error < 0) {
+ nfs4_schedule_lease_recovery(pos);
+ status = nfs_wait_client_init_complete(pos);
+ if (status < 0) {
nfs_put_client(pos);
spin_lock(&nn->nfs_client_lock);
continue;
}
-
+ status = pos->cl_cons_state;
spin_lock(&nn->nfs_client_lock);
+ if (status < 0)
+ continue;
}
if (pos->rpc_ops != new->rpc_ops)
@@ -473,6 +468,7 @@ int nfs41_walk_client_list(struct nfs_client *new,
if (!nfs4_match_serverowners(pos, new))
continue;
+ atomic_inc(&pos->cl_count);
spin_unlock(&nn->nfs_client_lock);
dprintk("NFS: <-- %s using nfs_client = %p ({%d})\n",
__func__, pos, atomic_read(&pos->cl_count));
@@ -481,16 +477,10 @@ int nfs41_walk_client_list(struct nfs_client *new,
return 0;
}
- /*
- * No matching nfs_client found. This should be impossible,
- * because the new nfs_client has already been added to
- * nfs_client_list by nfs_get_client().
- *
- * Don't BUG(), since the caller is holding a mutex.
- */
+ /* No matching nfs_client found. */
spin_unlock(&nn->nfs_client_lock);
- pr_err("NFS: %s Error: no matching nfs_client found\n", __func__);
- return -NFS4ERR_STALE_CLIENTID;
+ dprintk("NFS: <-- %s status = %d\n", __func__, status);
+ return status;
}
#endif /* CONFIG_NFS_V4_1 */
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 9448c579d41a..e61f68d5ef21 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -136,16 +136,11 @@ int nfs40_discover_server_trunking(struct nfs_client *clp,
clp->cl_confirm = clid.confirm;
status = nfs40_walk_client_list(clp, result, cred);
- switch (status) {
- case -NFS4ERR_STALE_CLIENTID:
- set_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state);
- case 0:
+ if (status == 0) {
/* Sustain the lease, even if it's empty. If the clientid4
* goes stale it's of no use for trunking discovery. */
nfs4_schedule_state_renewal(*result);
- break;
}
-
out:
return status;
}
@@ -1863,6 +1858,7 @@ again:
case -ETIMEDOUT:
case -EAGAIN:
ssleep(1);
+ case -NFS4ERR_STALE_CLIENTID:
dprintk("NFS: %s after status %d, retrying\n",
__func__, status);
goto again;
@@ -2022,8 +2018,18 @@ static int nfs4_reset_session(struct nfs_client *clp)
nfs4_begin_drain_session(clp);
cred = nfs4_get_exchange_id_cred(clp);
status = nfs4_proc_destroy_session(clp->cl_session, cred);
- if (status && status != -NFS4ERR_BADSESSION &&
- status != -NFS4ERR_DEADSESSION) {
+ switch (status) {
+ case 0:
+ case -NFS4ERR_BADSESSION:
+ case -NFS4ERR_DEADSESSION:
+ break;
+ case -NFS4ERR_BACK_CHAN_BUSY:
+ case -NFS4ERR_DELAY:
+ set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
+ status = 0;
+ ssleep(1);
+ goto out;
+ default:
status = nfs4_recovery_handle_error(clp, status);
goto out;
}
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 2e7e8c878e5d..b056b1628722 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -2589,27 +2589,23 @@ nfs_xdev_mount(struct file_system_type *fs_type, int flags,
struct nfs_server *server;
struct dentry *mntroot = ERR_PTR(-ENOMEM);
struct nfs_subversion *nfs_mod = NFS_SB(data->sb)->nfs_client->cl_nfs_mod;
- int error;
- dprintk("--> nfs_xdev_mount_common()\n");
+ dprintk("--> nfs_xdev_mount()\n");
mount_info.mntfh = mount_info.cloned->fh;
/* create a new volume representation */
server = nfs_mod->rpc_ops->clone_server(NFS_SB(data->sb), data->fh, data->fattr, data->authflavor);
- if (IS_ERR(server)) {
- error = PTR_ERR(server);
- goto out_err;
- }
- mntroot = nfs_fs_mount_common(server, flags, dev_name, &mount_info, nfs_mod);
- dprintk("<-- nfs_xdev_mount_common() = 0\n");
-out:
- return mntroot;
+ if (IS_ERR(server))
+ mntroot = ERR_CAST(server);
+ else
+ mntroot = nfs_fs_mount_common(server, flags,
+ dev_name, &mount_info, nfs_mod);
-out_err:
- dprintk("<-- nfs_xdev_mount_common() = %d [error]\n", error);
- goto out;
+ dprintk("<-- nfs_xdev_mount() = %ld\n",
+ IS_ERR(mntroot) ? PTR_ERR(mntroot) : 0L);
+ return mntroot;
}
#if IS_ENABLED(CONFIG_NFS_V4)
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 8fe4e2892ab9..fc121350d8cb 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -67,7 +67,6 @@
#define NAMEI_RA_CHUNKS 2
#define NAMEI_RA_BLOCKS 4
#define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
-#define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b))
static unsigned char ocfs2_filetype_table[] = {
DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index 5ea2e77ff023..86d1038b5a12 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -96,6 +96,27 @@ static const char *get_reason_str(enum kmsg_dump_reason reason)
}
}
+bool pstore_cannot_block_path(enum kmsg_dump_reason reason)
+{
+ /*
+ * In case of NMI path, pstore shouldn't be blocked
+ * regardless of reason.
+ */
+ if (in_nmi())
+ return true;
+
+ switch (reason) {
+ /* In panic case, other cpus are stopped by smp_send_stop(). */
+ case KMSG_DUMP_PANIC:
+ /* Emergency restart shouldn't be blocked by spin lock. */
+ case KMSG_DUMP_EMERG:
+ return true;
+ default:
+ return false;
+ }
+}
+EXPORT_SYMBOL_GPL(pstore_cannot_block_path);
+
/*
* callback from kmsg_dump. (s2,l2) has the most recently
* written bytes, older bytes are in (s1,l1). Save as much
@@ -114,10 +135,12 @@ static void pstore_dump(struct kmsg_dumper *dumper,
why = get_reason_str(reason);
- if (in_nmi()) {
- is_locked = spin_trylock(&psinfo->buf_lock);
- if (!is_locked)
- pr_err("pstore dump routine blocked in NMI, may corrupt error record\n");
+ if (pstore_cannot_block_path(reason)) {
+ is_locked = spin_trylock_irqsave(&psinfo->buf_lock, flags);
+ if (!is_locked) {
+ pr_err("pstore dump routine blocked in %s path, may corrupt error record\n"
+ , in_nmi() ? "NMI" : why);
+ }
} else
spin_lock_irqsave(&psinfo->buf_lock, flags);
oopscount++;
@@ -143,9 +166,9 @@ static void pstore_dump(struct kmsg_dumper *dumper,
total += hsize + len;
part++;
}
- if (in_nmi()) {
+ if (pstore_cannot_block_path(reason)) {
if (is_locked)
- spin_unlock(&psinfo->buf_lock);
+ spin_unlock_irqrestore(&psinfo->buf_lock, flags);
} else
spin_unlock_irqrestore(&psinfo->buf_lock, flags);
}
diff --git a/fs/udf/super.c b/fs/udf/super.c
index e9be396a558d..186adbf94b20 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -1866,6 +1866,8 @@ static void udf_open_lvid(struct super_block *sb)
mark_buffer_dirty(bh);
sbi->s_lvid_dirty = 0;
mutex_unlock(&sbi->s_alloc_mutex);
+ /* Make opening of filesystem visible on the media immediately */
+ sync_dirty_buffer(bh);
}
static void udf_close_lvid(struct super_block *sb)
@@ -1906,6 +1908,8 @@ static void udf_close_lvid(struct super_block *sb)
mark_buffer_dirty(bh);
sbi->s_lvid_dirty = 0;
mutex_unlock(&sbi->s_alloc_mutex);
+ /* Make closing of filesystem visible on the media immediately */
+ sync_dirty_buffer(bh);
}
u64 lvid_get_unique_id(struct super_block *sb)
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 4111a40ebe1a..5f707e537171 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -86,11 +86,11 @@ xfs_destroy_ioend(
}
if (ioend->io_iocb) {
+ inode_dio_done(ioend->io_inode);
if (ioend->io_isasync) {
aio_complete(ioend->io_iocb, ioend->io_error ?
ioend->io_error : ioend->io_result, 0);
}
- inode_dio_done(ioend->io_inode);
}
mempool_free(ioend, xfs_ioend_pool);
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 0e92d12765d2..cdb2d3348583 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -4680,9 +4680,6 @@ __xfs_bmapi_allocate(
return error;
}
- if (bma->flags & XFS_BMAPI_STACK_SWITCH)
- bma->stack_switch = 1;
-
error = xfs_bmap_alloc(bma);
if (error)
return error;
@@ -4956,6 +4953,9 @@ xfs_bmapi_write(
bma.flist = flist;
bma.firstblock = firstblock;
+ if (flags & XFS_BMAPI_STACK_SWITCH)
+ bma.stack_switch = 1;
+
while (bno < end && n < *nmap) {
inhole = eof || bma.got.br_startoff > bno;
wasdelay = !inhole && isnullstartblock(bma.got.br_startblock);
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 56d1614760cf..fbbb9eb92e32 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -487,6 +487,7 @@ _xfs_buf_find(
struct rb_node *parent;
xfs_buf_t *bp;
xfs_daddr_t blkno = map[0].bm_bn;
+ xfs_daddr_t eofs;
int numblks = 0;
int i;
@@ -498,6 +499,23 @@ _xfs_buf_find(
ASSERT(!(numbytes < (1 << btp->bt_sshift)));
ASSERT(!(BBTOB(blkno) & (xfs_off_t)btp->bt_smask));
+ /*
+ * Corrupted block numbers can get through to here, unfortunately, so we
+ * must reject any start that is negative or at/after the end of the fs.
+ */
+ eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks);
+ if (blkno < 0 || blkno >= eofs) {
+ /*
+ * XXX (dgc): we should really be returning EFSCORRUPTED here,
+ * but none of the higher level infrastructure supports
+ * returning a specific error on buffer lookup failures.
+ */
+ xfs_alert(btp->bt_mount,
+ "%s: Block out of range: block 0x%llx, EOFS 0x%llx ",
+ __func__, blkno, eofs);
+ return NULL;
+ }
+
/* get tree root */
pag = xfs_perag_get(btp->bt_mount,
xfs_daddr_to_agno(btp->bt_mount, blkno));
@@ -1487,6 +1505,8 @@ restart:
while (!list_empty(&btp->bt_lru)) {
bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru);
if (atomic_read(&bp->b_hold) > 1) {
+ trace_xfs_buf_wait_buftarg(bp, _RET_IP_);
+ list_move_tail(&bp->b_lru, &btp->bt_lru);
spin_unlock(&btp->bt_lru_lock);
delay(100);
goto restart;
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 77b09750e92c..3f9949fee391 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -652,7 +652,10 @@ xfs_buf_item_unlock(
/*
* If the buf item isn't tracking any data, free it, otherwise drop the
- * reference we hold to it.
+ * reference we hold to it. If we are aborting the transaction, this may
+ * be the only reference to the buf item, so we free it anyway
+ * regardless of whether it is dirty or not. A dirty abort implies a
+ * shutdown, anyway.
*/
clean = 1;
for (i = 0; i < bip->bli_format_count; i++) {
@@ -664,7 +667,12 @@ xfs_buf_item_unlock(
}
if (clean)
xfs_buf_item_relse(bp);
- else
+ else if (aborted) {
+ if (atomic_dec_and_test(&bip->bli_refcount)) {
+ ASSERT(XFS_FORCED_SHUTDOWN(lip->li_mountp));
+ xfs_buf_item_relse(bp);
+ }
+ } else
atomic_dec(&bip->bli_refcount);
if (!hold)
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index d0e9c74d3d96..a8bd26b82ecb 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -246,10 +246,10 @@ xfs_swap_extents(
goto out_unlock;
}
- error = -filemap_write_and_wait(VFS_I(ip)->i_mapping);
+ error = -filemap_write_and_wait(VFS_I(tip)->i_mapping);
if (error)
goto out_unlock;
- truncate_pagecache_range(VFS_I(ip), 0, -1);
+ truncate_pagecache_range(VFS_I(tip), 0, -1);
/* Verify O_DIRECT for ftmp */
if (VN_CACHED(VFS_I(tip)) != 0) {
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index add06b4e9a63..364818eef40e 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -351,6 +351,15 @@ xfs_iomap_prealloc_size(
}
if (shift)
alloc_blocks >>= shift;
+
+ /*
+ * If we still want more space than is available, squash the
+ * prealloc hard; this happens with a large file on a small
+ * filesystem whose lowspace thresholds are below MAXEXTLEN.
+ * Stop once it hits zero, or freesp == 0 would loop forever.
+ */
+ while (alloc_blocks && alloc_blocks >= freesp)
+ alloc_blocks >>= 4;
}
if (alloc_blocks < mp->m_writeio_blocks)
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index da508463ff10..7d6df7c00c36 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -658,7 +658,7 @@ xfs_sb_quiet_read_verify(
return;
}
/* quietly fail */
- xfs_buf_ioerror(bp, EFSCORRUPTED);
+ xfs_buf_ioerror(bp, EWRONGFS);
}
static void
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 2e137d4a85ae..16a812977eab 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -341,6 +341,7 @@ DEFINE_BUF_EVENT(xfs_buf_item_relse);
DEFINE_BUF_EVENT(xfs_buf_item_iodone);
DEFINE_BUF_EVENT(xfs_buf_item_iodone_async);
DEFINE_BUF_EVENT(xfs_buf_error_relse);
+DEFINE_BUF_EVENT(xfs_buf_wait_buftarg);
DEFINE_BUF_EVENT(xfs_trans_read_buf_io);
DEFINE_BUF_EVENT(xfs_trans_read_buf_shut);