summaryrefslogtreecommitdiff
path: root/fs/ext4
diff options
context:
space:
mode:
authorSage Weil <sage@newdream.net>2010-02-18 10:40:25 -0800
committerSage Weil <sage@newdream.net>2010-02-18 10:40:25 -0800
commiteb9bbff08c35a99f82c5b414b833cec4321c4167 (patch)
tree5063cdf9059e1d8aad9b42cb1229944eceed5337 /fs/ext4
parent2c27c9a57c93a0757b9b4b0e7dc1abeaf1db1ce2 (diff)
parent716c28c0bc8bcbdd26e819f38dfc8fdfaafc0289 (diff)
Merge remote branch 'vfs/write_inode' into for-next
Conflicts: Documentation/ioctl/ioctl-number.txt
Diffstat (limited to 'fs/ext4')
-rw-r--r--fs/ext4/Kconfig19
-rw-r--r--fs/ext4/acl.c74
-rw-r--r--fs/ext4/balloc.c46
-rw-r--r--fs/ext4/block_validity.c4
-rw-r--r--fs/ext4/ext4.h87
-rw-r--r--fs/ext4/ext4_extents.h10
-rw-r--r--fs/ext4/ext4_jbd2.c82
-rw-r--r--fs/ext4/ext4_jbd2.h50
-rw-r--r--fs/ext4/extents.c579
-rw-r--r--fs/ext4/fsync.c69
-rw-r--r--fs/ext4/inode.c1012
-rw-r--r--fs/ext4/ioctl.c29
-rw-r--r--fs/ext4/mballoc.c416
-rw-r--r--fs/ext4/mballoc.h36
-rw-r--r--fs/ext4/migrate.c29
-rw-r--r--fs/ext4/move_extent.c302
-rw-r--r--fs/ext4/namei.c49
-rw-r--r--fs/ext4/resize.c2
-rw-r--r--fs/ext4/super.c303
-rw-r--r--fs/ext4/xattr.c48
-rw-r--r--fs/ext4/xattr_security.c20
-rw-r--r--fs/ext4/xattr_trusted.c20
-rw-r--r--fs/ext4/xattr_user.c25
23 files changed, 2012 insertions, 1299 deletions
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index d5c0ea2e8f2d..9ed1bb1f319f 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -26,19 +26,16 @@ config EXT4_FS
If unsure, say N.
-config EXT4DEV_COMPAT
- bool "Enable ext4dev compatibility"
+config EXT4_USE_FOR_EXT23
+ bool "Use ext4 for ext2/ext3 file systems"
depends on EXT4_FS
+ depends on EXT3_FS=n || EXT2_FS=n
+ default y
help
- Starting with 2.6.28, the name of the ext4 filesystem was
- renamed from ext4dev to ext4. Unfortunately there are some
- legacy userspace programs (such as klibc's fstype) have
- "ext4dev" hardcoded.
-
- To enable backwards compatibility so that systems that are
- still expecting to mount ext4 filesystems using ext4dev,
- choose Y here. This feature will go away by 2.6.31, so
- please arrange to get your userspace programs fixed!
+ Allow the ext4 file system driver code to be used for ext2 or
+ ext3 file system mounts. This allows users to reduce their
+ compiled kernel size by using one file system driver for
+ ext2, ext3, and ext4 file systems.
config EXT4_FS_XATTR
bool "Ext4 extended attributes"
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index 0df88b2a69b0..8a2a29d35a6f 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -364,12 +364,12 @@ out:
* Extended attribute handlers
*/
static size_t
-ext4_xattr_list_acl_access(struct inode *inode, char *list, size_t list_len,
- const char *name, size_t name_len)
+ext4_xattr_list_acl_access(struct dentry *dentry, char *list, size_t list_len,
+ const char *name, size_t name_len, int type)
{
const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS);
- if (!test_opt(inode->i_sb, POSIX_ACL))
+ if (!test_opt(dentry->d_sb, POSIX_ACL))
return 0;
if (list && size <= list_len)
memcpy(list, POSIX_ACL_XATTR_ACCESS, size);
@@ -377,12 +377,12 @@ ext4_xattr_list_acl_access(struct inode *inode, char *list, size_t list_len,
}
static size_t
-ext4_xattr_list_acl_default(struct inode *inode, char *list, size_t list_len,
- const char *name, size_t name_len)
+ext4_xattr_list_acl_default(struct dentry *dentry, char *list, size_t list_len,
+ const char *name, size_t name_len, int type)
{
const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT);
- if (!test_opt(inode->i_sb, POSIX_ACL))
+ if (!test_opt(dentry->d_sb, POSIX_ACL))
return 0;
if (list && size <= list_len)
memcpy(list, POSIX_ACL_XATTR_DEFAULT, size);
@@ -390,15 +390,18 @@ ext4_xattr_list_acl_default(struct inode *inode, char *list, size_t list_len,
}
static int
-ext4_xattr_get_acl(struct inode *inode, int type, void *buffer, size_t size)
+ext4_xattr_get_acl(struct dentry *dentry, const char *name, void *buffer,
+ size_t size, int type)
{
struct posix_acl *acl;
int error;
- if (!test_opt(inode->i_sb, POSIX_ACL))
+ if (strcmp(name, "") != 0)
+ return -EINVAL;
+ if (!test_opt(dentry->d_sb, POSIX_ACL))
return -EOPNOTSUPP;
- acl = ext4_get_acl(inode, type);
+ acl = ext4_get_acl(dentry->d_inode, type);
if (IS_ERR(acl))
return PTR_ERR(acl);
if (acl == NULL)
@@ -410,31 +413,16 @@ ext4_xattr_get_acl(struct inode *inode, int type, void *buffer, size_t size)
}
static int
-ext4_xattr_get_acl_access(struct inode *inode, const char *name,
- void *buffer, size_t size)
-{
- if (strcmp(name, "") != 0)
- return -EINVAL;
- return ext4_xattr_get_acl(inode, ACL_TYPE_ACCESS, buffer, size);
-}
-
-static int
-ext4_xattr_get_acl_default(struct inode *inode, const char *name,
- void *buffer, size_t size)
-{
- if (strcmp(name, "") != 0)
- return -EINVAL;
- return ext4_xattr_get_acl(inode, ACL_TYPE_DEFAULT, buffer, size);
-}
-
-static int
-ext4_xattr_set_acl(struct inode *inode, int type, const void *value,
- size_t size)
+ext4_xattr_set_acl(struct dentry *dentry, const char *name, const void *value,
+ size_t size, int flags, int type)
{
+ struct inode *inode = dentry->d_inode;
handle_t *handle;
struct posix_acl *acl;
int error, retries = 0;
+ if (strcmp(name, "") != 0)
+ return -EINVAL;
if (!test_opt(inode->i_sb, POSIX_ACL))
return -EOPNOTSUPP;
if (!is_owner_or_cap(inode))
@@ -466,34 +454,18 @@ release_and_out:
return error;
}
-static int
-ext4_xattr_set_acl_access(struct inode *inode, const char *name,
- const void *value, size_t size, int flags)
-{
- if (strcmp(name, "") != 0)
- return -EINVAL;
- return ext4_xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size);
-}
-
-static int
-ext4_xattr_set_acl_default(struct inode *inode, const char *name,
- const void *value, size_t size, int flags)
-{
- if (strcmp(name, "") != 0)
- return -EINVAL;
- return ext4_xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size);
-}
-
struct xattr_handler ext4_xattr_acl_access_handler = {
.prefix = POSIX_ACL_XATTR_ACCESS,
+ .flags = ACL_TYPE_ACCESS,
.list = ext4_xattr_list_acl_access,
- .get = ext4_xattr_get_acl_access,
- .set = ext4_xattr_set_acl_access,
+ .get = ext4_xattr_get_acl,
+ .set = ext4_xattr_set_acl,
};
struct xattr_handler ext4_xattr_acl_default_handler = {
.prefix = POSIX_ACL_XATTR_DEFAULT,
+ .flags = ACL_TYPE_DEFAULT,
.list = ext4_xattr_list_acl_default,
- .get = ext4_xattr_get_acl_default,
- .set = ext4_xattr_set_acl_default,
+ .get = ext4_xattr_get_acl,
+ .set = ext4_xattr_set_acl,
};
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 1d0418980f8d..22bc7435d913 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -499,44 +499,6 @@ error_return:
}
/**
- * ext4_free_blocks() -- Free given blocks and update quota
- * @handle: handle for this transaction
- * @inode: inode
- * @block: start physical block to free
- * @count: number of blocks to count
- * @metadata: Are these metadata blocks
- */
-void ext4_free_blocks(handle_t *handle, struct inode *inode,
- ext4_fsblk_t block, unsigned long count,
- int metadata)
-{
- struct super_block *sb;
- unsigned long dquot_freed_blocks;
-
- /* this isn't the right place to decide whether block is metadata
- * inode.c/extents.c knows better, but for safety ... */
- if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
- metadata = 1;
-
- /* We need to make sure we don't reuse
- * block released untill the transaction commit.
- * writeback mode have weak data consistency so
- * don't force data as metadata when freeing block
- * for writeback mode.
- */
- if (metadata == 0 && !ext4_should_writeback_data(inode))
- metadata = 1;
-
- sb = inode->i_sb;
-
- ext4_mb_free_blocks(handle, inode, block, count,
- metadata, &dquot_freed_blocks);
- if (dquot_freed_blocks)
- vfs_dq_free_block(inode, dquot_freed_blocks);
- return;
-}
-
-/**
* ext4_has_free_blocks()
* @sbi: in-core super block structure.
* @nblocks: number of needed blocks
@@ -761,7 +723,13 @@ static unsigned long ext4_bg_num_gdb_meta(struct super_block *sb,
static unsigned long ext4_bg_num_gdb_nometa(struct super_block *sb,
ext4_group_t group)
{
- return ext4_bg_has_super(sb, group) ? EXT4_SB(sb)->s_gdb_count : 0;
+ if (!ext4_bg_has_super(sb, group))
+ return 0;
+
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb,EXT4_FEATURE_INCOMPAT_META_BG))
+ return le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg);
+ else
+ return EXT4_SB(sb)->s_gdb_count;
}
/**
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 50784ef07563..a60ab9aad57d 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -16,7 +16,6 @@
#include <linux/module.h>
#include <linux/swap.h>
#include <linux/pagemap.h>
-#include <linux/version.h>
#include <linux/blkdev.h>
#include <linux/mutex.h>
#include "ext4.h"
@@ -160,7 +159,7 @@ int ext4_setup_system_zone(struct super_block *sb)
if (ext4_bg_has_super(sb, i) &&
((i < 5) || ((i % flex_size) == 0)))
add_system_zone(sbi, ext4_group_first_block_no(sb, i),
- sbi->s_gdb_count + 1);
+ ext4_bg_num_gdb(sb, i) + 1);
gdp = ext4_get_group_desc(sb, i, NULL);
ret = add_system_zone(sbi, ext4_block_bitmap(sb, gdp), 1);
if (ret)
@@ -228,6 +227,7 @@ int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk,
struct rb_node *n = sbi->system_blks.rb_node;
if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) ||
+ (start_blk + count < start_blk) ||
(start_blk + count > ext4_blocks_count(sbi->s_es)))
return 0;
while (n) {
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index e227eea23f05..44b1b443a4dc 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -65,6 +65,12 @@ typedef __u32 ext4_lblk_t;
/* data type for block group number */
typedef unsigned int ext4_group_t;
+/*
+ * Flags used in mballoc's allocation_context flags field.
+ *
+ * Also used to show what's going on for debugging purposes when the
+ * flag field is exported via the traceport interface
+ */
/* prefer goal again. length */
#define EXT4_MB_HINT_MERGE 0x0001
@@ -127,6 +133,16 @@ struct mpage_da_data {
int pages_written;
int retval;
};
+#define DIO_AIO_UNWRITTEN 0x1
+typedef struct ext4_io_end {
+ struct list_head list; /* per-file finished AIO list */
+ struct inode *inode; /* file being written to */
+ unsigned int flag; /* unwritten or not */
+ int error; /* I/O error code */
+ ext4_lblk_t offset; /* offset in the file */
+ size_t size; /* size of the extent */
+ struct work_struct work; /* data work queue */
+} ext4_io_end_t;
/*
* Special inodes numbers
@@ -306,6 +322,7 @@ static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags)
#define EXT4_STATE_NO_EXPAND 0x00000008 /* No space for expansion */
#define EXT4_STATE_DA_ALLOC_CLOSE 0x00000010 /* Alloc DA blks on close */
#define EXT4_STATE_EXT_MIGRATE 0x00000020 /* Inode is migrating */
+#define EXT4_STATE_DIO_UNWRITTEN 0x00000040 /* need convert on dio done*/
/* Used to pass group descriptor data when online resize is done */
struct ext4_new_group_input {
@@ -347,7 +364,22 @@ struct ext4_new_group_data {
/* Call ext4_da_update_reserve_space() after successfully
allocating the blocks */
#define EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE 0x0008
+ /* caller is from the direct IO path, request to creation of an
+ unitialized extents if not allocated, split the uninitialized
+ extent if blocks has been preallocated already*/
+#define EXT4_GET_BLOCKS_DIO 0x0010
+#define EXT4_GET_BLOCKS_CONVERT 0x0020
+#define EXT4_GET_BLOCKS_DIO_CREATE_EXT (EXT4_GET_BLOCKS_DIO|\
+ EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
+ /* Convert extent to initialized after direct IO complete */
+#define EXT4_GET_BLOCKS_DIO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\
+ EXT4_GET_BLOCKS_DIO_CREATE_EXT)
+/*
+ * Flags used by ext4_free_blocks
+ */
+#define EXT4_FREE_BLOCKS_METADATA 0x0001
+#define EXT4_FREE_BLOCKS_FORGET 0x0002
/*
* ioctl commands
@@ -500,8 +532,8 @@ struct move_extent {
static inline __le32 ext4_encode_extra_time(struct timespec *time)
{
return cpu_to_le32((sizeof(time->tv_sec) > 4 ?
- time->tv_sec >> 32 : 0) |
- ((time->tv_nsec << 2) & EXT4_NSEC_MASK));
+ (time->tv_sec >> 32) & EXT4_EPOCH_MASK : 0) |
+ ((time->tv_nsec << EXT4_EPOCH_BITS) & EXT4_NSEC_MASK));
}
static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra)
@@ -509,7 +541,7 @@ static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra)
if (sizeof(time->tv_sec) > 4)
time->tv_sec |= (__u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK)
<< 32;
- time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> 2;
+ time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS;
}
#define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode) \
@@ -667,11 +699,29 @@ struct ext4_inode_info {
unsigned int i_reserved_meta_blocks;
unsigned int i_allocated_meta_blocks;
unsigned short i_delalloc_reserved_flag;
+ sector_t i_da_metadata_calc_last_lblock;
+ int i_da_metadata_calc_len;
/* on-disk additional length */
__u16 i_extra_isize;
spinlock_t i_block_reservation_lock;
+#ifdef CONFIG_QUOTA
+ /* quota space reservation, managed internally by quota code */
+ qsize_t i_reserved_quota;
+#endif
+
+ /* completed async DIOs that might need unwritten extents handling */
+ struct list_head i_aio_dio_complete_list;
+ /* current io_end structure for async DIO write*/
+ ext4_io_end_t *cur_aio_dio;
+
+ /*
+ * Transactions that contain inode's metadata needed to complete
+ * fsync and fdatasync, respectively.
+ */
+ tid_t i_sync_tid;
+ tid_t i_datasync_tid;
};
/*
@@ -713,11 +763,13 @@ struct ext4_inode_info {
#define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */
#define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */
#define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */
+#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */
#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */
#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */
#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */
#define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */
#define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */
+#define EXT4_MOUNT_DISCARD 0x40000000 /* Issue DISCARD requests */
#define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt
#define set_opt(o, opt) o |= EXT4_MOUNT_##opt
@@ -942,18 +994,11 @@ struct ext4_sb_info {
unsigned int s_mb_stats;
unsigned int s_mb_order2_reqs;
unsigned int s_mb_group_prealloc;
+ unsigned int s_max_writeback_mb_bump;
/* where last allocation was done - for stream allocation */
unsigned long s_mb_last_group;
unsigned long s_mb_last_start;
- /* history to debug policy */
- struct ext4_mb_history *s_mb_history;
- int s_mb_history_cur;
- int s_mb_history_max;
- int s_mb_history_num;
- spinlock_t s_mb_history_lock;
- int s_mb_history_filter;
-
/* stats for buddy allocator */
spinlock_t s_mb_pa_lock;
atomic_t s_bal_reqs; /* number of reqs with len > 1 */
@@ -980,6 +1025,9 @@ struct ext4_sb_info {
unsigned int s_log_groups_per_flex;
struct flex_groups *s_flex_groups;
+
+ /* workqueue for dio unwritten */
+ struct workqueue_struct *dio_unwritten_wq;
};
static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
@@ -1296,8 +1344,6 @@ extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
ext4_fsblk_t goal, unsigned long *count, int *errp);
extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
extern int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
-extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
- ext4_fsblk_t block, unsigned long count, int metadata);
extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
ext4_fsblk_t block, unsigned long count);
extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *);
@@ -1356,16 +1402,15 @@ extern int ext4_mb_reserve_blocks(struct super_block *, int);
extern void ext4_discard_preallocations(struct inode *);
extern int __init init_ext4_mballoc(void);
extern void exit_ext4_mballoc(void);
-extern void ext4_mb_free_blocks(handle_t *, struct inode *,
- ext4_fsblk_t, unsigned long, int, unsigned long *);
+extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
+ struct buffer_head *bh, ext4_fsblk_t block,
+ unsigned long count, int flags);
extern int ext4_mb_add_groupinfo(struct super_block *sb,
ext4_group_t i, struct ext4_group_desc *desc);
extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t);
extern void ext4_mb_put_buddy_cache_lock(struct super_block *,
ext4_group_t, int);
/* inode.c */
-int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
- struct buffer_head *bh, ext4_fsblk_t blocknr);
struct buffer_head *ext4_getblk(handle_t *, struct inode *,
ext4_lblk_t, int, int *);
struct buffer_head *ext4_bread(handle_t *, struct inode *,
@@ -1374,7 +1419,7 @@ int ext4_get_block(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create);
extern struct inode *ext4_iget(struct super_block *, unsigned long);
-extern int ext4_write_inode(struct inode *, int);
+extern int ext4_write_inode(struct inode *, struct writeback_control *);
extern int ext4_setattr(struct dentry *, struct iattr *);
extern int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat);
@@ -1396,8 +1441,8 @@ extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
extern int ext4_block_truncate_page(handle_t *handle,
struct address_space *mapping, loff_t from);
extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
-extern qsize_t ext4_get_reserved_space(struct inode *inode);
-
+extern qsize_t *ext4_get_reserved_space(struct inode *inode);
+extern int flush_aio_dio_completed_IO(struct inode *inode);
/* ioctl.c */
extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
@@ -1699,6 +1744,8 @@ extern void ext4_ext_init(struct super_block *);
extern void ext4_ext_release(struct super_block *);
extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
loff_t len);
+extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
+ loff_t len);
extern int ext4_get_blocks(handle_t *handle, struct inode *inode,
sector_t block, unsigned int max_blocks,
struct buffer_head *bh, int flags);
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 61652f1d15e6..bdb6ce7e2eb4 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -220,7 +220,13 @@ static inline int ext4_ext_get_actual_len(struct ext4_extent *ext)
(le16_to_cpu(ext->ee_len) - EXT_INIT_MAX_LEN));
}
-extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks);
+static inline void ext4_ext_mark_initialized(struct ext4_extent *ext)
+{
+ ext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ext));
+}
+
+extern int ext4_ext_calc_metadata_amount(struct inode *inode,
+ sector_t lblocks);
extern ext4_fsblk_t ext_pblock(struct ext4_extent *ex);
extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *);
extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t);
@@ -235,7 +241,7 @@ extern int ext4_ext_try_to_merge(struct inode *inode,
struct ext4_ext_path *path,
struct ext4_extent *);
extern unsigned int ext4_ext_check_overlap(struct inode *, struct ext4_extent *, struct ext4_ext_path *);
-extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *);
+extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *, int);
extern int ext4_ext_walk_space(struct inode *, ext4_lblk_t, ext4_lblk_t,
ext_prepare_callback, void *);
extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t,
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 6a9409920dee..b57e5c711b6d 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -4,6 +4,8 @@
#include "ext4_jbd2.h"
+#include <trace/events/ext4.h>
+
int __ext4_journal_get_undo_access(const char *where, handle_t *handle,
struct buffer_head *bh)
{
@@ -32,35 +34,69 @@ int __ext4_journal_get_write_access(const char *where, handle_t *handle,
return err;
}
-int __ext4_journal_forget(const char *where, handle_t *handle,
- struct buffer_head *bh)
+/*
+ * The ext4 forget function must perform a revoke if we are freeing data
+ * which has been journaled. Metadata (eg. indirect blocks) must be
+ * revoked in all cases.
+ *
+ * "bh" may be NULL: a metadata block may have been freed from memory
+ * but there may still be a record of it in the journal, and that record
+ * still needs to be revoked.
+ *
+ * If the handle isn't valid we're not journaling, but we still need to
+ * call into ext4_journal_revoke() to put the buffer head.
+ */
+int __ext4_forget(const char *where, handle_t *handle, int is_metadata,
+ struct inode *inode, struct buffer_head *bh,
+ ext4_fsblk_t blocknr)
{
- int err = 0;
+ int err;
- if (ext4_handle_valid(handle)) {
- err = jbd2_journal_forget(handle, bh);
- if (err)
- ext4_journal_abort_handle(where, __func__, bh,
- handle, err);
- }
- else
+ might_sleep();
+
+ trace_ext4_forget(inode, is_metadata, blocknr);
+ BUFFER_TRACE(bh, "enter");
+
+ jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
+ "data mode %x\n",
+ bh, is_metadata, inode->i_mode,
+ test_opt(inode->i_sb, DATA_FLAGS));
+
+ /* In the no journal case, we can just do a bforget and return */
+ if (!ext4_handle_valid(handle)) {
bforget(bh);
- return err;
-}
+ return 0;
+ }
-int __ext4_journal_revoke(const char *where, handle_t *handle,
- ext4_fsblk_t blocknr, struct buffer_head *bh)
-{
- int err = 0;
+ /* Never use the revoke function if we are doing full data
+ * journaling: there is no need to, and a V1 superblock won't
+ * support it. Otherwise, only skip the revoke on un-journaled
+ * data blocks. */
- if (ext4_handle_valid(handle)) {
- err = jbd2_journal_revoke(handle, blocknr, bh);
- if (err)
- ext4_journal_abort_handle(where, __func__, bh,
- handle, err);
+ if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
+ (!is_metadata && !ext4_should_journal_data(inode))) {
+ if (bh) {
+ BUFFER_TRACE(bh, "call jbd2_journal_forget");
+ err = jbd2_journal_forget(handle, bh);
+ if (err)
+ ext4_journal_abort_handle(where, __func__, bh,
+ handle, err);
+ return err;
+ }
+ return 0;
}
- else
- bforget(bh);
+
+ /*
+ * data!=journal && (is_metadata || should_journal_data(inode))
+ */
+ BUFFER_TRACE(bh, "call jbd2_journal_revoke");
+ err = jbd2_journal_revoke(handle, blocknr, bh);
+ if (err) {
+ ext4_journal_abort_handle(where, __func__, bh, handle, err);
+ ext4_abort(inode->i_sb, __func__,
+ "error %d when attempting revoke", err);
+ }
+ BUFFER_TRACE(bh, "exit");
return err;
}
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 139fb8cb87e4..05eca817d704 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -49,7 +49,7 @@
#define EXT4_DATA_TRANS_BLOCKS(sb) (EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + \
EXT4_XATTR_TRANS_BLOCKS - 2 + \
- 2*EXT4_QUOTA_TRANS_BLOCKS(sb))
+ EXT4_MAXQUOTAS_TRANS_BLOCKS(sb))
/*
* Define the number of metadata blocks we need to account to modify data.
@@ -57,7 +57,7 @@
* This include super block, inode block, quota blocks and xattr blocks
*/
#define EXT4_META_TRANS_BLOCKS(sb) (EXT4_XATTR_TRANS_BLOCKS + \
- 2*EXT4_QUOTA_TRANS_BLOCKS(sb))
+ EXT4_MAXQUOTAS_TRANS_BLOCKS(sb))
/* Delete operations potentially hit one directory's namespace plus an
* entire inode, plus arbitrary amounts of bitmap/indirection data. Be
@@ -92,6 +92,7 @@
* but inode, sb and group updates are done only once */
#define EXT4_QUOTA_INIT_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_INIT_ALLOC*\
(EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_INIT_REWRITE) : 0)
+
#define EXT4_QUOTA_DEL_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_DEL_ALLOC*\
(EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_DEL_REWRITE) : 0)
#else
@@ -99,6 +100,9 @@
#define EXT4_QUOTA_INIT_BLOCKS(sb) 0
#define EXT4_QUOTA_DEL_BLOCKS(sb) 0
#endif
+#define EXT4_MAXQUOTAS_TRANS_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_TRANS_BLOCKS(sb))
+#define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb))
+#define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb))
int
ext4_mark_iloc_dirty(handle_t *handle,
@@ -116,12 +120,8 @@ int ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode);
/*
- * Wrapper functions with which ext4 calls into JBD. The intent here is
- * to allow these to be turned into appropriate stubs so ext4 can control
- * ext2 filesystems, so ext2+ext4 systems only nee one fs. This work hasn't
- * been done yet.
+ * Wrapper functions with which ext4 calls into JBD.
*/
-
void ext4_journal_abort_handle(const char *caller, const char *err_fn,
struct buffer_head *bh, handle_t *handle, int err);
@@ -131,13 +131,9 @@ int __ext4_journal_get_undo_access(const char *where, handle_t *handle,
int __ext4_journal_get_write_access(const char *where, handle_t *handle,
struct buffer_head *bh);
-/* When called with an invalid handle, this will still do a put on the BH */
-int __ext4_journal_forget(const char *where, handle_t *handle,
- struct buffer_head *bh);
-
-/* When called with an invalid handle, this will still do a put on the BH */
-int __ext4_journal_revoke(const char *where, handle_t *handle,
- ext4_fsblk_t blocknr, struct buffer_head *bh);
+int __ext4_forget(const char *where, handle_t *handle, int is_metadata,
+ struct inode *inode, struct buffer_head *bh,
+ ext4_fsblk_t blocknr);
int __ext4_journal_get_create_access(const char *where,
handle_t *handle, struct buffer_head *bh);
@@ -149,23 +145,24 @@ int __ext4_handle_dirty_metadata(const char *where, handle_t *handle,
__ext4_journal_get_undo_access(__func__, (handle), (bh))
#define ext4_journal_get_write_access(handle, bh) \
__ext4_journal_get_write_access(__func__, (handle), (bh))
-#define ext4_journal_revoke(handle, blocknr, bh) \
- __ext4_journal_revoke(__func__, (handle), (blocknr), (bh))
+#define ext4_forget(handle, is_metadata, inode, bh, block_nr) \
+ __ext4_forget(__func__, (handle), (is_metadata), (inode), (bh),\
+ (block_nr))
#define ext4_journal_get_create_access(handle, bh) \
__ext4_journal_get_create_access(__func__, (handle), (bh))
-#define ext4_journal_forget(handle, bh) \
- __ext4_journal_forget(__func__, (handle), (bh))
#define ext4_handle_dirty_metadata(handle, inode, bh) \
__ext4_handle_dirty_metadata(__func__, (handle), (inode), (bh))
handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks);
int __ext4_journal_stop(const char *where, handle_t *handle);
-#define EXT4_NOJOURNAL_HANDLE ((handle_t *) 0x1)
+#define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096)
+/* Note: Do not use this for NULL handles. This is only to determine if
+ * a properly allocated handle is using a journal or not. */
static inline int ext4_handle_valid(handle_t *handle)
{
- if (handle == EXT4_NOJOURNAL_HANDLE)
+ if ((unsigned long)handle < EXT4_NOJOURNAL_MAX_REF_COUNT)
return 0;
return 1;
}
@@ -252,6 +249,19 @@ static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode)
return 0;
}
+static inline void ext4_update_inode_fsync_trans(handle_t *handle,
+ struct inode *inode,
+ int datasync)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+
+ if (ext4_handle_valid(handle)) {
+ ei->i_sync_tid = handle->h_transaction->t_tid;
+ if (datasync)
+ ei->i_datasync_tid = handle->h_transaction->t_tid;
+ }
+}
+
/* super.c */
int ext4_force_commit(struct super_block *sb);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 7a3832577923..7d7b74e94687 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -296,29 +296,44 @@ static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
* to allocate @blocks
* Worse case is one block per extent
*/
-int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks)
+int ext4_ext_calc_metadata_amount(struct inode *inode, sector_t lblock)
{
- int lcap, icap, rcap, leafs, idxs, num;
- int newextents = blocks;
-
- rcap = ext4_ext_space_root_idx(inode, 0);
- lcap = ext4_ext_space_block(inode, 0);
- icap = ext4_ext_space_block_idx(inode, 0);
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ int idxs, num = 0;
- /* number of new leaf blocks needed */
- num = leafs = (newextents + lcap - 1) / lcap;
+ idxs = ((inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
+ / sizeof(struct ext4_extent_idx));
/*
- * Worse case, we need separate index block(s)
- * to link all new leaf blocks
+ * If the new delayed allocation block is contiguous with the
+ * previous da block, it can share index blocks with the
+ * previous block, so we only need to allocate a new index
+ * block every idxs leaf blocks. At ldxs**2 blocks, we need
+ * an additional index block, and at ldxs**3 blocks, yet
+ * another index blocks.
*/
- idxs = (leafs + icap - 1) / icap;
- do {
- num += idxs;
- idxs = (idxs + icap - 1) / icap;
- } while (idxs > rcap);
+ if (ei->i_da_metadata_calc_len &&
+ ei->i_da_metadata_calc_last_lblock+1 == lblock) {
+ if ((ei->i_da_metadata_calc_len % idxs) == 0)
+ num++;
+ if ((ei->i_da_metadata_calc_len % (idxs*idxs)) == 0)
+ num++;
+ if ((ei->i_da_metadata_calc_len % (idxs*idxs*idxs)) == 0) {
+ num++;
+ ei->i_da_metadata_calc_len = 0;
+ } else
+ ei->i_da_metadata_calc_len++;
+ ei->i_da_metadata_calc_last_lblock++;
+ return num;
+ }
- return num;
+ /*
+ * In the worst case we need a new set of index blocks at
+ * every level of the inode's extent tree.
+ */
+ ei->i_da_metadata_calc_len = 1;
+ ei->i_da_metadata_calc_last_lblock = lblock;
+ return ext_depth(inode) + 1;
}
static int
@@ -723,7 +738,7 @@ err:
* insert new index [@logical;@ptr] into the block at @curp;
* check where to insert: before @curp or after @curp
*/
-static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
+int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
struct ext4_ext_path *curp,
int logical, ext4_fsblk_t ptr)
{
@@ -1007,7 +1022,8 @@ cleanup:
for (i = 0; i < depth; i++) {
if (!ablocks[i])
continue;
- ext4_free_blocks(handle, inode, ablocks[i], 1, 1);
+ ext4_free_blocks(handle, inode, 0, ablocks[i], 1,
+ EXT4_FREE_BLOCKS_METADATA);
}
}
kfree(ablocks);
@@ -1586,7 +1602,7 @@ out:
*/
int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
struct ext4_ext_path *path,
- struct ext4_extent *newext)
+ struct ext4_extent *newext, int flag)
{
struct ext4_extent_header *eh;
struct ext4_extent *ex, *fex;
@@ -1602,7 +1618,8 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
BUG_ON(path[depth].p_hdr == NULL);
/* try to insert block into found extent and return */
- if (ex && ext4_can_extents_be_merged(inode, ex, newext)) {
+ if (ex && (flag != EXT4_GET_BLOCKS_DIO_CREATE_EXT)
+ && ext4_can_extents_be_merged(inode, ex, newext)) {
ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n",
ext4_ext_is_uninitialized(newext),
ext4_ext_get_actual_len(newext),
@@ -1722,7 +1739,8 @@ has_space:
merge:
/* try to merge extents to the right */
- ext4_ext_try_to_merge(inode, path, nearex);
+ if (flag != EXT4_GET_BLOCKS_DIO_CREATE_EXT)
+ ext4_ext_try_to_merge(inode, path, nearex);
/* try to merge extents to the left */
@@ -1759,7 +1777,9 @@ int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
while (block < last && block != EXT_MAX_BLOCK) {
num = last - block;
/* find extent for this block */
+ down_read(&EXT4_I(inode)->i_data_sem);
path = ext4_ext_find_extent(inode, block, path);
+ up_read(&EXT4_I(inode)->i_data_sem);
if (IS_ERR(path)) {
err = PTR_ERR(path);
path = NULL;
@@ -1955,7 +1975,6 @@ errout:
static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
struct ext4_ext_path *path)
{
- struct buffer_head *bh;
int err;
ext4_fsblk_t leaf;
@@ -1971,9 +1990,8 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
if (err)
return err;
ext_debug("index is empty, remove it, free block %llu\n", leaf);
- bh = sb_find_get_block(inode->i_sb, leaf);
- ext4_forget(handle, 1, inode, bh, leaf);
- ext4_free_blocks(handle, inode, leaf, 1, 1);
+ ext4_free_blocks(handle, inode, 0, leaf, 1,
+ EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
return err;
}
@@ -2040,12 +2058,11 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
struct ext4_extent *ex,
ext4_lblk_t from, ext4_lblk_t to)
{
- struct buffer_head *bh;
unsigned short ee_len = ext4_ext_get_actual_len(ex);
- int i, metadata = 0;
+ int flags = EXT4_FREE_BLOCKS_FORGET;
if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
- metadata = 1;
+ flags |= EXT4_FREE_BLOCKS_METADATA;
#ifdef EXTENTS_STATS
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -2070,11 +2087,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
num = le32_to_cpu(ex->ee_block) + ee_len - from;
start = ext_pblock(ex) + ee_len - num;
ext_debug("free last %u blocks starting %llu\n", num, start);
- for (i = 0; i < num; i++) {
- bh = sb_find_get_block(inode->i_sb, start + i);
- ext4_forget(handle, 0, inode, bh, start + i);
- }
- ext4_free_blocks(handle, inode, start, num, metadata);
+ ext4_free_blocks(handle, inode, 0, start, num, flags);
} else if (from == le32_to_cpu(ex->ee_block)
&& to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
printk(KERN_INFO "strange request: removal %u-%u from %u:%u\n",
@@ -2165,7 +2178,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
correct_index = 1;
credits += (ext_depth(inode)) + 1;
}
- credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
+ credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
err = ext4_ext_truncate_extend_restart(handle, inode, credits);
if (err)
@@ -2378,6 +2391,7 @@ void ext4_ext_init(struct super_block *sb)
*/
if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
+#if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS)
printk(KERN_INFO "EXT4-fs: file extents enabled");
#ifdef AGGRESSIVE_TEST
printk(", aggressive tests");
@@ -2389,6 +2403,7 @@ void ext4_ext_init(struct super_block *sb)
printk(", stats");
#endif
printk("\n");
+#endif
#ifdef EXTENTS_STATS
spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock);
EXT4_SB(sb)->s_ext_min = 1 << 30;
@@ -2490,7 +2505,6 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
}
#define EXT4_EXT_ZERO_LEN 7
-
/*
* This function is called by ext4_ext_get_blocks() if someone tries to write
* to an uninitialized extent. It may result in splitting the uninitialized
@@ -2583,7 +2597,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
ex3->ee_block = cpu_to_le32(iblock);
ext4_ext_store_pblock(ex3, newblock);
ex3->ee_len = cpu_to_le16(allocated);
- err = ext4_ext_insert_extent(handle, inode, path, ex3);
+ err = ext4_ext_insert_extent(handle, inode, path,
+ ex3, 0);
if (err == -ENOSPC) {
err = ext4_ext_zeroout(inode, &orig_ex);
if (err)
@@ -2639,7 +2654,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
ext4_ext_store_pblock(ex3, newblock + max_blocks);
ex3->ee_len = cpu_to_le16(allocated - max_blocks);
ext4_ext_mark_uninitialized(ex3);
- err = ext4_ext_insert_extent(handle, inode, path, ex3);
+ err = ext4_ext_insert_extent(handle, inode, path, ex3, 0);
if (err == -ENOSPC) {
err = ext4_ext_zeroout(inode, &orig_ex);
if (err)
@@ -2757,7 +2772,192 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
err = ext4_ext_dirty(handle, inode, path + depth);
goto out;
insert:
- err = ext4_ext_insert_extent(handle, inode, path, &newex);
+ err = ext4_ext_insert_extent(handle, inode, path, &newex, 0);
+ if (err == -ENOSPC) {
+ err = ext4_ext_zeroout(inode, &orig_ex);
+ if (err)
+ goto fix_extent_len;
+ /* update the extent length and mark as initialized */
+ ex->ee_block = orig_ex.ee_block;
+ ex->ee_len = orig_ex.ee_len;
+ ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+ ext4_ext_dirty(handle, inode, path + depth);
+ /* zero out the first half */
+ return allocated;
+ } else if (err)
+ goto fix_extent_len;
+out:
+ ext4_ext_show_leaf(inode, path);
+ return err ? err : allocated;
+
+fix_extent_len:
+ ex->ee_block = orig_ex.ee_block;
+ ex->ee_len = orig_ex.ee_len;
+ ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+ ext4_ext_mark_uninitialized(ex);
+ ext4_ext_dirty(handle, inode, path + depth);
+ return err;
+}
+
+/*
+ * This function is called by ext4_ext_get_blocks() from
+ * ext4_get_blocks_dio_write() when DIO to write
+ * to an uninitialized extent.
+ *
+ * Writing to an uninitized extent may result in splitting the uninitialized
+ * extent into multiple /intialized unintialized extents (up to three)
+ * There are three possibilities:
+ * a> There is no split required: Entire extent should be uninitialized
+ * b> Splits in two extents: Write is happening at either end of the extent
+ * c> Splits in three extents: Somone is writing in middle of the extent
+ *
+ * One of more index blocks maybe needed if the extent tree grow after
+ * the unintialized extent split. To prevent ENOSPC occur at the IO
+ * complete, we need to split the uninitialized extent before DIO submit
+ * the IO. The uninitilized extent called at this time will be split
+ * into three uninitialized extent(at most). After IO complete, the part
+ * being filled will be convert to initialized by the end_io callback function
+ * via ext4_convert_unwritten_extents().
+ *
+ * Returns the size of uninitialized extent to be written on success.
+ */
+static int ext4_split_unwritten_extents(handle_t *handle,
+ struct inode *inode,
+ struct ext4_ext_path *path,
+ ext4_lblk_t iblock,
+ unsigned int max_blocks,
+ int flags)
+{
+ struct ext4_extent *ex, newex, orig_ex;
+ struct ext4_extent *ex1 = NULL;
+ struct ext4_extent *ex2 = NULL;
+ struct ext4_extent *ex3 = NULL;
+ struct ext4_extent_header *eh;
+ ext4_lblk_t ee_block;
+ unsigned int allocated, ee_len, depth;
+ ext4_fsblk_t newblock;
+ int err = 0;
+
+ ext_debug("ext4_split_unwritten_extents: inode %lu,"
+ "iblock %llu, max_blocks %u\n", inode->i_ino,
+ (unsigned long long)iblock, max_blocks);
+ depth = ext_depth(inode);
+ eh = path[depth].p_hdr;
+ ex = path[depth].p_ext;
+ ee_block = le32_to_cpu(ex->ee_block);
+ ee_len = ext4_ext_get_actual_len(ex);
+ allocated = ee_len - (iblock - ee_block);
+ newblock = iblock - ee_block + ext_pblock(ex);
+ ex2 = ex;
+ orig_ex.ee_block = ex->ee_block;
+ orig_ex.ee_len = cpu_to_le16(ee_len);
+ ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));
+
+ /*
+ * If the uninitialized extent begins at the same logical
+ * block where the write begins, and the write completely
+ * covers the extent, then we don't need to split it.
+ */
+ if ((iblock == ee_block) && (allocated <= max_blocks))
+ return allocated;
+
+ err = ext4_ext_get_access(handle, inode, path + depth);
+ if (err)
+ goto out;
+ /* ex1: ee_block to iblock - 1 : uninitialized */
+ if (iblock > ee_block) {
+ ex1 = ex;
+ ex1->ee_len = cpu_to_le16(iblock - ee_block);
+ ext4_ext_mark_uninitialized(ex1);
+ ex2 = &newex;
+ }
+ /*
+ * for sanity, update the length of the ex2 extent before
+ * we insert ex3, if ex1 is NULL. This is to avoid temporary
+ * overlap of blocks.
+ */
+ if (!ex1 && allocated > max_blocks)
+ ex2->ee_len = cpu_to_le16(max_blocks);
+ /* ex3: to ee_block + ee_len : uninitialised */
+ if (allocated > max_blocks) {
+ unsigned int newdepth;
+ ex3 = &newex;
+ ex3->ee_block = cpu_to_le32(iblock + max_blocks);
+ ext4_ext_store_pblock(ex3, newblock + max_blocks);
+ ex3->ee_len = cpu_to_le16(allocated - max_blocks);
+ ext4_ext_mark_uninitialized(ex3);
+ err = ext4_ext_insert_extent(handle, inode, path, ex3, flags);
+ if (err == -ENOSPC) {
+ err = ext4_ext_zeroout(inode, &orig_ex);
+ if (err)
+ goto fix_extent_len;
+ /* update the extent length and mark as initialized */
+ ex->ee_block = orig_ex.ee_block;
+ ex->ee_len = orig_ex.ee_len;
+ ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+ ext4_ext_dirty(handle, inode, path + depth);
+ /* zeroed the full extent */
+ /* blocks available from iblock */
+ return allocated;
+
+ } else if (err)
+ goto fix_extent_len;
+ /*
+ * The depth, and hence eh & ex might change
+ * as part of the insert above.
+ */
+ newdepth = ext_depth(inode);
+ /*
+ * update the extent length after successful insert of the
+ * split extent
+ */
+ orig_ex.ee_len = cpu_to_le16(ee_len -
+ ext4_ext_get_actual_len(ex3));
+ depth = newdepth;
+ ext4_ext_drop_refs(path);
+ path = ext4_ext_find_extent(inode, iblock, path);
+ if (IS_ERR(path)) {
+ err = PTR_ERR(path);
+ goto out;
+ }
+ eh = path[depth].p_hdr;
+ ex = path[depth].p_ext;
+ if (ex2 != &newex)
+ ex2 = ex;
+
+ err = ext4_ext_get_access(handle, inode, path + depth);
+ if (err)
+ goto out;
+
+ allocated = max_blocks;
+ }
+ /*
+ * If there was a change of depth as part of the
+ * insertion of ex3 above, we need to update the length
+ * of the ex1 extent again here
+ */
+ if (ex1 && ex1 != ex) {
+ ex1 = ex;
+ ex1->ee_len = cpu_to_le16(iblock - ee_block);
+ ext4_ext_mark_uninitialized(ex1);
+ ex2 = &newex;
+ }
+ /*
+ * ex2: iblock to iblock + maxblocks-1 : to be direct IO written,
+ * uninitialised still.
+ */
+ ex2->ee_block = cpu_to_le32(iblock);
+ ext4_ext_store_pblock(ex2, newblock);
+ ex2->ee_len = cpu_to_le16(allocated);
+ ext4_ext_mark_uninitialized(ex2);
+ if (ex2 != ex)
+ goto insert;
+ /* Mark modified extent as dirty */
+ err = ext4_ext_dirty(handle, inode, path + depth);
+ ext_debug("out here\n");
+ goto out;
+insert:
+ err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
if (err == -ENOSPC) {
err = ext4_ext_zeroout(inode, &orig_ex);
if (err)
@@ -2783,7 +2983,171 @@ fix_extent_len:
ext4_ext_dirty(handle, inode, path + depth);
return err;
}
+static int ext4_convert_unwritten_extents_dio(handle_t *handle,
+ struct inode *inode,
+ struct ext4_ext_path *path)
+{
+ struct ext4_extent *ex;
+ struct ext4_extent_header *eh;
+ int depth;
+ int err = 0;
+ int ret = 0;
+
+ depth = ext_depth(inode);
+ eh = path[depth].p_hdr;
+ ex = path[depth].p_ext;
+
+ err = ext4_ext_get_access(handle, inode, path + depth);
+ if (err)
+ goto out;
+ /* first mark the extent as initialized */
+ ext4_ext_mark_initialized(ex);
+
+ /*
+ * We have to see if it can be merged with the extent
+ * on the left.
+ */
+ if (ex > EXT_FIRST_EXTENT(eh)) {
+ /*
+ * To merge left, pass "ex - 1" to try_to_merge(),
+ * since it merges towards right _only_.
+ */
+ ret = ext4_ext_try_to_merge(inode, path, ex - 1);
+ if (ret) {
+ err = ext4_ext_correct_indexes(handle, inode, path);
+ if (err)
+ goto out;
+ depth = ext_depth(inode);
+ ex--;
+ }
+ }
+ /*
+ * Try to Merge towards right.
+ */
+ ret = ext4_ext_try_to_merge(inode, path, ex);
+ if (ret) {
+ err = ext4_ext_correct_indexes(handle, inode, path);
+ if (err)
+ goto out;
+ depth = ext_depth(inode);
+ }
+ /* Mark modified extent as dirty */
+ err = ext4_ext_dirty(handle, inode, path + depth);
+out:
+ ext4_ext_show_leaf(inode, path);
+ return err;
+}
+
+static void unmap_underlying_metadata_blocks(struct block_device *bdev,
+ sector_t block, int count)
+{
+ int i;
+ for (i = 0; i < count; i++)
+ unmap_underlying_metadata(bdev, block + i);
+}
+
+static int
+ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
+ ext4_lblk_t iblock, unsigned int max_blocks,
+ struct ext4_ext_path *path, int flags,
+ unsigned int allocated, struct buffer_head *bh_result,
+ ext4_fsblk_t newblock)
+{
+ int ret = 0;
+ int err = 0;
+ ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
+
+ ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical"
+ "block %llu, max_blocks %u, flags %d, allocated %u",
+ inode->i_ino, (unsigned long long)iblock, max_blocks,
+ flags, allocated);
+ ext4_ext_show_leaf(inode, path);
+
+ /* DIO get_block() before submit the IO, split the extent */
+ if (flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) {
+ ret = ext4_split_unwritten_extents(handle,
+ inode, path, iblock,
+ max_blocks, flags);
+ /*
+ * Flag the inode(non aio case) or end_io struct (aio case)
+ * that this IO needs to convertion to written when IO is
+ * completed
+ */
+ if (io)
+ io->flag = DIO_AIO_UNWRITTEN;
+ else
+ EXT4_I(inode)->i_state |= EXT4_STATE_DIO_UNWRITTEN;
+ goto out;
+ }
+ /* async DIO end_io complete, convert the filled extent to written */
+ if (flags == EXT4_GET_BLOCKS_DIO_CONVERT_EXT) {
+ ret = ext4_convert_unwritten_extents_dio(handle, inode,
+ path);
+ if (ret >= 0)
+ ext4_update_inode_fsync_trans(handle, inode, 1);
+ goto out2;
+ }
+ /* buffered IO case */
+ /*
+ * repeat fallocate creation request
+ * we already have an unwritten extent
+ */
+ if (flags & EXT4_GET_BLOCKS_UNINIT_EXT)
+ goto map_out;
+
+ /* buffered READ or buffered write_begin() lookup */
+ if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
+ /*
+ * We have blocks reserved already. We
+ * return allocated blocks so that delalloc
+ * won't do block reservation for us. But
+ * the buffer head will be unmapped so that
+ * a read from the block returns 0s.
+ */
+ set_buffer_unwritten(bh_result);
+ goto out1;
+ }
+ /* buffered write, writepage time, convert*/
+ ret = ext4_ext_convert_to_initialized(handle, inode,
+ path, iblock,
+ max_blocks);
+ if (ret >= 0)
+ ext4_update_inode_fsync_trans(handle, inode, 1);
+out:
+ if (ret <= 0) {
+ err = ret;
+ goto out2;
+ } else
+ allocated = ret;
+ set_buffer_new(bh_result);
+ /*
+ * if we allocated more blocks than requested
+ * we need to make sure we unmap the extra block
+ * allocated. The actual needed block will get
+ * unmapped later when we find the buffer_head marked
+ * new.
+ */
+ if (allocated > max_blocks) {
+ unmap_underlying_metadata_blocks(inode->i_sb->s_bdev,
+ newblock + max_blocks,
+ allocated - max_blocks);
+ }
+map_out:
+ set_buffer_mapped(bh_result);
+out1:
+ if (allocated > max_blocks)
+ allocated = max_blocks;
+ ext4_ext_show_leaf(inode, path);
+ bh_result->b_bdev = inode->i_sb->s_bdev;
+ bh_result->b_blocknr = newblock;
+out2:
+ if (path) {
+ ext4_ext_drop_refs(path);
+ kfree(path);
+ }
+ return err ? err : allocated;
+}
/*
* Block allocation/map/preallocation routine for extents based files
*
@@ -2814,6 +3178,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
int err = 0, depth, ret, cache_type;
unsigned int allocated = 0;
struct ext4_allocation_request ar;
+ ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
__clear_bit(BH_New, &bh_result->b_state);
ext_debug("blocks %u/%u requested for inode %lu\n",
@@ -2860,7 +3225,13 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
* this situation is possible, though, _during_ tree modification;
* this is why assert can't be put in ext4_ext_find_extent()
*/
- BUG_ON(path[depth].p_ext == NULL && depth != 0);
+ if (path[depth].p_ext == NULL && depth != 0) {
+ ext4_error(inode->i_sb, __func__, "bad extent address "
+ "inode: %lu, iblock: %d, depth: %d",
+ inode->i_ino, iblock, depth);
+ err = -EIO;
+ goto out2;
+ }
eh = path[depth].p_hdr;
ex = path[depth].p_ext;
@@ -2889,33 +3260,10 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
EXT4_EXT_CACHE_EXTENT);
goto out;
}
- if (flags & EXT4_GET_BLOCKS_UNINIT_EXT)
- goto out;
- if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
- if (allocated > max_blocks)
- allocated = max_blocks;
- /*
- * We have blocks reserved already. We
- * return allocated blocks so that delalloc
- * won't do block reservation for us. But
- * the buffer head will be unmapped so that
- * a read from the block returns 0s.
- */
- set_buffer_unwritten(bh_result);
- bh_result->b_bdev = inode->i_sb->s_bdev;
- bh_result->b_blocknr = newblock;
- goto out2;
- }
-
- ret = ext4_ext_convert_to_initialized(handle, inode,
- path, iblock,
- max_blocks);
- if (ret <= 0) {
- err = ret;
- goto out2;
- } else
- allocated = ret;
- goto outnew;
+ ret = ext4_ext_handle_uninitialized_extents(handle,
+ inode, iblock, max_blocks, path,
+ flags, allocated, bh_result, newblock);
+ return ret;
}
}
@@ -2986,29 +3334,52 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
/* try to insert new extent into found leaf and return */
ext4_ext_store_pblock(&newex, newblock);
newex.ee_len = cpu_to_le16(ar.len);
- if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) /* Mark uninitialized */
+ /* Mark uninitialized */
+ if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){
ext4_ext_mark_uninitialized(&newex);
- err = ext4_ext_insert_extent(handle, inode, path, &newex);
+ /*
+ * io_end structure was created for every async
+ * direct IO write to the middle of the file.
+ * To avoid unecessary convertion for every aio dio rewrite
+ * to the mid of file, here we flag the IO that is really
+ * need the convertion.
+ * For non asycn direct IO case, flag the inode state
+ * that we need to perform convertion when IO is done.
+ */
+ if (flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) {
+ if (io)
+ io->flag = DIO_AIO_UNWRITTEN;
+ else
+ EXT4_I(inode)->i_state |=
+ EXT4_STATE_DIO_UNWRITTEN;;
+ }
+ }
+ err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
if (err) {
/* free data blocks we just allocated */
/* not a good idea to call discard here directly,
* but otherwise we'd need to call it every free() */
ext4_discard_preallocations(inode);
- ext4_free_blocks(handle, inode, ext_pblock(&newex),
- ext4_ext_get_actual_len(&newex), 0);
+ ext4_free_blocks(handle, inode, 0, ext_pblock(&newex),
+ ext4_ext_get_actual_len(&newex), 0);
goto out2;
}
/* previous routine could use block we allocated */
newblock = ext_pblock(&newex);
allocated = ext4_ext_get_actual_len(&newex);
-outnew:
set_buffer_new(bh_result);
- /* Cache only when it is _not_ an uninitialized extent */
- if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0)
+ /*
+ * Cache the extent and update transaction to commit on fdatasync only
+ * when it is _not_ an uninitialized extent.
+ */
+ if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) {
ext4_ext_put_in_cache(inode, iblock, allocated, newblock,
EXT4_EXT_CACHE_EXTENT);
+ ext4_update_inode_fsync_trans(handle, inode, 1);
+ } else
+ ext4_update_inode_fsync_trans(handle, inode, 0);
out:
if (allocated > max_blocks)
allocated = max_blocks;
@@ -3201,6 +3572,64 @@ retry:
}
/*
+ * This function convert a range of blocks to written extents
+ * The caller of this function will pass the start offset and the size.
+ * all unwritten extents within this range will be converted to
+ * written extents.
+ *
+ * This function is called from the direct IO end io call back
+ * function, to convert the fallocated extents after IO is completed.
+ * Returns 0 on success.
+ */
+int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
+ loff_t len)
+{
+ handle_t *handle;
+ ext4_lblk_t block;
+ unsigned int max_blocks;
+ int ret = 0;
+ int ret2 = 0;
+ struct buffer_head map_bh;
+ unsigned int credits, blkbits = inode->i_blkbits;
+
+ block = offset >> blkbits;
+ /*
+ * We can't just convert len to max_blocks because
+ * If blocksize = 4096 offset = 3072 and len = 2048
+ */
+ max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
+ - block;
+ /*
+ * credits to insert 1 extent into extent tree
+ */
+ credits = ext4_chunk_trans_blocks(inode, max_blocks);
+ while (ret >= 0 && ret < max_blocks) {
+ block = block + ret;
+ max_blocks = max_blocks - ret;
+ handle = ext4_journal_start(inode, credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ break;
+ }
+ map_bh.b_state = 0;
+ ret = ext4_get_blocks(handle, inode, block,
+ max_blocks, &map_bh,
+ EXT4_GET_BLOCKS_DIO_CONVERT_EXT);
+ if (ret <= 0) {
+ WARN_ON(ret <= 0);
+ printk(KERN_ERR "%s: ext4_ext_get_blocks "
+ "returned error inode#%lu, block=%u, "
+ "max_blocks=%u", __func__,
+ inode->i_ino, block, max_blocks);
+ }
+ ext4_mark_inode_dirty(handle, inode);
+ ret2 = ext4_journal_stop(handle);
+ if (ret <= 0 || ret2 )
+ break;
+ }
+ return ret > 0 ? ret2 : ret;
+}
+/*
* Callback function called for each extent to gather FIEMAP information.
*/
static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
@@ -3338,10 +3767,8 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
* Walk the extent tree gathering extent information.
* ext4_ext_fiemap_cb will push extents back to user.
*/
- down_read(&EXT4_I(inode)->i_data_sem);
error = ext4_ext_walk_space(inode, start_blk, len_blks,
ext4_ext_fiemap_cb, fieinfo);
- up_read(&EXT4_I(inode)->i_data_sem);
}
return error;
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 07475740b512..98bd140aad01 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -44,27 +44,37 @@
*
* What we do is just kick off a commit and wait on it. This will snapshot the
* inode to disk.
+ *
+ * i_mutex lock is held when entering and exiting this function
*/
int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
{
struct inode *inode = dentry->d_inode;
+ struct ext4_inode_info *ei = EXT4_I(inode);
journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
- int err, ret = 0;
+ int ret;
+ tid_t commit_tid;
J_ASSERT(ext4_journal_current_handle() == NULL);
trace_ext4_sync_file(file, dentry, datasync);
+ if (inode->i_sb->s_flags & MS_RDONLY)
+ return 0;
+
+ ret = flush_aio_dio_completed_IO(inode);
+ if (ret < 0)
+ return ret;
+
+ if (!journal)
+ return simple_fsync(file, dentry, datasync);
+
/*
- * data=writeback:
+ * data=writeback,ordered:
* The caller's filemap_fdatawrite()/wait will sync the data.
- * sync_inode() will sync the metadata
- *
- * data=ordered:
- * The caller's filemap_fdatawrite() will write the data and
- * sync_inode() will write the inode if it is dirty. Then the caller's
- * filemap_fdatawait() will wait on the pages.
+ * Metadata is in the journal, we wait for proper transaction to
+ * commit here.
*
* data=journal:
* filemap_fdatawrite won't do anything (the buffers are clean).
@@ -74,32 +84,25 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
* (they were dirtied by commit). But that's OK - the blocks are
* safe in-journal, which is all fsync() needs to ensure.
*/
- if (ext4_should_journal_data(inode)) {
- ret = ext4_force_commit(inode->i_sb);
- goto out;
- }
+ if (ext4_should_journal_data(inode))
+ return ext4_force_commit(inode->i_sb);
- if (!journal)
- ret = sync_mapping_buffers(inode->i_mapping);
-
- if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
- goto out;
-
- /*
- * The VFS has written the file data. If the inode is unaltered
- * then we need not start a commit.
- */
- if (inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC)) {
- struct writeback_control wbc = {
- .sync_mode = WB_SYNC_ALL,
- .nr_to_write = 0, /* sys_fsync did this */
- };
- err = sync_inode(inode, &wbc);
- if (ret == 0)
- ret = err;
- }
-out:
- if (journal && (journal->j_flags & JBD2_BARRIER))
+ commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid;
+ if (jbd2_log_start_commit(journal, commit_tid)) {
+ /*
+ * When the journal is on a different device than the
+ * fs data disk, we need to issue the barrier in
+ * writeback mode. (In ordered mode, the jbd2 layer
+ * will take care of issuing the barrier. In
+ * data=journal, all of the data blocks are written to
+ * the journal device.)
+ */
+ if (ext4_should_writeback_data(inode) &&
+ (journal->j_fs_dev != journal->j_dev) &&
+ (journal->j_flags & JBD2_BARRIER))
+ blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
+ jbd2_log_wait_commit(journal, commit_tid);
+ } else if (journal->j_flags & JBD2_BARRIER)
blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
return ret;
}
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 064746fad581..ef503d683862 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -37,6 +37,7 @@
#include <linux/namei.h>
#include <linux/uio.h>
#include <linux/bio.h>
+#include <linux/workqueue.h>
#include "ext4_jbd2.h"
#include "xattr.h"
@@ -70,58 +71,6 @@ static int ext4_inode_is_fast_symlink(struct inode *inode)
}
/*
- * The ext4 forget function must perform a revoke if we are freeing data
- * which has been journaled. Metadata (eg. indirect blocks) must be
- * revoked in all cases.
- *
- * "bh" may be NULL: a metadata block may have been freed from memory
- * but there may still be a record of it in the journal, and that record
- * still needs to be revoked.
- *
- * If the handle isn't valid we're not journaling, but we still need to
- * call into ext4_journal_revoke() to put the buffer head.
- */
-int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
- struct buffer_head *bh, ext4_fsblk_t blocknr)
-{
- int err;
-
- might_sleep();
-
- BUFFER_TRACE(bh, "enter");
-
- jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
- "data mode %x\n",
- bh, is_metadata, inode->i_mode,
- test_opt(inode->i_sb, DATA_FLAGS));
-
- /* Never use the revoke function if we are doing full data
- * journaling: there is no need to, and a V1 superblock won't
- * support it. Otherwise, only skip the revoke on un-journaled
- * data blocks. */
-
- if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
- (!is_metadata && !ext4_should_journal_data(inode))) {
- if (bh) {
- BUFFER_TRACE(bh, "call jbd2_journal_forget");
- return ext4_journal_forget(handle, bh);
- }
- return 0;
- }
-
- /*
- * data!=journal && (is_metadata || should_journal_data(inode))
- */
- BUFFER_TRACE(bh, "call ext4_journal_revoke");
- err = ext4_journal_revoke(handle, blocknr, bh);
- if (err)
- ext4_abort(inode->i_sb, __func__,
- "error %d when attempting revoke", err);
- BUFFER_TRACE(bh, "exit");
- return err;
-}
-
-/*
* Work out how many blocks we need to proceed with the next chunk of a
* truncate transaction.
*/
@@ -192,7 +141,7 @@ static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
* so before we call here everything must be consistently dirtied against
* this transaction.
*/
- int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
+int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
int nblocks)
{
int ret;
@@ -208,6 +157,7 @@ static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
up_write(&EXT4_I(inode)->i_data_sem);
ret = ext4_journal_restart(handle, blocks_for_truncate(inode));
down_write(&EXT4_I(inode)->i_data_sem);
+ ext4_discard_preallocations(inode);
return ret;
}
@@ -719,7 +669,7 @@ allocated:
return ret;
failed_out:
for (i = 0; i < index; i++)
- ext4_free_blocks(handle, inode, new_blocks[i], 1, 0);
+ ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, 0);
return ret;
}
@@ -815,14 +765,20 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
return err;
failed:
/* Allocation failed, free what we already allocated */
+ ext4_free_blocks(handle, inode, 0, new_blocks[0], 1, 0);
for (i = 1; i <= n ; i++) {
- BUFFER_TRACE(branch[i].bh, "call jbd2_journal_forget");
- ext4_journal_forget(handle, branch[i].bh);
+ /*
+ * branch[i].bh is newly allocated, so there is no
+ * need to revoke the block, which is why we don't
+ * need to set EXT4_FREE_BLOCKS_METADATA.
+ */
+ ext4_free_blocks(handle, inode, 0, new_blocks[i], 1,
+ EXT4_FREE_BLOCKS_FORGET);
}
- for (i = 0; i < indirect_blks; i++)
- ext4_free_blocks(handle, inode, new_blocks[i], 1, 0);
+ for (i = n+1; i < indirect_blks; i++)
+ ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, 0);
- ext4_free_blocks(handle, inode, new_blocks[i], num, 0);
+ ext4_free_blocks(handle, inode, 0, new_blocks[i], num, 0);
return err;
}
@@ -901,12 +857,16 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode,
err_out:
for (i = 1; i <= num; i++) {
- BUFFER_TRACE(where[i].bh, "call jbd2_journal_forget");
- ext4_journal_forget(handle, where[i].bh);
- ext4_free_blocks(handle, inode,
- le32_to_cpu(where[i-1].key), 1, 0);
+ /*
+ * branch[i].bh is newly allocated, so there is no
+ * need to revoke the block, which is why we don't
+ * need to set EXT4_FREE_BLOCKS_METADATA.
+ */
+ ext4_free_blocks(handle, inode, where[i].bh, 0, 1,
+ EXT4_FREE_BLOCKS_FORGET);
}
- ext4_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks, 0);
+ ext4_free_blocks(handle, inode, 0, le32_to_cpu(where[num].key),
+ blks, 0);
return err;
}
@@ -1019,10 +979,12 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
if (!err)
err = ext4_splice_branch(handle, inode, iblock,
partial, indirect_blks, count);
- else
+ if (err)
goto cleanup;
set_buffer_new(bh_result);
+
+ ext4_update_inode_fsync_trans(handle, inode, 1);
got_it:
map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
if (count > blocks_to_boundary)
@@ -1041,83 +1003,94 @@ out:
return err;
}
-qsize_t ext4_get_reserved_space(struct inode *inode)
+#ifdef CONFIG_QUOTA
+qsize_t *ext4_get_reserved_space(struct inode *inode)
{
- unsigned long long total;
-
- spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
- total = EXT4_I(inode)->i_reserved_data_blocks +
- EXT4_I(inode)->i_reserved_meta_blocks;
- spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
-
- return total;
+ return &EXT4_I(inode)->i_reserved_quota;
}
+#endif
+
/*
* Calculate the number of metadata blocks need to reserve
- * to allocate @blocks for non extent file based file
+ * to allocate a new block at @lblocks for non extent file based file
*/
-static int ext4_indirect_calc_metadata_amount(struct inode *inode, int blocks)
+static int ext4_indirect_calc_metadata_amount(struct inode *inode,
+ sector_t lblock)
{
- int icap = EXT4_ADDR_PER_BLOCK(inode->i_sb);
- int ind_blks, dind_blks, tind_blks;
-
- /* number of new indirect blocks needed */
- ind_blks = (blocks + icap - 1) / icap;
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ int dind_mask = EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1;
+ int blk_bits;
- dind_blks = (ind_blks + icap - 1) / icap;
+ if (lblock < EXT4_NDIR_BLOCKS)
+ return 0;
- tind_blks = 1;
+ lblock -= EXT4_NDIR_BLOCKS;
- return ind_blks + dind_blks + tind_blks;
+ if (ei->i_da_metadata_calc_len &&
+ (lblock & dind_mask) == ei->i_da_metadata_calc_last_lblock) {
+ ei->i_da_metadata_calc_len++;
+ return 0;
+ }
+ ei->i_da_metadata_calc_last_lblock = lblock & dind_mask;
+ ei->i_da_metadata_calc_len = 1;
+ blk_bits = roundup_pow_of_two(lblock + 1);
+ return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1;
}
/*
* Calculate the number of metadata blocks need to reserve
- * to allocate given number of blocks
+ * to allocate a block located at @lblock
*/
-static int ext4_calc_metadata_amount(struct inode *inode, int blocks)
+static int ext4_calc_metadata_amount(struct inode *inode, sector_t lblock)
{
- if (!blocks)
- return 0;
-
if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
- return ext4_ext_calc_metadata_amount(inode, blocks);
+ return ext4_ext_calc_metadata_amount(inode, lblock);
- return ext4_indirect_calc_metadata_amount(inode, blocks);
+ return ext4_indirect_calc_metadata_amount(inode, lblock);
}
+/*
+ * Called with i_data_sem down, which is important since we can call
+ * ext4_discard_preallocations() from here.
+ */
static void ext4_da_update_reserve_space(struct inode *inode, int used)
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- int total, mdb, mdb_free;
-
- spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
- /* recalculate the number of metablocks still need to be reserved */
- total = EXT4_I(inode)->i_reserved_data_blocks - used;
- mdb = ext4_calc_metadata_amount(inode, total);
-
- /* figure out how many metablocks to release */
- BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
- mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb;
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ int mdb_free = 0;
+
+ spin_lock(&ei->i_block_reservation_lock);
+ if (unlikely(used > ei->i_reserved_data_blocks)) {
+ ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d "
+ "with only %d reserved data blocks\n",
+ __func__, inode->i_ino, used,
+ ei->i_reserved_data_blocks);
+ WARN_ON(1);
+ used = ei->i_reserved_data_blocks;
+ }
- if (mdb_free) {
- /* Account for allocated meta_blocks */
- mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks;
+ /* Update per-inode reservations */
+ ei->i_reserved_data_blocks -= used;
+ used += ei->i_allocated_meta_blocks;
+ ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks;
+ ei->i_allocated_meta_blocks = 0;
+ percpu_counter_sub(&sbi->s_dirtyblocks_counter, used);
- /* update fs dirty blocks counter */
+ if (ei->i_reserved_data_blocks == 0) {
+ /*
+ * We can release all of the reserved metadata blocks
+ * only when we have written all of the delayed
+ * allocation blocks.
+ */
+ mdb_free = ei->i_reserved_meta_blocks;
+ ei->i_reserved_meta_blocks = 0;
+ ei->i_da_metadata_calc_len = 0;
percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free);
- EXT4_I(inode)->i_allocated_meta_blocks = 0;
- EXT4_I(inode)->i_reserved_meta_blocks = mdb;
}
-
- /* update per-inode reservations */
- BUG_ON(used > EXT4_I(inode)->i_reserved_data_blocks);
- EXT4_I(inode)->i_reserved_data_blocks -= used;
spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
- /*
- * free those over-booking quota for metadata blocks
- */
+ /* Update quota subsystem */
+ vfs_dq_claim_block(inode, used);
if (mdb_free)
vfs_dq_release_reservation_block(inode, mdb_free);
@@ -1126,7 +1099,8 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
* there aren't any writers on the inode, we can discard the
* inode's preallocations.
*/
- if (!total && (atomic_read(&inode->i_writecount) == 0))
+ if ((ei->i_reserved_data_blocks == 0) &&
+ (atomic_read(&inode->i_writecount) == 0))
ext4_discard_preallocations(inode);
}
@@ -1145,6 +1119,64 @@ static int check_block_validity(struct inode *inode, const char *msg,
}
/*
+ * Return the number of contiguous dirty pages in a given inode
+ * starting at page frame idx.
+ */
+static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
+ unsigned int max_pages)
+{
+ struct address_space *mapping = inode->i_mapping;
+ pgoff_t index;
+ struct pagevec pvec;
+ pgoff_t num = 0;
+ int i, nr_pages, done = 0;
+
+ if (max_pages == 0)
+ return 0;
+ pagevec_init(&pvec, 0);
+ while (!done) {
+ index = idx;
+ nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+ PAGECACHE_TAG_DIRTY,
+ (pgoff_t)PAGEVEC_SIZE);
+ if (nr_pages == 0)
+ break;
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+ struct buffer_head *bh, *head;
+
+ lock_page(page);
+ if (unlikely(page->mapping != mapping) ||
+ !PageDirty(page) ||
+ PageWriteback(page) ||
+ page->index != idx) {
+ done = 1;
+ unlock_page(page);
+ break;
+ }
+ if (page_has_buffers(page)) {
+ bh = head = page_buffers(page);
+ do {
+ if (!buffer_delay(bh) &&
+ !buffer_unwritten(bh))
+ done = 1;
+ bh = bh->b_this_page;
+ } while (!done && (bh != head));
+ }
+ unlock_page(page);
+ if (done)
+ break;
+ idx++;
+ num++;
+ if (num >= max_pages)
+ break;
+ }
+ pagevec_release(&pvec);
+ }
+ return num;
+}
+
+/*
* The ext4_get_blocks() function tries to look up the requested blocks,
* and returns if the blocks are already mapped.
*
@@ -1175,6 +1207,9 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
clear_buffer_mapped(bh);
clear_buffer_unwritten(bh);
+ ext_debug("ext4_get_blocks(): inode %lu, flag %d, max_blocks %u,"
+ "logical block %lu\n", inode->i_ino, flags, max_blocks,
+ (unsigned long)block);
/*
* Try to see if we can get the block without requesting a new
* file system block.
@@ -1471,6 +1506,16 @@ static int do_journal_get_write_access(handle_t *handle,
return ext4_journal_get_write_access(handle, bh);
}
+/*
+ * Truncate blocks that were not used by write. We have to truncate the
+ * pagecache as well so that corresponding buffers get properly unmapped.
+ */
+static void ext4_truncate_failed_write(struct inode *inode)
+{
+ truncate_inode_pages(inode->i_mapping, inode->i_size);
+ ext4_truncate(inode);
+}
+
static int ext4_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
@@ -1536,7 +1581,7 @@ retry:
ext4_journal_stop(handle);
if (pos + len > inode->i_size) {
- ext4_truncate(inode);
+ ext4_truncate_failed_write(inode);
/*
* If truncate failed early the inode might
* still be on the orphan list; we need to
@@ -1646,7 +1691,7 @@ static int ext4_ordered_write_end(struct file *file,
ret = ret2;
if (pos + len > inode->i_size) {
- ext4_truncate(inode);
+ ext4_truncate_failed_write(inode);
/*
* If truncate failed early the inode might still be
* on the orphan list; we need to make sure the inode
@@ -1688,7 +1733,7 @@ static int ext4_writeback_write_end(struct file *file,
ret = ret2;
if (pos + len > inode->i_size) {
- ext4_truncate(inode);
+ ext4_truncate_failed_write(inode);
/*
* If truncate failed early the inode might still be
* on the orphan list; we need to make sure the inode
@@ -1751,7 +1796,7 @@ static int ext4_journalled_write_end(struct file *file,
if (!ret)
ret = ret2;
if (pos + len > inode->i_size) {
- ext4_truncate(inode);
+ ext4_truncate_failed_write(inode);
/*
* If truncate failed early the inode might still be
* on the orphan list; we need to make sure the inode
@@ -1764,11 +1809,15 @@ static int ext4_journalled_write_end(struct file *file,
return ret ? ret : copied;
}
-static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
+/*
+ * Reserve a single block located at lblock
+ */
+static int ext4_da_reserve_space(struct inode *inode, sector_t lblock)
{
int retries = 0;
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- unsigned long md_needed, mdblocks, total = 0;
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ unsigned long md_needed, md_reserved;
/*
* recalculate the amount of metadata blocks to reserve
@@ -1776,86 +1825,90 @@ static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
* worse case is one extent per block
*/
repeat:
- spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
- total = EXT4_I(inode)->i_reserved_data_blocks + nrblocks;
- mdblocks = ext4_calc_metadata_amount(inode, total);
- BUG_ON(mdblocks < EXT4_I(inode)->i_reserved_meta_blocks);
-
- md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks;
- total = md_needed + nrblocks;
+ spin_lock(&ei->i_block_reservation_lock);
+ md_reserved = ei->i_reserved_meta_blocks;
+ md_needed = ext4_calc_metadata_amount(inode, lblock);
+ spin_unlock(&ei->i_block_reservation_lock);
/*
* Make quota reservation here to prevent quota overflow
* later. Real quota accounting is done at pages writeout
* time.
*/
- if (vfs_dq_reserve_block(inode, total)) {
- spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+ if (vfs_dq_reserve_block(inode, md_needed + 1)) {
+ /*
+ * We tend to badly over-estimate the amount of
+ * metadata blocks which are needed, so if we have
+ * reserved any metadata blocks, try to force out the
+ * inode and see if we have any better luck.
+ */
+ if (md_reserved && retries++ <= 3)
+ goto retry;
return -EDQUOT;
}
- if (ext4_claim_free_blocks(sbi, total)) {
- spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+ if (ext4_claim_free_blocks(sbi, md_needed + 1)) {
+ vfs_dq_release_reservation_block(inode, md_needed + 1);
if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
+ retry:
+ if (md_reserved)
+ write_inode_now(inode, (retries == 3));
yield();
goto repeat;
}
- vfs_dq_release_reservation_block(inode, total);
return -ENOSPC;
}
- EXT4_I(inode)->i_reserved_data_blocks += nrblocks;
- EXT4_I(inode)->i_reserved_meta_blocks = mdblocks;
+ spin_lock(&ei->i_block_reservation_lock);
+ ei->i_reserved_data_blocks++;
+ ei->i_reserved_meta_blocks += md_needed;
+ spin_unlock(&ei->i_block_reservation_lock);
- spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
return 0; /* success */
}
static void ext4_da_release_space(struct inode *inode, int to_free)
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- int total, mdb, mdb_free, release;
+ struct ext4_inode_info *ei = EXT4_I(inode);
if (!to_free)
return; /* Nothing to release, exit */
spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
- if (!EXT4_I(inode)->i_reserved_data_blocks) {
+ if (unlikely(to_free > ei->i_reserved_data_blocks)) {
/*
- * if there is no reserved blocks, but we try to free some
- * then the counter is messed up somewhere.
- * but since this function is called from invalidate
- * page, it's harmless to return without any action
+ * if there aren't enough reserved blocks, then the
+ * counter is messed up somewhere. Since this
+ * function is called from invalidate page, it's
+ * harmless to return without any action.
*/
- printk(KERN_INFO "ext4 delalloc try to release %d reserved "
- "blocks for inode %lu, but there is no reserved "
- "data blocks\n", to_free, inode->i_ino);
- spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
- return;
+ ext4_msg(inode->i_sb, KERN_NOTICE, "ext4_da_release_space: "
+ "ino %lu, to_free %d with only %d reserved "
+ "data blocks\n", inode->i_ino, to_free,
+ ei->i_reserved_data_blocks);
+ WARN_ON(1);
+ to_free = ei->i_reserved_data_blocks;
}
+ ei->i_reserved_data_blocks -= to_free;
- /* recalculate the number of metablocks still need to be reserved */
- total = EXT4_I(inode)->i_reserved_data_blocks - to_free;
- mdb = ext4_calc_metadata_amount(inode, total);
-
- /* figure out how many metablocks to release */
- BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
- mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb;
-
- release = to_free + mdb_free;
-
- /* update fs dirty blocks counter for truncate case */
- percpu_counter_sub(&sbi->s_dirtyblocks_counter, release);
+ if (ei->i_reserved_data_blocks == 0) {
+ /*
+ * We can release all of the reserved metadata blocks
+ * only when we have written all of the delayed
+ * allocation blocks.
+ */
+ to_free += ei->i_reserved_meta_blocks;
+ ei->i_reserved_meta_blocks = 0;
+ ei->i_da_metadata_calc_len = 0;
+ }
- /* update per-inode reservations */
- BUG_ON(to_free > EXT4_I(inode)->i_reserved_data_blocks);
- EXT4_I(inode)->i_reserved_data_blocks -= to_free;
+ /* update fs dirty blocks counter */
+ percpu_counter_sub(&sbi->s_dirtyblocks_counter, to_free);
- BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
- EXT4_I(inode)->i_reserved_meta_blocks = mdb;
spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
- vfs_dq_release_reservation_block(inode, release);
+ vfs_dq_release_reservation_block(inode, to_free);
}
static void ext4_da_page_release_reservation(struct page *page,
@@ -2092,18 +2145,18 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
static void ext4_print_free_blocks(struct inode *inode)
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- printk(KERN_EMERG "Total free blocks count %lld\n",
- ext4_count_free_blocks(inode->i_sb));
- printk(KERN_EMERG "Free/Dirty block details\n");
- printk(KERN_EMERG "free_blocks=%lld\n",
- (long long)percpu_counter_sum(&sbi->s_freeblocks_counter));
- printk(KERN_EMERG "dirty_blocks=%lld\n",
- (long long)percpu_counter_sum(&sbi->s_dirtyblocks_counter));
- printk(KERN_EMERG "Block reservation details\n");
- printk(KERN_EMERG "i_reserved_data_blocks=%u\n",
- EXT4_I(inode)->i_reserved_data_blocks);
- printk(KERN_EMERG "i_reserved_meta_blocks=%u\n",
- EXT4_I(inode)->i_reserved_meta_blocks);
+ printk(KERN_CRIT "Total free blocks count %lld\n",
+ ext4_count_free_blocks(inode->i_sb));
+ printk(KERN_CRIT "Free/Dirty block details\n");
+ printk(KERN_CRIT "free_blocks=%lld\n",
+ (long long) percpu_counter_sum(&sbi->s_freeblocks_counter));
+ printk(KERN_CRIT "dirty_blocks=%lld\n",
+ (long long) percpu_counter_sum(&sbi->s_dirtyblocks_counter));
+ printk(KERN_CRIT "Block reservation details\n");
+ printk(KERN_CRIT "i_reserved_data_blocks=%u\n",
+ EXT4_I(inode)->i_reserved_data_blocks);
+ printk(KERN_CRIT "i_reserved_meta_blocks=%u\n",
+ EXT4_I(inode)->i_reserved_meta_blocks);
return;
}
@@ -2189,14 +2242,14 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
* writepage and writepages will again try to write
* the same.
*/
- printk(KERN_EMERG "%s block allocation failed for inode %lu "
- "at logical offset %llu with max blocks "
- "%zd with error %d\n",
- __func__, mpd->inode->i_ino,
- (unsigned long long)next,
- mpd->b_size >> mpd->inode->i_blkbits, err);
- printk(KERN_EMERG "This should not happen.!! "
- "Data will be lost\n");
+ ext4_msg(mpd->inode->i_sb, KERN_CRIT,
+ "delayed block allocation failed for inode %lu at "
+ "logical offset %llu with max blocks %zd with "
+ "error %d\n", mpd->inode->i_ino,
+ (unsigned long long) next,
+ mpd->b_size >> mpd->inode->i_blkbits, err);
+ printk(KERN_CRIT "This should not happen!! "
+ "Data will be lost\n");
if (err == -ENOSPC) {
ext4_print_free_blocks(mpd->inode);
}
@@ -2461,7 +2514,7 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
* XXX: __block_prepare_write() unmaps passed block,
* is it OK?
*/
- ret = ext4_da_reserve_space(inode, 1);
+ ret = ext4_da_reserve_space(inode, iblock);
if (ret)
/* not enough space to reserve */
return ret;
@@ -2537,7 +2590,6 @@ static int bput_one(handle_t *handle, struct buffer_head *bh)
}
static int __ext4_journalled_writepage(struct page *page,
- struct writeback_control *wbc,
unsigned int len)
{
struct address_space *mapping = page->mapping;
@@ -2695,7 +2747,7 @@ static int ext4_writepage(struct page *page,
* doesn't seem much point in redirtying the page here.
*/
ClearPageChecked(page);
- return __ext4_journalled_writepage(page, wbc, len);
+ return __ext4_journalled_writepage(page, len);
}
if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
@@ -2725,7 +2777,7 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
* number of contiguous block. So we will limit
* number of contiguous block to a sane value
*/
- if (!(inode->i_flags & EXT4_EXTENTS_FL) &&
+ if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) &&
(max_blocks > EXT4_MAX_TRANS_DATA))
max_blocks = EXT4_MAX_TRANS_DATA;
@@ -2743,8 +2795,10 @@ static int ext4_da_writepages(struct address_space *mapping,
int no_nrwrite_index_update;
int pages_written = 0;
long pages_skipped;
+ unsigned int max_pages;
int range_cyclic, cycled = 1, io_done = 0;
- int needed_blocks, ret = 0, nr_to_writebump = 0;
+ int needed_blocks, ret = 0;
+ long desired_nr_to_write, nr_to_writebump = 0;
loff_t range_start = wbc->range_start;
struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
@@ -2771,16 +2825,6 @@ static int ext4_da_writepages(struct address_space *mapping,
if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED))
return -EROFS;
- /*
- * Make sure nr_to_write is >= sbi->s_mb_stream_request
- * This make sure small files blocks are allocated in
- * single attempt. This ensure that small files
- * get less fragmented.
- */
- if (wbc->nr_to_write < sbi->s_mb_stream_request) {
- nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write;
- wbc->nr_to_write = sbi->s_mb_stream_request;
- }
if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
range_whole = 1;
@@ -2795,6 +2839,36 @@ static int ext4_da_writepages(struct address_space *mapping,
} else
index = wbc->range_start >> PAGE_CACHE_SHIFT;
+ /*
+ * This works around two forms of stupidity. The first is in
+ * the writeback code, which caps the maximum number of pages
+ * written to be 1024 pages. This is wrong on multiple
+ * levels; different architectues have a different page size,
+ * which changes the maximum amount of data which gets
+ * written. Secondly, 4 megabytes is way too small. XFS
+ * forces this value to be 16 megabytes by multiplying
+ * nr_to_write parameter by four, and then relies on its
+ * allocator to allocate larger extents to make them
+ * contiguous. Unfortunately this brings us to the second
+ * stupidity, which is that ext4's mballoc code only allocates
+ * at most 2048 blocks. So we force contiguous writes up to
+ * the number of dirty blocks in the inode, or
+ * sbi->max_writeback_mb_bump whichever is smaller.
+ */
+ max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT);
+ if (!range_cyclic && range_whole)
+ desired_nr_to_write = wbc->nr_to_write * 8;
+ else
+ desired_nr_to_write = ext4_num_dirty_pages(inode, index,
+ max_pages);
+ if (desired_nr_to_write > max_pages)
+ desired_nr_to_write = max_pages;
+
+ if (wbc->nr_to_write < desired_nr_to_write) {
+ nr_to_writebump = desired_nr_to_write - wbc->nr_to_write;
+ wbc->nr_to_write = desired_nr_to_write;
+ }
+
mpd.wbc = wbc;
mpd.inode = mapping->host;
@@ -2822,10 +2896,9 @@ retry:
handle = ext4_journal_start(inode, needed_blocks);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
- printk(KERN_CRIT "%s: jbd2_start: "
+ ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
"%ld pages, ino %lu; err %d\n", __func__,
wbc->nr_to_write, inode->i_ino, ret);
- dump_stack();
goto out_writepages;
}
@@ -2849,7 +2922,7 @@ retry:
ret = write_cache_pages(mapping, wbc, __mpage_da_writepage,
&mpd);
/*
- * If we have a contigous extent of pages and we
+ * If we have a contiguous extent of pages and we
* haven't done the I/O yet, map the blocks and submit
* them for I/O.
*/
@@ -2897,9 +2970,10 @@ retry:
goto retry;
}
if (pages_skipped != wbc->pages_skipped)
- printk(KERN_EMERG "This should not happen leaving %s "
- "with nr_to_write = %ld ret = %d\n",
- __func__, wbc->nr_to_write, ret);
+ ext4_msg(inode->i_sb, KERN_CRIT,
+ "This should not happen leaving %s "
+ "with nr_to_write = %ld ret = %d\n",
+ __func__, wbc->nr_to_write, ret);
/* Update index */
index += pages_written;
@@ -2939,11 +3013,18 @@ static int ext4_nonda_switch(struct super_block *sb)
if (2 * free_blocks < 3 * dirty_blocks ||
free_blocks < (dirty_blocks + EXT4_FREEBLOCKS_WATERMARK)) {
/*
- * free block count is less that 150% of dirty blocks
- * or free blocks is less that watermark
+ * free block count is less than 150% of dirty blocks
+ * or free blocks is less than watermark
*/
return 1;
}
+ /*
+ * Even if we don't switch but are nearing capacity,
+ * start pushing delalloc when 1/2 of free blocks are dirty.
+ */
+ if (free_blocks < 2 * dirty_blocks)
+ writeback_inodes_sb_if_idle(sb);
+
return 0;
}
@@ -3005,7 +3086,7 @@ retry:
* i_size_read because we hold i_mutex.
*/
if (pos + len > inode->i_size)
- ext4_truncate(inode);
+ ext4_truncate_failed_write(inode);
}
if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
@@ -3272,6 +3353,8 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
}
/*
+ * O_DIRECT for ext3 (or indirect map) based files
+ *
* If the O_DIRECT write will extend the file then add this inode to the
* orphan list. So recovery will truncate it back to the original size
* if the machine crashes during the write.
@@ -3280,7 +3363,7 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
* crashes then stale disk data _may_ be exposed inside the file. But current
* VFS code falls back into buffered path in that case so we are safe.
*/
-static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
+static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
const struct iovec *iov, loff_t offset,
unsigned long nr_segs)
{
@@ -3291,6 +3374,7 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
ssize_t ret;
int orphan = 0;
size_t count = iov_length(iov, nr_segs);
+ int retries = 0;
if (rw == WRITE) {
loff_t final_size = offset + count;
@@ -3313,9 +3397,12 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
}
}
+retry:
ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
offset, nr_segs,
ext4_get_block, NULL);
+ if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+ goto retry;
if (orphan) {
int err;
@@ -3354,6 +3441,364 @@ out:
return ret;
}
+static int ext4_get_block_dio_write(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ handle_t *handle = NULL;
+ int ret = 0;
+ unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
+ int dio_credits;
+
+ ext4_debug("ext4_get_block_dio_write: inode %lu, create flag %d\n",
+ inode->i_ino, create);
+ /*
+ * DIO VFS code passes create = 0 flag for write to
+ * the middle of file. It does this to avoid block
+ * allocation for holes, to prevent expose stale data
+ * out when there is parallel buffered read (which does
+ * not hold the i_mutex lock) while direct IO write has
+ * not completed. DIO request on holes finally falls back
+ * to buffered IO for this reason.
+ *
+ * For ext4 extent based file, since we support fallocate,
+ * new allocated extent as uninitialized, for holes, we
+ * could fallocate blocks for holes, thus parallel
+ * buffered IO read will zero out the page when read on
+ * a hole while parallel DIO write to the hole has not completed.
+ *
+ * when we come here, we know it's a direct IO write to
+ * to the middle of file (<i_size)
+ * so it's safe to override the create flag from VFS.
+ */
+ create = EXT4_GET_BLOCKS_DIO_CREATE_EXT;
+
+ if (max_blocks > DIO_MAX_BLOCKS)
+ max_blocks = DIO_MAX_BLOCKS;
+ dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
+ handle = ext4_journal_start(inode, dio_credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto out;
+ }
+ ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,
+ create);
+ if (ret > 0) {
+ bh_result->b_size = (ret << inode->i_blkbits);
+ ret = 0;
+ }
+ ext4_journal_stop(handle);
+out:
+ return ret;
+}
+
+static void ext4_free_io_end(ext4_io_end_t *io)
+{
+ BUG_ON(!io);
+ iput(io->inode);
+ kfree(io);
+}
+static void dump_aio_dio_list(struct inode * inode)
+{
+#ifdef EXT4_DEBUG
+ struct list_head *cur, *before, *after;
+ ext4_io_end_t *io, *io0, *io1;
+
+ if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){
+ ext4_debug("inode %lu aio dio list is empty\n", inode->i_ino);
+ return;
+ }
+
+ ext4_debug("Dump inode %lu aio_dio_completed_IO list \n", inode->i_ino);
+ list_for_each_entry(io, &EXT4_I(inode)->i_aio_dio_complete_list, list){
+ cur = &io->list;
+ before = cur->prev;
+ io0 = container_of(before, ext4_io_end_t, list);
+ after = cur->next;
+ io1 = container_of(after, ext4_io_end_t, list);
+
+ ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
+ io, inode->i_ino, io0, io1);
+ }
+#endif
+}
+
+/*
+ * check a range of space and convert unwritten extents to written.
+ */
+static int ext4_end_aio_dio_nolock(ext4_io_end_t *io)
+{
+ struct inode *inode = io->inode;
+ loff_t offset = io->offset;
+ size_t size = io->size;
+ int ret = 0;
+
+ ext4_debug("end_aio_dio_onlock: io 0x%p from inode %lu,list->next 0x%p,"
+ "list->prev 0x%p\n",
+ io, inode->i_ino, io->list.next, io->list.prev);
+
+ if (list_empty(&io->list))
+ return ret;
+
+ if (io->flag != DIO_AIO_UNWRITTEN)
+ return ret;
+
+ if (offset + size <= i_size_read(inode))
+ ret = ext4_convert_unwritten_extents(inode, offset, size);
+
+ if (ret < 0) {
+ printk(KERN_EMERG "%s: failed to convert unwritten"
+ "extents to written extents, error is %d"
+ " io is still on inode %lu aio dio list\n",
+ __func__, ret, inode->i_ino);
+ return ret;
+ }
+
+ /* clear the DIO AIO unwritten flag */
+ io->flag = 0;
+ return ret;
+}
+/*
+ * work on completed aio dio IO, to convert unwritten extents to extents
+ */
+static void ext4_end_aio_dio_work(struct work_struct *work)
+{
+ ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
+ struct inode *inode = io->inode;
+ int ret = 0;
+
+ mutex_lock(&inode->i_mutex);
+ ret = ext4_end_aio_dio_nolock(io);
+ if (ret >= 0) {
+ if (!list_empty(&io->list))
+ list_del_init(&io->list);
+ ext4_free_io_end(io);
+ }
+ mutex_unlock(&inode->i_mutex);
+}
+/*
+ * This function is called from ext4_sync_file().
+ *
+ * When AIO DIO IO is completed, the work to convert unwritten
+ * extents to written is queued on workqueue but may not get immediately
+ * scheduled. When fsync is called, we need to ensure the
+ * conversion is complete before fsync returns.
+ * The inode keeps track of a list of completed AIO from DIO path
+ * that might needs to do the conversion. This function walks through
+ * the list and convert the related unwritten extents to written.
+ */
+int flush_aio_dio_completed_IO(struct inode *inode)
+{
+ ext4_io_end_t *io;
+ int ret = 0;
+ int ret2 = 0;
+
+ if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list))
+ return ret;
+
+ dump_aio_dio_list(inode);
+ while (!list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){
+ io = list_entry(EXT4_I(inode)->i_aio_dio_complete_list.next,
+ ext4_io_end_t, list);
+ /*
+ * Calling ext4_end_aio_dio_nolock() to convert completed
+ * IO to written.
+ *
+ * When ext4_sync_file() is called, run_queue() may already
+ * about to flush the work corresponding to this io structure.
+ * It will be upset if it founds the io structure related
+ * to the work-to-be schedule is freed.
+ *
+ * Thus we need to keep the io structure still valid here after
+ * convertion finished. The io structure has a flag to
+ * avoid double converting from both fsync and background work
+ * queue work.
+ */
+ ret = ext4_end_aio_dio_nolock(io);
+ if (ret < 0)
+ ret2 = ret;
+ else
+ list_del_init(&io->list);
+ }
+ return (ret2 < 0) ? ret2 : 0;
+}
+
+static ext4_io_end_t *ext4_init_io_end (struct inode *inode)
+{
+ ext4_io_end_t *io = NULL;
+
+ io = kmalloc(sizeof(*io), GFP_NOFS);
+
+ if (io) {
+ igrab(inode);
+ io->inode = inode;
+ io->flag = 0;
+ io->offset = 0;
+ io->size = 0;
+ io->error = 0;
+ INIT_WORK(&io->work, ext4_end_aio_dio_work);
+ INIT_LIST_HEAD(&io->list);
+ }
+
+ return io;
+}
+
+static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
+ ssize_t size, void *private)
+{
+ ext4_io_end_t *io_end = iocb->private;
+ struct workqueue_struct *wq;
+
+ /* if not async direct IO or dio with 0 bytes write, just return */
+ if (!io_end || !size)
+ return;
+
+ ext_debug("ext4_end_io_dio(): io_end 0x%p"
+ "for inode %lu, iocb 0x%p, offset %llu, size %llu\n",
+ iocb->private, io_end->inode->i_ino, iocb, offset,
+ size);
+
+ /* if not aio dio with unwritten extents, just free io and return */
+ if (io_end->flag != DIO_AIO_UNWRITTEN){
+ ext4_free_io_end(io_end);
+ iocb->private = NULL;
+ return;
+ }
+
+ io_end->offset = offset;
+ io_end->size = size;
+ wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
+
+ /* queue the work to convert unwritten extents to written */
+ queue_work(wq, &io_end->work);
+
+ /* Add the io_end to per-inode completed aio dio list*/
+ list_add_tail(&io_end->list,
+ &EXT4_I(io_end->inode)->i_aio_dio_complete_list);
+ iocb->private = NULL;
+}
+/*
+ * For ext4 extent files, ext4 will do direct-io write to holes,
+ * preallocated extents, and those write extend the file, no need to
+ * fall back to buffered IO.
+ *
+ * For holes, we fallocate those blocks, mark them as unintialized
+ * If those blocks were preallocated, we mark sure they are splited, but
+ * still keep the range to write as unintialized.
+ *
+ * The unwrritten extents will be converted to written when DIO is completed.
+ * For async direct IO, since the IO may still pending when return, we
+ * set up an end_io call back function, which will do the convertion
+ * when async direct IO completed.
+ *
+ * If the O_DIRECT write will extend the file then add this inode to the
+ * orphan list. So recovery will truncate it back to the original size
+ * if the machine crashes during the write.
+ *
+ */
+static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
+ const struct iovec *iov, loff_t offset,
+ unsigned long nr_segs)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file->f_mapping->host;
+ ssize_t ret;
+ size_t count = iov_length(iov, nr_segs);
+
+ loff_t final_size = offset + count;
+ if (rw == WRITE && final_size <= inode->i_size) {
+ /*
+ * We could direct write to holes and fallocate.
+ *
+ * Allocated blocks to fill the hole are marked as uninitialized
+ * to prevent paralel buffered read to expose the stale data
+ * before DIO complete the data IO.
+ *
+ * As to previously fallocated extents, ext4 get_block
+ * will just simply mark the buffer mapped but still
+ * keep the extents uninitialized.
+ *
+ * for non AIO case, we will convert those unwritten extents
+ * to written after return back from blockdev_direct_IO.
+ *
+ * for async DIO, the conversion needs to be defered when
+ * the IO is completed. The ext4 end_io callback function
+ * will be called to take care of the conversion work.
+ * Here for async case, we allocate an io_end structure to
+ * hook to the iocb.
+ */
+ iocb->private = NULL;
+ EXT4_I(inode)->cur_aio_dio = NULL;
+ if (!is_sync_kiocb(iocb)) {
+ iocb->private = ext4_init_io_end(inode);
+ if (!iocb->private)
+ return -ENOMEM;
+ /*
+ * we save the io structure for current async
+ * direct IO, so that later ext4_get_blocks()
+ * could flag the io structure whether there
+ * is a unwritten extents needs to be converted
+ * when IO is completed.
+ */
+ EXT4_I(inode)->cur_aio_dio = iocb->private;
+ }
+
+ ret = blockdev_direct_IO(rw, iocb, inode,
+ inode->i_sb->s_bdev, iov,
+ offset, nr_segs,
+ ext4_get_block_dio_write,
+ ext4_end_io_dio);
+ if (iocb->private)
+ EXT4_I(inode)->cur_aio_dio = NULL;
+ /*
+ * The io_end structure takes a reference to the inode,
+ * that structure needs to be destroyed and the
+ * reference to the inode need to be dropped, when IO is
+ * complete, even with 0 byte write, or failed.
+ *
+ * In the successful AIO DIO case, the io_end structure will be
+ * desctroyed and the reference to the inode will be dropped
+ * after the end_io call back function is called.
+ *
+ * In the case there is 0 byte write, or error case, since
+ * VFS direct IO won't invoke the end_io call back function,
+ * we need to free the end_io structure here.
+ */
+ if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) {
+ ext4_free_io_end(iocb->private);
+ iocb->private = NULL;
+ } else if (ret > 0 && (EXT4_I(inode)->i_state &
+ EXT4_STATE_DIO_UNWRITTEN)) {
+ int err;
+ /*
+ * for non AIO case, since the IO is already
+ * completed, we could do the convertion right here
+ */
+ err = ext4_convert_unwritten_extents(inode,
+ offset, ret);
+ if (err < 0)
+ ret = err;
+ EXT4_I(inode)->i_state &= ~EXT4_STATE_DIO_UNWRITTEN;
+ }
+ return ret;
+ }
+
+ /* for write the the end of file case, we fall back to old way */
+ return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
+}
+
+static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
+ const struct iovec *iov, loff_t offset,
+ unsigned long nr_segs)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file->f_mapping->host;
+
+ if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
+ return ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
+
+ return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
+}
+
/*
* Pages can be marked dirty completely asynchronously from ext4's journalling
* activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do
@@ -3614,7 +4059,7 @@ static Indirect *ext4_find_shared(struct inode *inode, int depth,
int k, err;
*top = 0;
- /* Make k index the deepest non-null offest + 1 */
+ /* Make k index the deepest non-null offset + 1 */
for (k = depth; k > 1 && !offsets[k-1]; k--)
;
partial = ext4_get_branch(inode, k, offsets, chain, &err);
@@ -3670,6 +4115,11 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
__le32 *last)
{
__le32 *p;
+ int flags = EXT4_FREE_BLOCKS_FORGET;
+
+ if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
+ flags |= EXT4_FREE_BLOCKS_METADATA;
+
if (try_to_extend_transaction(handle, inode)) {
if (bh) {
BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
@@ -3684,27 +4134,10 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
}
}
- /*
- * Any buffers which are on the journal will be in memory. We
- * find them on the hash table so jbd2_journal_revoke() will
- * run jbd2_journal_forget() on them. We've already detached
- * each block from the file, so bforget() in
- * jbd2_journal_forget() should be safe.
- *
- * AKPM: turn on bforget in jbd2_journal_forget()!!!
- */
- for (p = first; p < last; p++) {
- u32 nr = le32_to_cpu(*p);
- if (nr) {
- struct buffer_head *tbh;
+ for (p = first; p < last; p++)
+ *p = 0;
- *p = 0;
- tbh = sb_find_get_block(inode->i_sb, nr);
- ext4_forget(handle, 0, inode, tbh, nr);
- }
- }
-
- ext4_free_blocks(handle, inode, block_to_free, count, 0);
+ ext4_free_blocks(handle, inode, 0, block_to_free, count, flags);
}
/**
@@ -3892,7 +4325,8 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
blocks_for_truncate(inode));
}
- ext4_free_blocks(handle, inode, nr, 1, 1);
+ ext4_free_blocks(handle, inode, 0, nr, 1,
+ EXT4_FREE_BLOCKS_METADATA);
if (parent_bh) {
/*
@@ -4331,8 +4765,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
struct ext4_iloc iloc;
struct ext4_inode *raw_inode;
struct ext4_inode_info *ei;
- struct buffer_head *bh;
struct inode *inode;
+ journal_t *journal = EXT4_SB(sb)->s_journal;
long ret;
int block;
@@ -4343,11 +4777,11 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
return inode;
ei = EXT4_I(inode);
+ iloc.bh = 0;
ret = __ext4_get_inode_loc(inode, &iloc, 0);
if (ret < 0)
goto bad_inode;
- bh = iloc.bh;
raw_inode = ext4_raw_inode(&iloc);
inode->i_mode = le16_to_cpu(raw_inode->i_mode);
inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
@@ -4370,7 +4804,6 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
if (inode->i_mode == 0 ||
!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) {
/* this inode is deleted */
- brelse(bh);
ret = -ESTALE;
goto bad_inode;
}
@@ -4387,6 +4820,9 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
inode->i_size = ext4_isize(raw_inode);
ei->i_disksize = inode->i_size;
+#ifdef CONFIG_QUOTA
+ ei->i_reserved_quota = 0;
+#endif
inode->i_generation = le32_to_cpu(raw_inode->i_generation);
ei->i_block_group = iloc.block_group;
ei->i_last_alloc_group = ~0;
@@ -4398,11 +4834,35 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
ei->i_data[block] = raw_inode->i_block[block];
INIT_LIST_HEAD(&ei->i_orphan);
+ /*
+ * Set transaction id's of transactions that have to be committed
+ * to finish f[data]sync. We set them to currently running transaction
+ * as we cannot be sure that the inode or some of its metadata isn't
+ * part of the transaction - the inode could have been reclaimed and
+ * now it is reread from disk.
+ */
+ if (journal) {
+ transaction_t *transaction;
+ tid_t tid;
+
+ spin_lock(&journal->j_state_lock);
+ if (journal->j_running_transaction)
+ transaction = journal->j_running_transaction;
+ else
+ transaction = journal->j_committing_transaction;
+ if (transaction)
+ tid = transaction->t_tid;
+ else
+ tid = journal->j_commit_sequence;
+ spin_unlock(&journal->j_state_lock);
+ ei->i_sync_tid = tid;
+ ei->i_datasync_tid = tid;
+ }
+
if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
EXT4_INODE_SIZE(inode->i_sb)) {
- brelse(bh);
ret = -EIO;
goto bad_inode;
}
@@ -4434,10 +4894,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
ret = 0;
if (ei->i_file_acl &&
- ((ei->i_file_acl <
- (le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) +
- EXT4_SB(sb)->s_gdb_count)) ||
- (ei->i_file_acl >= ext4_blocks_count(EXT4_SB(sb)->s_es)))) {
+ !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) {
ext4_error(sb, __func__,
"bad extended attribute block %llu in inode #%lu",
ei->i_file_acl, inode->i_ino);
@@ -4455,10 +4912,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
/* Validate block references which are part of inode */
ret = ext4_check_inode_blockref(inode);
}
- if (ret) {
- brelse(bh);
+ if (ret)
goto bad_inode;
- }
if (S_ISREG(inode->i_mode)) {
inode->i_op = &ext4_file_inode_operations;
@@ -4486,7 +4941,6 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
init_special_inode(inode, inode->i_mode,
new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
} else {
- brelse(bh);
ret = -EIO;
ext4_error(inode->i_sb, __func__,
"bogus i_mode (%o) for inode=%lu",
@@ -4499,6 +4953,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
return inode;
bad_inode:
+ brelse(iloc.bh);
iget_failed(inode);
return ERR_PTR(ret);
}
@@ -4551,8 +5006,7 @@ static int ext4_inode_blocks_set(handle_t *handle,
*/
static int ext4_do_update_inode(handle_t *handle,
struct inode *inode,
- struct ext4_iloc *iloc,
- int do_sync)
+ struct ext4_iloc *iloc)
{
struct ext4_inode *raw_inode = ext4_raw_inode(iloc);
struct ext4_inode_info *ei = EXT4_I(inode);
@@ -4653,24 +5107,13 @@ static int ext4_do_update_inode(handle_t *handle,
raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
}
- /*
- * If we're not using a journal and we were called from
- * ext4_write_inode() to sync the inode (making do_sync true),
- * we can just use sync_dirty_buffer() directly to do our dirty
- * work. Testing s_journal here is a bit redundant but it's
- * worth it to avoid potential future trouble.
- */
- if (EXT4_SB(inode->i_sb)->s_journal == NULL && do_sync) {
- BUFFER_TRACE(bh, "call sync_dirty_buffer");
- sync_dirty_buffer(bh);
- } else {
- BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
- rc = ext4_handle_dirty_metadata(handle, inode, bh);
- if (!err)
- err = rc;
- }
+ BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+ rc = ext4_handle_dirty_metadata(handle, inode, bh);
+ if (!err)
+ err = rc;
ei->i_state &= ~EXT4_STATE_NEW;
+ ext4_update_inode_fsync_trans(handle, inode, 0);
out_brelse:
brelse(bh);
ext4_std_error(inode->i_sb, err);
@@ -4712,7 +5155,7 @@ out_brelse:
* `stuff()' is running, and the new i_size will be lost. Plus the inode
* will no longer be on the superblock's dirty inode list.
*/
-int ext4_write_inode(struct inode *inode, int wait)
+int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
{
int err;
@@ -4726,7 +5169,7 @@ int ext4_write_inode(struct inode *inode, int wait)
return -EIO;
}
- if (!wait)
+ if (wbc->sync_mode != WB_SYNC_ALL)
return 0;
err = ext4_force_commit(inode->i_sb);
@@ -4736,8 +5179,16 @@ int ext4_write_inode(struct inode *inode, int wait)
err = ext4_get_inode_loc(inode, &iloc);
if (err)
return err;
- err = ext4_do_update_inode(EXT4_NOJOURNAL_HANDLE,
- inode, &iloc, wait);
+ if (wbc->sync_mode == WB_SYNC_ALL)
+ sync_dirty_buffer(iloc.bh);
+ if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
+ ext4_error(inode->i_sb, __func__,
+ "IO error syncing inode, "
+ "inode=%lu, block=%llu",
+ inode->i_ino,
+ (unsigned long long)iloc.bh->b_blocknr);
+ err = -EIO;
+ }
}
return err;
}
@@ -4782,8 +5233,8 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
/* (user+group)*(old+new) structure, inode write (sb,
* inode block, ? - but truncate inode update has it) */
- handle = ext4_journal_start(inode, 2*(EXT4_QUOTA_INIT_BLOCKS(inode->i_sb)+
- EXT4_QUOTA_DEL_BLOCKS(inode->i_sb))+3);
+ handle = ext4_journal_start(inode, (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+
+ EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb))+3);
if (IS_ERR(handle)) {
error = PTR_ERR(handle);
goto err_out;
@@ -4931,7 +5382,7 @@ static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
* worse case, the indexs blocks spread over different block groups
*
* If datablocks are discontiguous, they are possible to spread over
- * different block groups too. If they are contiugous, with flexbg,
+ * different block groups too. If they are contiuguous, with flexbg,
* they could still across block group boundary.
*
* Also account for superblock, inode, quota and xattr blocks
@@ -5007,7 +5458,7 @@ int ext4_writepage_trans_blocks(struct inode *inode)
* Calculate the journal credits for a chunk of data modification.
*
* This is called from DIO, fallocate or whoever calling
- * ext4_get_blocks() to map/allocate a chunk of contigous disk blocks.
+ * ext4_get_blocks() to map/allocate a chunk of contiguous disk blocks.
*
* journal buffers for data blocks are not included here, as DIO
* and fallocate do no need to journal data buffers.
@@ -5033,7 +5484,7 @@ int ext4_mark_iloc_dirty(handle_t *handle,
get_bh(iloc->bh);
/* ext4_do_update_inode() does jbd2_journal_dirty_metadata */
- err = ext4_do_update_inode(handle, inode, iloc, 0);
+ err = ext4_do_update_inode(handle, inode, iloc);
put_bh(iloc->bh);
return err;
}
@@ -5177,27 +5628,14 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
*/
void ext4_dirty_inode(struct inode *inode)
{
- handle_t *current_handle = ext4_journal_current_handle();
handle_t *handle;
- if (!ext4_handle_valid(current_handle)) {
- ext4_mark_inode_dirty(current_handle, inode);
- return;
- }
-
handle = ext4_journal_start(inode, 2);
if (IS_ERR(handle))
goto out;
- if (current_handle &&
- current_handle->h_transaction != handle->h_transaction) {
- /* This task has a transaction open against a different fs */
- printk(KERN_EMERG "%s: transactions do not match!\n",
- __func__);
- } else {
- jbd_debug(5, "marking dirty. outer handle=%p\n",
- current_handle);
- ext4_mark_inode_dirty(handle, inode);
- }
+
+ ext4_mark_inode_dirty(handle, inode);
+
ext4_journal_stop(handle);
out:
return;
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index c1cdf613e725..b63d193126db 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -221,31 +221,38 @@ setversion_out:
struct file *donor_filp;
int err;
+ if (!(filp->f_mode & FMODE_READ) ||
+ !(filp->f_mode & FMODE_WRITE))
+ return -EBADF;
+
if (copy_from_user(&me,
(struct move_extent __user *)arg, sizeof(me)))
return -EFAULT;
+ me.moved_len = 0;
donor_filp = fget(me.donor_fd);
if (!donor_filp)
return -EBADF;
- if (!capable(CAP_DAC_OVERRIDE)) {
- if ((current->real_cred->fsuid != inode->i_uid) ||
- !(inode->i_mode & S_IRUSR) ||
- !(donor_filp->f_dentry->d_inode->i_mode &
- S_IRUSR)) {
- fput(donor_filp);
- return -EACCES;
- }
+ if (!(donor_filp->f_mode & FMODE_WRITE)) {
+ err = -EBADF;
+ goto mext_out;
}
+ err = mnt_want_write(filp->f_path.mnt);
+ if (err)
+ goto mext_out;
+
err = ext4_move_extents(filp, donor_filp, me.orig_start,
me.donor_start, me.len, &me.moved_len);
- fput(donor_filp);
+ mnt_drop_write(filp->f_path.mnt);
+ if (me.moved_len > 0)
+ file_remove_suid(donor_filp);
if (copy_to_user((struct move_extent *)arg, &me, sizeof(me)))
- return -EFAULT;
-
+ err = -EFAULT;
+mext_out:
+ fput(donor_filp);
return err;
}
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index e9c61896d605..d34afad3e137 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -142,7 +142,7 @@
* 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The
* value of s_mb_order2_reqs can be tuned via
* /sys/fs/ext4/<partition>/mb_order2_req. If the request len is equal to
- * stripe size (sbi->s_stripe), we try to search for contigous block in
+ * stripe size (sbi->s_stripe), we try to search for contiguous block in
* stripe size. This should result in better allocation on RAID setups. If
* not, we search in the specific group using bitmap for best extents. The
* tunable min_to_scan and max_to_scan control the behaviour here.
@@ -2096,207 +2096,6 @@ out:
return err;
}
-#ifdef EXT4_MB_HISTORY
-struct ext4_mb_proc_session {
- struct ext4_mb_history *history;
- struct super_block *sb;
- int start;
- int max;
-};
-
-static void *ext4_mb_history_skip_empty(struct ext4_mb_proc_session *s,
- struct ext4_mb_history *hs,
- int first)
-{
- if (hs == s->history + s->max)
- hs = s->history;
- if (!first && hs == s->history + s->start)
- return NULL;
- while (hs->orig.fe_len == 0) {
- hs++;
- if (hs == s->history + s->max)
- hs = s->history;
- if (hs == s->history + s->start)
- return NULL;
- }
- return hs;
-}
-
-static void *ext4_mb_seq_history_start(struct seq_file *seq, loff_t *pos)
-{
- struct ext4_mb_proc_session *s = seq->private;
- struct ext4_mb_history *hs;
- int l = *pos;
-
- if (l == 0)
- return SEQ_START_TOKEN;
- hs = ext4_mb_history_skip_empty(s, s->history + s->start, 1);
- if (!hs)
- return NULL;
- while (--l && (hs = ext4_mb_history_skip_empty(s, ++hs, 0)) != NULL);
- return hs;
-}
-
-static void *ext4_mb_seq_history_next(struct seq_file *seq, void *v,
- loff_t *pos)
-{
- struct ext4_mb_proc_session *s = seq->private;
- struct ext4_mb_history *hs = v;
-
- ++*pos;
- if (v == SEQ_START_TOKEN)
- return ext4_mb_history_skip_empty(s, s->history + s->start, 1);
- else
- return ext4_mb_history_skip_empty(s, ++hs, 0);
-}
-
-static int ext4_mb_seq_history_show(struct seq_file *seq, void *v)
-{
- char buf[25], buf2[25], buf3[25], *fmt;
- struct ext4_mb_history *hs = v;
-
- if (v == SEQ_START_TOKEN) {
- seq_printf(seq, "%-5s %-8s %-23s %-23s %-23s %-5s "
- "%-5s %-2s %-6s %-5s %-5s %-6s\n",
- "pid", "inode", "original", "goal", "result", "found",
- "grps", "cr", "flags", "merge", "tail", "broken");
- return 0;
- }
-
- if (hs->op == EXT4_MB_HISTORY_ALLOC) {
- fmt = "%-5u %-8u %-23s %-23s %-23s %-5u %-5u %-2u "
- "0x%04x %-5s %-5u %-6u\n";
- sprintf(buf2, "%u/%d/%u@%u", hs->result.fe_group,
- hs->result.fe_start, hs->result.fe_len,
- hs->result.fe_logical);
- sprintf(buf, "%u/%d/%u@%u", hs->orig.fe_group,
- hs->orig.fe_start, hs->orig.fe_len,
- hs->orig.fe_logical);
- sprintf(buf3, "%u/%d/%u@%u", hs->goal.fe_group,
- hs->goal.fe_start, hs->goal.fe_len,
- hs->goal.fe_logical);
- seq_printf(seq, fmt, hs->pid, hs->ino, buf, buf3, buf2,
- hs->found, hs->groups, hs->cr, hs->flags,
- hs->merged ? "M" : "", hs->tail,
- hs->buddy ? 1 << hs->buddy : 0);
- } else if (hs->op == EXT4_MB_HISTORY_PREALLOC) {
- fmt = "%-5u %-8u %-23s %-23s %-23s\n";
- sprintf(buf2, "%u/%d/%u@%u", hs->result.fe_group,
- hs->result.fe_start, hs->result.fe_len,
- hs->result.fe_logical);
- sprintf(buf, "%u/%d/%u@%u", hs->orig.fe_group,
- hs->orig.fe_start, hs->orig.fe_len,
- hs->orig.fe_logical);
- seq_printf(seq, fmt, hs->pid, hs->ino, buf, "", buf2);
- } else if (hs->op == EXT4_MB_HISTORY_DISCARD) {
- sprintf(buf2, "%u/%d/%u", hs->result.fe_group,
- hs->result.fe_start, hs->result.fe_len);
- seq_printf(seq, "%-5u %-8u %-23s discard\n",
- hs->pid, hs->ino, buf2);
- } else if (hs->op == EXT4_MB_HISTORY_FREE) {
- sprintf(buf2, "%u/%d/%u", hs->result.fe_group,
- hs->result.fe_start, hs->result.fe_len);
- seq_printf(seq, "%-5u %-8u %-23s free\n",
- hs->pid, hs->ino, buf2);
- }
- return 0;
-}
-
-static void ext4_mb_seq_history_stop(struct seq_file *seq, void *v)
-{
-}
-
-static const struct seq_operations ext4_mb_seq_history_ops = {
- .start = ext4_mb_seq_history_start,
- .next = ext4_mb_seq_history_next,
- .stop = ext4_mb_seq_history_stop,
- .show = ext4_mb_seq_history_show,
-};
-
-static int ext4_mb_seq_history_open(struct inode *inode, struct file *file)
-{
- struct super_block *sb = PDE(inode)->data;
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct ext4_mb_proc_session *s;
- int rc;
- int size;
-
- if (unlikely(sbi->s_mb_history == NULL))
- return -ENOMEM;
- s = kmalloc(sizeof(*s), GFP_KERNEL);
- if (s == NULL)
- return -ENOMEM;
- s->sb = sb;
- size = sizeof(struct ext4_mb_history) * sbi->s_mb_history_max;
- s->history = kmalloc(size, GFP_KERNEL);
- if (s->history == NULL) {
- kfree(s);
- return -ENOMEM;
- }
-
- spin_lock(&sbi->s_mb_history_lock);
- memcpy(s->history, sbi->s_mb_history, size);
- s->max = sbi->s_mb_history_max;
- s->start = sbi->s_mb_history_cur % s->max;
- spin_unlock(&sbi->s_mb_history_lock);
-
- rc = seq_open(file, &ext4_mb_seq_history_ops);
- if (rc == 0) {
- struct seq_file *m = (struct seq_file *)file->private_data;
- m->private = s;
- } else {
- kfree(s->history);
- kfree(s);
- }
- return rc;
-
-}
-
-static int ext4_mb_seq_history_release(struct inode *inode, struct file *file)
-{
- struct seq_file *seq = (struct seq_file *)file->private_data;
- struct ext4_mb_proc_session *s = seq->private;
- kfree(s->history);
- kfree(s);
- return seq_release(inode, file);
-}
-
-static ssize_t ext4_mb_seq_history_write(struct file *file,
- const char __user *buffer,
- size_t count, loff_t *ppos)
-{
- struct seq_file *seq = (struct seq_file *)file->private_data;
- struct ext4_mb_proc_session *s = seq->private;
- struct super_block *sb = s->sb;
- char str[32];
- int value;
-
- if (count >= sizeof(str)) {
- printk(KERN_ERR "EXT4-fs: %s string too long, max %u bytes\n",
- "mb_history", (int)sizeof(str));
- return -EOVERFLOW;
- }
-
- if (copy_from_user(str, buffer, count))
- return -EFAULT;
-
- value = simple_strtol(str, NULL, 0);
- if (value < 0)
- return -ERANGE;
- EXT4_SB(sb)->s_mb_history_filter = value;
-
- return count;
-}
-
-static const struct file_operations ext4_mb_seq_history_fops = {
- .owner = THIS_MODULE,
- .open = ext4_mb_seq_history_open,
- .read = seq_read,
- .write = ext4_mb_seq_history_write,
- .llseek = seq_lseek,
- .release = ext4_mb_seq_history_release,
-};
-
static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
{
struct super_block *sb = seq->private;
@@ -2396,82 +2195,6 @@ static const struct file_operations ext4_mb_seq_groups_fops = {
.release = seq_release,
};
-static void ext4_mb_history_release(struct super_block *sb)
-{
- struct ext4_sb_info *sbi = EXT4_SB(sb);
-
- if (sbi->s_proc != NULL) {
- remove_proc_entry("mb_groups", sbi->s_proc);
- if (sbi->s_mb_history_max)
- remove_proc_entry("mb_history", sbi->s_proc);
- }
- kfree(sbi->s_mb_history);
-}
-
-static void ext4_mb_history_init(struct super_block *sb)
-{
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- int i;
-
- if (sbi->s_proc != NULL) {
- if (sbi->s_mb_history_max)
- proc_create_data("mb_history", S_IRUGO, sbi->s_proc,
- &ext4_mb_seq_history_fops, sb);
- proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
- &ext4_mb_seq_groups_fops, sb);
- }
-
- sbi->s_mb_history_cur = 0;
- spin_lock_init(&sbi->s_mb_history_lock);
- i = sbi->s_mb_history_max * sizeof(struct ext4_mb_history);
- sbi->s_mb_history = i ? kzalloc(i, GFP_KERNEL) : NULL;
- /* if we can't allocate history, then we simple won't use it */
-}
-
-static noinline_for_stack void
-ext4_mb_store_history(struct ext4_allocation_context *ac)
-{
- struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
- struct ext4_mb_history h;
-
- if (sbi->s_mb_history == NULL)
- return;
-
- if (!(ac->ac_op & sbi->s_mb_history_filter))
- return;
-
- h.op = ac->ac_op;
- h.pid = current->pid;
- h.ino = ac->ac_inode ? ac->ac_inode->i_ino : 0;
- h.orig = ac->ac_o_ex;
- h.result = ac->ac_b_ex;
- h.flags = ac->ac_flags;
- h.found = ac->ac_found;
- h.groups = ac->ac_groups_scanned;
- h.cr = ac->ac_criteria;
- h.tail = ac->ac_tail;
- h.buddy = ac->ac_buddy;
- h.merged = 0;
- if (ac->ac_op == EXT4_MB_HISTORY_ALLOC) {
- if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
- ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group)
- h.merged = 1;
- h.goal = ac->ac_g_ex;
- h.result = ac->ac_f_ex;
- }
-
- spin_lock(&sbi->s_mb_history_lock);
- memcpy(sbi->s_mb_history + sbi->s_mb_history_cur, &h, sizeof(h));
- if (++sbi->s_mb_history_cur >= sbi->s_mb_history_max)
- sbi->s_mb_history_cur = 0;
- spin_unlock(&sbi->s_mb_history_lock);
-}
-
-#else
-#define ext4_mb_history_release(sb)
-#define ext4_mb_history_init(sb)
-#endif
-
/* Create and initialize ext4_group_info data for the given group. */
int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
@@ -2690,7 +2413,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
sbi->s_mb_stats = MB_DEFAULT_STATS;
sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
- sbi->s_mb_history_filter = EXT4_MB_HISTORY_DEFAULT;
sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC;
sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
@@ -2708,12 +2430,12 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
spin_lock_init(&lg->lg_prealloc_lock);
}
- ext4_mb_history_init(sb);
+ if (sbi->s_proc)
+ proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
+ &ext4_mb_seq_groups_fops, sb);
if (sbi->s_journal)
sbi->s_journal->j_commit_callback = release_blocks_on_commit;
-
- printk(KERN_INFO "EXT4-fs: mballoc enabled\n");
return 0;
}
@@ -2790,7 +2512,8 @@ int ext4_mb_release(struct super_block *sb)
}
free_percpu(sbi->s_locality_groups);
- ext4_mb_history_release(sb);
+ if (sbi->s_proc)
+ remove_proc_entry("mb_groups", sbi->s_proc);
return 0;
}
@@ -2806,7 +2529,6 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
struct ext4_group_info *db;
int err, count = 0, count2 = 0;
struct ext4_free_data *entry;
- ext4_fsblk_t discard_block;
struct list_head *l, *ltmp;
list_for_each_safe(l, ltmp, &txn->t_private_list) {
@@ -2836,13 +2558,19 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
page_cache_release(e4b.bd_bitmap_page);
}
ext4_unlock_group(sb, entry->group);
- discard_block = (ext4_fsblk_t) entry->group * EXT4_BLOCKS_PER_GROUP(sb)
- + entry->start_blk
- + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
- trace_ext4_discard_blocks(sb, (unsigned long long)discard_block,
- entry->count);
- sb_issue_discard(sb, discard_block, entry->count);
-
+ if (test_opt(sb, DISCARD)) {
+ ext4_fsblk_t discard_block;
+ struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+
+ discard_block = (ext4_fsblk_t)entry->group *
+ EXT4_BLOCKS_PER_GROUP(sb)
+ + entry->start_blk
+ + le32_to_cpu(es->s_first_data_block);
+ trace_ext4_discard_blocks(sb,
+ (unsigned long long)discard_block,
+ entry->count);
+ sb_issue_discard(sb, discard_block, entry->count);
+ }
kmem_cache_free(ext4_free_ext_cachep, entry);
ext4_mb_release_desc(&e4b);
}
@@ -3027,12 +2755,6 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED))
/* release all the reserved blocks if non delalloc */
percpu_counter_sub(&sbi->s_dirtyblocks_counter, reserv_blks);
- else {
- percpu_counter_sub(&sbi->s_dirtyblocks_counter,
- ac->ac_b_ex.fe_len);
- /* convert reserved quota blocks to real quota blocks */
- vfs_dq_claim_block(ac->ac_inode, ac->ac_b_ex.fe_len);
- }
if (sbi->s_log_groups_per_flex) {
ext4_group_t flex_group = ext4_flex_group(sbi,
@@ -3276,7 +2998,28 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
atomic_inc(&sbi->s_bal_breaks);
}
- ext4_mb_store_history(ac);
+ if (ac->ac_op == EXT4_MB_HISTORY_ALLOC)
+ trace_ext4_mballoc_alloc(ac);
+ else
+ trace_ext4_mballoc_prealloc(ac);
+}
+
+/*
+ * Called on failure; free up any blocks from the inode PA for this
+ * context. We don't need this for MB_GROUP_PA because we only change
+ * pa_free in ext4_mb_release_context(), but on failure, we've already
+ * zeroed out ac->ac_b_ex.fe_len, so group_pa->pa_free is not changed.
+ */
+static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac)
+{
+ struct ext4_prealloc_space *pa = ac->ac_pa;
+ int len;
+
+ if (pa && pa->pa_type == MB_INODE_PA) {
+ len = ac->ac_b_ex.fe_len;
+ pa->pa_free += len;
+ }
+
}
/*
@@ -3776,7 +3519,6 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
if (ac) {
ac->ac_sb = sb;
ac->ac_inode = pa->pa_inode;
- ac->ac_op = EXT4_MB_HISTORY_DISCARD;
}
while (bit < end) {
@@ -3796,7 +3538,7 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
ac->ac_b_ex.fe_start = bit;
ac->ac_b_ex.fe_len = next - bit;
ac->ac_b_ex.fe_logical = 0;
- ext4_mb_store_history(ac);
+ trace_ext4_mballoc_discard(ac);
}
trace_ext4_mb_release_inode_pa(ac, pa, grp_blk_start + bit,
@@ -3831,9 +3573,6 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b,
ext4_group_t group;
ext4_grpblk_t bit;
- if (ac)
- ac->ac_op = EXT4_MB_HISTORY_DISCARD;
-
trace_ext4_mb_release_group_pa(ac, pa);
BUG_ON(pa->pa_deleted == 0);
ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
@@ -3848,7 +3587,7 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b,
ac->ac_b_ex.fe_start = bit;
ac->ac_b_ex.fe_len = pa->pa_len;
ac->ac_b_ex.fe_logical = 0;
- ext4_mb_store_history(ac);
+ trace_ext4_mballoc_discard(ac);
}
return 0;
@@ -4189,7 +3928,6 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1)
>> bsbits;
- size = max(size, isize);
if ((size == isize) &&
!ext4_fs_is_busy(sbi) &&
@@ -4199,6 +3937,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
}
/* don't use group allocation for large files */
+ size = max(size, isize);
if (size >= sbi->s_mb_stream_request) {
ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
return;
@@ -4210,7 +3949,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
* per cpu locality group is to reduce the contention between block
* request from multiple CPUs.
*/
- ac->ac_lg = per_cpu_ptr(sbi->s_locality_groups, raw_smp_processor_id());
+ ac->ac_lg = __this_cpu_ptr(sbi->s_locality_groups);
/* we're going to use group allocation */
ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC;
@@ -4568,6 +4307,7 @@ repeat:
ac->ac_status = AC_STATUS_CONTINUE;
goto repeat;
} else if (*errp) {
+ ext4_discard_allocated_blocks(ac);
ac->ac_b_ex.fe_len = 0;
ar->len = 0;
ext4_mb_show_ac(ac);
@@ -4700,18 +4440,24 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
return 0;
}
-/*
- * Main entry point into mballoc to free blocks
+/**
+ * ext4_free_blocks() -- Free given blocks and update quota
+ * @handle: handle for this transaction
+ * @inode: inode
+ * @block: start physical block to free
+ * @count: number of blocks to count
+ * @metadata: Are these metadata blocks
*/
-void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
- ext4_fsblk_t block, unsigned long count,
- int metadata, unsigned long *freed)
+void ext4_free_blocks(handle_t *handle, struct inode *inode,
+ struct buffer_head *bh, ext4_fsblk_t block,
+ unsigned long count, int flags)
{
struct buffer_head *bitmap_bh = NULL;
struct super_block *sb = inode->i_sb;
struct ext4_allocation_context *ac = NULL;
struct ext4_group_desc *gdp;
struct ext4_super_block *es;
+ unsigned long freed = 0;
unsigned int overflow;
ext4_grpblk_t bit;
struct buffer_head *gd_bh;
@@ -4721,13 +4467,16 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
int err = 0;
int ret;
- *freed = 0;
+ if (bh) {
+ if (block)
+ BUG_ON(block != bh->b_blocknr);
+ else
+ block = bh->b_blocknr;
+ }
sbi = EXT4_SB(sb);
es = EXT4_SB(sb)->s_es;
- if (block < le32_to_cpu(es->s_first_data_block) ||
- block + count < block ||
- block + count > ext4_blocks_count(es)) {
+ if (!ext4_data_block_valid(sbi, block, count)) {
ext4_error(sb, __func__,
"Freeing blocks not in datazone - "
"block = %llu, count = %lu", block, count);
@@ -4735,11 +4484,35 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
}
ext4_debug("freeing block %llu\n", block);
- trace_ext4_free_blocks(inode, block, count, metadata);
+ trace_ext4_free_blocks(inode, block, count, flags);
+
+ if (flags & EXT4_FREE_BLOCKS_FORGET) {
+ struct buffer_head *tbh = bh;
+ int i;
+
+ BUG_ON(bh && (count > 1));
+
+ for (i = 0; i < count; i++) {
+ if (!bh)
+ tbh = sb_find_get_block(inode->i_sb,
+ block + i);
+ ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
+ inode, tbh, block + i);
+ }
+ }
+
+ /*
+ * We need to make sure we don't reuse the freed block until
+ * after the transaction is committed, which we can do by
+ * treating the block as metadata, below. We make an
+ * exception if the inode is to be written in writeback mode
+ * since writeback mode has weak data consistency guarantees.
+ */
+ if (!ext4_should_writeback_data(inode))
+ flags |= EXT4_FREE_BLOCKS_METADATA;
ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
if (ac) {
- ac->ac_op = EXT4_MB_HISTORY_FREE;
ac->ac_inode = inode;
ac->ac_sb = sb;
}
@@ -4806,13 +4579,14 @@ do_more:
ac->ac_b_ex.fe_group = block_group;
ac->ac_b_ex.fe_start = bit;
ac->ac_b_ex.fe_len = count;
- ext4_mb_store_history(ac);
+ trace_ext4_mballoc_free(ac);
}
err = ext4_mb_load_buddy(sb, block_group, &e4b);
if (err)
goto error_return;
- if (metadata && ext4_handle_valid(handle)) {
+
+ if ((flags & EXT4_FREE_BLOCKS_METADATA) && ext4_handle_valid(handle)) {
struct ext4_free_data *new_entry;
/*
* blocks being freed are metadata. these blocks shouldn't
@@ -4851,7 +4625,7 @@ do_more:
ext4_mb_release_desc(&e4b);
- *freed += count;
+ freed += count;
/* We dirtied the bitmap block */
BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
@@ -4871,6 +4645,8 @@ do_more:
}
sb->s_dirt = 1;
error_return:
+ if (freed)
+ vfs_dq_free_block(inode, freed);
brelse(bitmap_bh);
ext4_std_error(sb, err);
if (ac)
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 188d3d709b24..436521cae456 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -17,7 +17,6 @@
#include <linux/proc_fs.h>
#include <linux/pagemap.h>
#include <linux/seq_file.h>
-#include <linux/version.h>
#include <linux/blkdev.h>
#include <linux/mutex.h>
#include "ext4_jbd2.h"
@@ -52,18 +51,8 @@ extern u8 mb_enable_debug;
#define mb_debug(n, fmt, a...)
#endif
-/*
- * with EXT4_MB_HISTORY mballoc stores last N allocations in memory
- * and you can monitor it in /proc/fs/ext4/<dev>/mb_history
- */
-#define EXT4_MB_HISTORY
#define EXT4_MB_HISTORY_ALLOC 1 /* allocation */
#define EXT4_MB_HISTORY_PREALLOC 2 /* preallocated blocks used */
-#define EXT4_MB_HISTORY_DISCARD 4 /* preallocation discarded */
-#define EXT4_MB_HISTORY_FREE 8 /* free */
-
-#define EXT4_MB_HISTORY_DEFAULT (EXT4_MB_HISTORY_ALLOC | \
- EXT4_MB_HISTORY_PREALLOC)
/*
* How long mballoc can look for a best extent (in found extents)
@@ -84,7 +73,7 @@ extern u8 mb_enable_debug;
* with 'ext4_mb_stats' allocator will collect stats that will be
* shown at umount. The collecting costs though!
*/
-#define MB_DEFAULT_STATS 1
+#define MB_DEFAULT_STATS 0
/*
* files smaller than MB_DEFAULT_STREAM_THRESHOLD are served
@@ -217,22 +206,6 @@ struct ext4_allocation_context {
#define AC_STATUS_FOUND 2
#define AC_STATUS_BREAK 3
-struct ext4_mb_history {
- struct ext4_free_extent orig; /* orig allocation */
- struct ext4_free_extent goal; /* goal allocation */
- struct ext4_free_extent result; /* result allocation */
- unsigned pid;
- unsigned ino;
- __u16 found; /* how many extents have been found */
- __u16 groups; /* how many groups have been scanned */
- __u16 tail; /* what tail broke some buddy */
- __u16 buddy; /* buddy the tail ^^^ broke */
- __u16 flags;
- __u8 cr:3; /* which phase the result extent was found at */
- __u8 op:4;
- __u8 merged:1;
-};
-
struct ext4_buddy {
struct page *bd_buddy_page;
void *bd_buddy;
@@ -247,13 +220,6 @@ struct ext4_buddy {
#define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap)
#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy)
-#ifndef EXT4_MB_HISTORY
-static inline void ext4_mb_store_history(struct ext4_allocation_context *ac)
-{
- return;
-}
-#endif
-
#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index bf519f239ae6..81415814b00b 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -75,7 +75,7 @@ static int finish_range(handle_t *handle, struct inode *inode,
goto err_out;
}
}
- retval = ext4_ext_insert_extent(handle, inode, path, &newext);
+ retval = ext4_ext_insert_extent(handle, inode, path, &newext, 0);
err_out:
if (path) {
ext4_ext_drop_refs(path);
@@ -238,7 +238,7 @@ static int extend_credit_for_blkdel(handle_t *handle, struct inode *inode)
* So allocate a credit of 3. We may update
* quota (user and group).
*/
- needed = 3 + 2*EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
+ needed = 3 + EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
if (ext4_journal_extend(handle, needed) != 0)
retval = ext4_journal_restart(handle, needed);
@@ -262,13 +262,17 @@ static int free_dind_blocks(handle_t *handle,
for (i = 0; i < max_entries; i++) {
if (tmp_idata[i]) {
extend_credit_for_blkdel(handle, inode);
- ext4_free_blocks(handle, inode,
- le32_to_cpu(tmp_idata[i]), 1, 1);
+ ext4_free_blocks(handle, inode, 0,
+ le32_to_cpu(tmp_idata[i]), 1,
+ EXT4_FREE_BLOCKS_METADATA |
+ EXT4_FREE_BLOCKS_FORGET);
}
}
put_bh(bh);
extend_credit_for_blkdel(handle, inode);
- ext4_free_blocks(handle, inode, le32_to_cpu(i_data), 1, 1);
+ ext4_free_blocks(handle, inode, 0, le32_to_cpu(i_data), 1,
+ EXT4_FREE_BLOCKS_METADATA |
+ EXT4_FREE_BLOCKS_FORGET);
return 0;
}
@@ -297,7 +301,9 @@ static int free_tind_blocks(handle_t *handle,
}
put_bh(bh);
extend_credit_for_blkdel(handle, inode);
- ext4_free_blocks(handle, inode, le32_to_cpu(i_data), 1, 1);
+ ext4_free_blocks(handle, inode, 0, le32_to_cpu(i_data), 1,
+ EXT4_FREE_BLOCKS_METADATA |
+ EXT4_FREE_BLOCKS_FORGET);
return 0;
}
@@ -308,8 +314,10 @@ static int free_ind_block(handle_t *handle, struct inode *inode, __le32 *i_data)
/* ei->i_data[EXT4_IND_BLOCK] */
if (i_data[0]) {
extend_credit_for_blkdel(handle, inode);
- ext4_free_blocks(handle, inode,
- le32_to_cpu(i_data[0]), 1, 1);
+ ext4_free_blocks(handle, inode, 0,
+ le32_to_cpu(i_data[0]), 1,
+ EXT4_FREE_BLOCKS_METADATA |
+ EXT4_FREE_BLOCKS_FORGET);
}
/* ei->i_data[EXT4_DIND_BLOCK] */
@@ -419,7 +427,8 @@ static int free_ext_idx(handle_t *handle, struct inode *inode,
}
put_bh(bh);
extend_credit_for_blkdel(handle, inode);
- ext4_free_blocks(handle, inode, block, 1, 1);
+ ext4_free_blocks(handle, inode, 0, block, 1,
+ EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
return retval;
}
@@ -477,7 +486,7 @@ int ext4_ext_migrate(struct inode *inode)
handle = ext4_journal_start(inode,
EXT4_DATA_TRANS_BLOCKS(inode->i_sb) +
EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
- 2 * EXT4_QUOTA_INIT_BLOCKS(inode->i_sb)
+ EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)
+ 1);
if (IS_ERR(handle)) {
retval = PTR_ERR(handle);
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index c07a2915e40b..82c415be87a4 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -77,12 +77,14 @@ static int
mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
struct ext4_extent **extent)
{
+ struct ext4_extent_header *eh;
int ppos, leaf_ppos = path->p_depth;
ppos = leaf_ppos;
if (EXT_LAST_EXTENT(path[ppos].p_hdr) > path[ppos].p_ext) {
/* leaf block */
*extent = ++path[ppos].p_ext;
+ path[ppos].p_block = ext_pblock(path[ppos].p_ext);
return 0;
}
@@ -119,9 +121,18 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
ext_block_hdr(path[cur_ppos+1].p_bh);
}
+ path[leaf_ppos].p_ext = *extent = NULL;
+
+ eh = path[leaf_ppos].p_hdr;
+ if (le16_to_cpu(eh->eh_entries) == 0)
+ /* empty leaf is found */
+ return -ENODATA;
+
/* leaf block */
path[leaf_ppos].p_ext = *extent =
EXT_FIRST_EXTENT(path[leaf_ppos].p_hdr);
+ path[leaf_ppos].p_block =
+ ext_pblock(path[leaf_ppos].p_ext);
return 0;
}
}
@@ -155,40 +166,15 @@ mext_check_null_inode(struct inode *inode1, struct inode *inode2,
}
/**
- * mext_double_down_read - Acquire two inodes' read semaphore
+ * double_down_write_data_sem - Acquire two inodes' write lock of i_data_sem
*
* @orig_inode: original inode structure
* @donor_inode: donor inode structure
- * Acquire read semaphore of the two inodes (orig and donor) by i_ino order.
+ * Acquire write lock of i_data_sem of the two inodes (orig and donor) by
+ * i_ino order.
*/
static void
-mext_double_down_read(struct inode *orig_inode, struct inode *donor_inode)
-{
- struct inode *first = orig_inode, *second = donor_inode;
-
- /*
- * Use the inode number to provide the stable locking order instead
- * of its address, because the C language doesn't guarantee you can
- * compare pointers that don't come from the same array.
- */
- if (donor_inode->i_ino < orig_inode->i_ino) {
- first = donor_inode;
- second = orig_inode;
- }
-
- down_read(&EXT4_I(first)->i_data_sem);
- down_read(&EXT4_I(second)->i_data_sem);
-}
-
-/**
- * mext_double_down_write - Acquire two inodes' write semaphore
- *
- * @orig_inode: original inode structure
- * @donor_inode: donor inode structure
- * Acquire write semaphore of the two inodes (orig and donor) by i_ino order.
- */
-static void
-mext_double_down_write(struct inode *orig_inode, struct inode *donor_inode)
+double_down_write_data_sem(struct inode *orig_inode, struct inode *donor_inode)
{
struct inode *first = orig_inode, *second = donor_inode;
@@ -203,32 +189,18 @@ mext_double_down_write(struct inode *orig_inode, struct inode *donor_inode)
}
down_write(&EXT4_I(first)->i_data_sem);
- down_write(&EXT4_I(second)->i_data_sem);
+ down_write_nested(&EXT4_I(second)->i_data_sem, SINGLE_DEPTH_NESTING);
}
/**
- * mext_double_up_read - Release two inodes' read semaphore
+ * double_up_write_data_sem - Release two inodes' write lock of i_data_sem
*
* @orig_inode: original inode structure to be released its lock first
* @donor_inode: donor inode structure to be released its lock second
- * Release read semaphore of two inodes (orig and donor).
+ * Release write lock of i_data_sem of two inodes (orig and donor).
*/
static void
-mext_double_up_read(struct inode *orig_inode, struct inode *donor_inode)
-{
- up_read(&EXT4_I(orig_inode)->i_data_sem);
- up_read(&EXT4_I(donor_inode)->i_data_sem);
-}
-
-/**
- * mext_double_up_write - Release two inodes' write semaphore
- *
- * @orig_inode: original inode structure to be released its lock first
- * @donor_inode: donor inode structure to be released its lock second
- * Release write semaphore of two inodes (orig and donor).
- */
-static void
-mext_double_up_write(struct inode *orig_inode, struct inode *donor_inode)
+double_up_write_data_sem(struct inode *orig_inode, struct inode *donor_inode)
{
up_write(&EXT4_I(orig_inode)->i_data_sem);
up_write(&EXT4_I(donor_inode)->i_data_sem);
@@ -322,7 +294,7 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
goto out;
if (ext4_ext_insert_extent(handle, orig_inode,
- orig_path, new_ext))
+ orig_path, new_ext, 0))
goto out;
}
@@ -333,7 +305,7 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
goto out;
if (ext4_ext_insert_extent(handle, orig_inode,
- orig_path, end_ext))
+ orig_path, end_ext, 0))
goto out;
}
out:
@@ -596,7 +568,7 @@ out:
* @tmp_oext: the extent that will belong to the donor inode
* @orig_off: block offset of original inode
* @donor_off: block offset of donor inode
- * @max_count: the maximun length of extents
+ * @max_count: the maximum length of extents
*
* Return 0 on success, or a negative error value on failure.
*/
@@ -661,6 +633,7 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext,
* @donor_inode: donor inode
* @from: block offset of orig_inode
* @count: block count to be replaced
+ * @err: pointer to save return value
*
* Replace original inode extents and donor inode extents page by page.
* We implement this replacement in the following three steps:
@@ -671,33 +644,33 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext,
* 3. Change the block information of donor inode to point at the saved
* original inode blocks in the dummy extents.
*
- * Return 0 on success, or a negative error value on failure.
+ * Return replaced block count.
*/
static int
mext_replace_branches(handle_t *handle, struct inode *orig_inode,
struct inode *donor_inode, ext4_lblk_t from,
- ext4_lblk_t count)
+ ext4_lblk_t count, int *err)
{
struct ext4_ext_path *orig_path = NULL;
struct ext4_ext_path *donor_path = NULL;
struct ext4_extent *oext, *dext;
struct ext4_extent tmp_dext, tmp_oext;
ext4_lblk_t orig_off = from, donor_off = from;
- int err = 0;
int depth;
int replaced_count = 0;
int dext_alen;
- mext_double_down_write(orig_inode, donor_inode);
+ /* Protect extent trees against block allocations via delalloc */
+ double_down_write_data_sem(orig_inode, donor_inode);
/* Get the original extent for the block "orig_off" */
- err = get_ext_path(orig_inode, orig_off, &orig_path);
- if (err)
+ *err = get_ext_path(orig_inode, orig_off, &orig_path);
+ if (*err)
goto out;
/* Get the donor extent for the head */
- err = get_ext_path(donor_inode, donor_off, &donor_path);
- if (err)
+ *err = get_ext_path(donor_inode, donor_off, &donor_path);
+ if (*err)
goto out;
depth = ext_depth(orig_inode);
oext = orig_path[depth].p_ext;
@@ -707,9 +680,9 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
dext = donor_path[depth].p_ext;
tmp_dext = *dext;
- err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
+ *err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
donor_off, count);
- if (err)
+ if (*err)
goto out;
/* Loop for the donor extents */
@@ -718,7 +691,7 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
if (!dext) {
ext4_error(donor_inode->i_sb, __func__,
"The extent for donor must be found");
- err = -EIO;
+ *err = -EIO;
goto out;
} else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) {
ext4_error(donor_inode->i_sb, __func__,
@@ -726,20 +699,20 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
"extent(%u) should be equal",
donor_off,
le32_to_cpu(tmp_dext.ee_block));
- err = -EIO;
+ *err = -EIO;
goto out;
}
/* Set donor extent to orig extent */
- err = mext_leaf_block(handle, orig_inode,
+ *err = mext_leaf_block(handle, orig_inode,
orig_path, &tmp_dext, &orig_off);
- if (err < 0)
+ if (*err)
goto out;
/* Set orig extent to donor extent */
- err = mext_leaf_block(handle, donor_inode,
+ *err = mext_leaf_block(handle, donor_inode,
donor_path, &tmp_oext, &donor_off);
- if (err < 0)
+ if (*err)
goto out;
dext_alen = ext4_ext_get_actual_len(&tmp_dext);
@@ -753,35 +726,25 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
if (orig_path)
ext4_ext_drop_refs(orig_path);
- err = get_ext_path(orig_inode, orig_off, &orig_path);
- if (err)
+ *err = get_ext_path(orig_inode, orig_off, &orig_path);
+ if (*err)
goto out;
depth = ext_depth(orig_inode);
oext = orig_path[depth].p_ext;
- if (le32_to_cpu(oext->ee_block) +
- ext4_ext_get_actual_len(oext) <= orig_off) {
- err = 0;
- goto out;
- }
tmp_oext = *oext;
if (donor_path)
ext4_ext_drop_refs(donor_path);
- err = get_ext_path(donor_inode, donor_off, &donor_path);
- if (err)
+ *err = get_ext_path(donor_inode, donor_off, &donor_path);
+ if (*err)
goto out;
depth = ext_depth(donor_inode);
dext = donor_path[depth].p_ext;
- if (le32_to_cpu(dext->ee_block) +
- ext4_ext_get_actual_len(dext) <= donor_off) {
- err = 0;
- goto out;
- }
tmp_dext = *dext;
- err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
+ *err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
donor_off, count - replaced_count);
- if (err)
+ if (*err)
goto out;
}
@@ -795,8 +758,12 @@ out:
kfree(donor_path);
}
- mext_double_up_write(orig_inode, donor_inode);
- return err;
+ ext4_ext_invalidate_cache(orig_inode);
+ ext4_ext_invalidate_cache(donor_inode);
+
+ double_up_write_data_sem(orig_inode, donor_inode);
+
+ return replaced_count;
}
/**
@@ -808,16 +775,17 @@ out:
* @data_offset_in_page: block index where data swapping starts
* @block_len_in_page: the number of blocks to be swapped
* @uninit: orig extent is uninitialized or not
+ * @err: pointer to save return value
*
* Save the data in original inode blocks and replace original inode extents
* with donor inode extents by calling mext_replace_branches().
- * Finally, write out the saved data in new original inode blocks. Return 0
- * on success, or a negative error value on failure.
+ * Finally, write out the saved data in new original inode blocks. Return
+ * replaced block count.
*/
static int
move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
pgoff_t orig_page_offset, int data_offset_in_page,
- int block_len_in_page, int uninit)
+ int block_len_in_page, int uninit, int *err)
{
struct inode *orig_inode = o_filp->f_dentry->d_inode;
struct address_space *mapping = orig_inode->i_mapping;
@@ -829,9 +797,11 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
long long offs = orig_page_offset << PAGE_CACHE_SHIFT;
unsigned long blocksize = orig_inode->i_sb->s_blocksize;
unsigned int w_flags = 0;
- unsigned int tmp_data_len, data_len;
+ unsigned int tmp_data_size, data_size, replaced_size;
void *fsdata;
- int ret, i, jblocks;
+ int i, jblocks;
+ int err2 = 0;
+ int replaced_count = 0;
int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
/*
@@ -841,8 +811,8 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
jblocks = ext4_writepage_trans_blocks(orig_inode) * 2;
handle = ext4_journal_start(orig_inode, jblocks);
if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- return ret;
+ *err = PTR_ERR(handle);
+ return 0;
}
if (segment_eq(get_fs(), KERNEL_DS))
@@ -858,39 +828,36 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
* Just swap data blocks between orig and donor.
*/
if (uninit) {
- ret = mext_replace_branches(handle, orig_inode,
- donor_inode, orig_blk_offset,
- block_len_in_page);
-
- /* Clear the inode cache not to refer to the old data */
- ext4_ext_invalidate_cache(orig_inode);
- ext4_ext_invalidate_cache(donor_inode);
+ replaced_count = mext_replace_branches(handle, orig_inode,
+ donor_inode, orig_blk_offset,
+ block_len_in_page, err);
goto out2;
}
offs = (long long)orig_blk_offset << orig_inode->i_blkbits;
- /* Calculate data_len */
+ /* Calculate data_size */
if ((orig_blk_offset + block_len_in_page - 1) ==
((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) {
/* Replace the last block */
- tmp_data_len = orig_inode->i_size & (blocksize - 1);
+ tmp_data_size = orig_inode->i_size & (blocksize - 1);
/*
- * If data_len equal zero, it shows data_len is multiples of
+ * If data_size equal zero, it shows data_size is multiples of
* blocksize. So we set appropriate value.
*/
- if (tmp_data_len == 0)
- tmp_data_len = blocksize;
+ if (tmp_data_size == 0)
+ tmp_data_size = blocksize;
- data_len = tmp_data_len +
+ data_size = tmp_data_size +
((block_len_in_page - 1) << orig_inode->i_blkbits);
- } else {
- data_len = block_len_in_page << orig_inode->i_blkbits;
- }
+ } else
+ data_size = block_len_in_page << orig_inode->i_blkbits;
- ret = a_ops->write_begin(o_filp, mapping, offs, data_len, w_flags,
+ replaced_size = data_size;
+
+ *err = a_ops->write_begin(o_filp, mapping, offs, data_size, w_flags,
&page, &fsdata);
- if (unlikely(ret < 0))
+ if (unlikely(*err < 0))
goto out;
if (!PageUptodate(page)) {
@@ -911,14 +878,17 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
/* Release old bh and drop refs */
try_to_release_page(page, 0);
- ret = mext_replace_branches(handle, orig_inode, donor_inode,
- orig_blk_offset, block_len_in_page);
- if (ret < 0)
- goto out;
-
- /* Clear the inode cache not to refer to the old data */
- ext4_ext_invalidate_cache(orig_inode);
- ext4_ext_invalidate_cache(donor_inode);
+ replaced_count = mext_replace_branches(handle, orig_inode, donor_inode,
+ orig_blk_offset, block_len_in_page,
+ &err2);
+ if (err2) {
+ if (replaced_count) {
+ block_len_in_page = replaced_count;
+ replaced_size =
+ block_len_in_page << orig_inode->i_blkbits;
+ } else
+ goto out;
+ }
if (!page_has_buffers(page))
create_empty_buffers(page, 1 << orig_inode->i_blkbits, 0);
@@ -928,16 +898,16 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
bh = bh->b_this_page;
for (i = 0; i < block_len_in_page; i++) {
- ret = ext4_get_block(orig_inode,
+ *err = ext4_get_block(orig_inode,
(sector_t)(orig_blk_offset + i), bh, 0);
- if (ret < 0)
+ if (*err < 0)
goto out;
if (bh->b_this_page != NULL)
bh = bh->b_this_page;
}
- ret = a_ops->write_end(o_filp, mapping, offs, data_len, data_len,
+ *err = a_ops->write_end(o_filp, mapping, offs, data_size, replaced_size,
page, fsdata);
page = NULL;
@@ -951,7 +921,10 @@ out:
out2:
ext4_journal_stop(handle);
- return ret < 0 ? ret : 0;
+ if (err2)
+ *err = err2;
+
+ return replaced_count;
}
/**
@@ -962,7 +935,6 @@ out2:
* @orig_start: logical start offset in block for orig
* @donor_start: logical start offset in block for donor
* @len: the number of blocks to be moved
- * @moved_len: moved block length
*
* Check the arguments of ext4_move_extents() whether the files can be
* exchanged with each other.
@@ -970,8 +942,8 @@ out2:
*/
static int
mext_check_arguments(struct inode *orig_inode,
- struct inode *donor_inode, __u64 orig_start,
- __u64 donor_start, __u64 *len, __u64 moved_len)
+ struct inode *donor_inode, __u64 orig_start,
+ __u64 donor_start, __u64 *len)
{
ext4_lblk_t orig_blocks, donor_blocks;
unsigned int blkbits = orig_inode->i_blkbits;
@@ -985,6 +957,13 @@ mext_check_arguments(struct inode *orig_inode,
return -EINVAL;
}
+ if (donor_inode->i_mode & (S_ISUID|S_ISGID)) {
+ ext4_debug("ext4 move extent: suid or sgid is set"
+ " to donor file [ino:orig %lu, donor %lu]\n",
+ orig_inode->i_ino, donor_inode->i_ino);
+ return -EINVAL;
+ }
+
/* Ext4 move extent does not support swapfile */
if (IS_SWAPFILE(orig_inode) || IS_SWAPFILE(donor_inode)) {
ext4_debug("ext4 move extent: The argument files should "
@@ -1001,14 +980,6 @@ mext_check_arguments(struct inode *orig_inode,
return -EINVAL;
}
- /* orig and donor should be different file */
- if (orig_inode->i_ino == donor_inode->i_ino) {
- ext4_debug("ext4 move extent: The argument files should not "
- "be same file [ino:orig %lu, donor %lu]\n",
- orig_inode->i_ino, donor_inode->i_ino);
- return -EINVAL;
- }
-
/* Ext4 move extent supports only extent based file */
if (!(EXT4_I(orig_inode)->i_flags & EXT4_EXTENTS_FL)) {
ext4_debug("ext4 move extent: orig file is not extents "
@@ -1033,13 +1004,6 @@ mext_check_arguments(struct inode *orig_inode,
return -EINVAL;
}
- if (moved_len) {
- ext4_debug("ext4 move extent: moved_len should be 0 "
- "[ino:orig %lu, donor %lu]\n", orig_inode->i_ino,
- donor_inode->i_ino);
- return -EINVAL;
- }
-
if ((orig_start > EXT_MAX_BLOCK) ||
(donor_start > EXT_MAX_BLOCK) ||
(*len > EXT_MAX_BLOCK) ||
@@ -1096,7 +1060,7 @@ mext_check_arguments(struct inode *orig_inode,
}
if (!*len) {
- ext4_debug("ext4 move extent: len shoudld not be 0 "
+ ext4_debug("ext4 move extent: len should not be 0 "
"[ino:orig %lu, donor %lu]\n", orig_inode->i_ino,
donor_inode->i_ino);
return -EINVAL;
@@ -1232,16 +1196,24 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
int block_len_in_page;
int uninit;
- /* protect orig and donor against a truncate */
+ /* orig and donor should be different file */
+ if (orig_inode->i_ino == donor_inode->i_ino) {
+ ext4_debug("ext4 move extent: The argument files should not "
+ "be same file [ino:orig %lu, donor %lu]\n",
+ orig_inode->i_ino, donor_inode->i_ino);
+ return -EINVAL;
+ }
+
+ /* Protect orig and donor inodes against a truncate */
ret1 = mext_inode_double_lock(orig_inode, donor_inode);
if (ret1 < 0)
return ret1;
- mext_double_down_read(orig_inode, donor_inode);
+ /* Protect extent tree against block allocations via delalloc */
+ double_down_write_data_sem(orig_inode, donor_inode);
/* Check the filesystem environment whether move_extent can be done */
ret1 = mext_check_arguments(orig_inode, donor_inode, orig_start,
- donor_start, &len, *moved_len);
- mext_double_up_read(orig_inode, donor_inode);
+ donor_start, &len);
if (ret1)
goto out;
@@ -1355,36 +1327,39 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
seq_start = le32_to_cpu(ext_cur->ee_block);
rest_blocks = seq_blocks;
- /* Discard preallocations of two inodes */
- down_write(&EXT4_I(orig_inode)->i_data_sem);
- ext4_discard_preallocations(orig_inode);
- up_write(&EXT4_I(orig_inode)->i_data_sem);
-
- down_write(&EXT4_I(donor_inode)->i_data_sem);
- ext4_discard_preallocations(donor_inode);
- up_write(&EXT4_I(donor_inode)->i_data_sem);
+ /*
+ * Up semaphore to avoid following problems:
+ * a. transaction deadlock among ext4_journal_start,
+ * ->write_begin via pagefault, and jbd2_journal_commit
+ * b. racing with ->readpage, ->write_begin, and ext4_get_block
+ * in move_extent_per_page
+ */
+ double_up_write_data_sem(orig_inode, donor_inode);
while (orig_page_offset <= seq_end_page) {
/* Swap original branches with new branches */
- ret1 = move_extent_per_page(o_filp, donor_inode,
+ block_len_in_page = move_extent_per_page(
+ o_filp, donor_inode,
orig_page_offset,
data_offset_in_page,
- block_len_in_page, uninit);
- if (ret1 < 0)
- goto out;
- orig_page_offset++;
+ block_len_in_page, uninit,
+ &ret1);
+
/* Count how many blocks we have exchanged */
*moved_len += block_len_in_page;
+ if (ret1 < 0)
+ break;
if (*moved_len > len) {
ext4_error(orig_inode->i_sb, __func__,
"We replaced blocks too much! "
"sum of replaced: %llu requested: %llu",
*moved_len, len);
ret1 = -EIO;
- goto out;
+ break;
}
+ orig_page_offset++;
data_offset_in_page = 0;
rest_blocks -= block_len_in_page;
if (rest_blocks > blocks_per_page)
@@ -1393,6 +1368,10 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
block_len_in_page = rest_blocks;
}
+ double_down_write_data_sem(orig_inode, donor_inode);
+ if (ret1 < 0)
+ break;
+
/* Decrease buffer counter */
if (holecheck_path)
ext4_ext_drop_refs(holecheck_path);
@@ -1414,6 +1393,11 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
}
out:
+ if (*moved_len) {
+ ext4_discard_preallocations(orig_inode);
+ ext4_discard_preallocations(donor_inode);
+ }
+
if (orig_path) {
ext4_ext_drop_refs(orig_path);
kfree(orig_path);
@@ -1422,7 +1406,7 @@ out:
ext4_ext_drop_refs(holecheck_path);
kfree(holecheck_path);
}
-
+ double_up_write_data_sem(orig_inode, donor_inode);
ret2 = mext_inode_double_unlock(orig_inode, donor_inode);
if (ret1)
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 42f81d285cd5..17a17e10dd60 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1292,9 +1292,6 @@ errout:
* add_dirent_to_buf will attempt search the directory block for
* space. It will return -ENOSPC if no space is available, and -EIO
* and -EEXIST if directory entry already exists.
- *
- * NOTE! bh is NOT released in the case where ENOSPC is returned. In
- * all other cases bh is released.
*/
static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
struct inode *inode, struct ext4_dir_entry_2 *de,
@@ -1315,14 +1312,10 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
top = bh->b_data + blocksize - reclen;
while ((char *) de <= top) {
if (!ext4_check_dir_entry("ext4_add_entry", dir, de,
- bh, offset)) {
- brelse(bh);
+ bh, offset))
return -EIO;
- }
- if (ext4_match(namelen, name, de)) {
- brelse(bh);
+ if (ext4_match(namelen, name, de))
return -EEXIST;
- }
nlen = EXT4_DIR_REC_LEN(de->name_len);
rlen = ext4_rec_len_from_disk(de->rec_len, blocksize);
if ((de->inode? rlen - nlen: rlen) >= reclen)
@@ -1337,7 +1330,6 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
err = ext4_journal_get_write_access(handle, bh);
if (err) {
ext4_std_error(dir->i_sb, err);
- brelse(bh);
return err;
}
@@ -1377,7 +1369,6 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
err = ext4_handle_dirty_metadata(handle, dir, bh);
if (err)
ext4_std_error(dir->i_sb, err);
- brelse(bh);
return 0;
}
@@ -1471,7 +1462,9 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
if (!(de))
return retval;
- return add_dirent_to_buf(handle, dentry, inode, de, bh);
+ retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
+ brelse(bh);
+ return retval;
}
/*
@@ -1514,16 +1507,14 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
if(!bh)
return retval;
retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
- if (retval != -ENOSPC)
+ if (retval != -ENOSPC) {
+ brelse(bh);
return retval;
+ }
if (blocks == 1 && !dx_fallback &&
- EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) {
- retval = make_indexed_dir(handle, dentry, inode, bh);
- if (retval == -ENOSPC)
- brelse(bh);
- return retval;
- }
+ EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX))
+ return make_indexed_dir(handle, dentry, inode, bh);
brelse(bh);
}
bh = ext4_append(handle, dir, &block, &retval);
@@ -1533,8 +1524,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
de->inode = 0;
de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize);
retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
- if (retval == -ENOSPC)
- brelse(bh);
+ brelse(bh);
return retval;
}
@@ -1568,10 +1558,8 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
goto journal_error;
err = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
- if (err != -ENOSPC) {
- bh = NULL;
+ if (err != -ENOSPC)
goto cleanup;
- }
/* Block full, should compress but for now just split */
dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n",
@@ -1664,8 +1652,6 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
if (!de)
goto cleanup;
err = add_dirent_to_buf(handle, dentry, inode, de, bh);
- if (err != -ENOSPC)
- bh = NULL;
goto cleanup;
journal_error:
@@ -1783,7 +1769,7 @@ static int ext4_create(struct inode *dir, struct dentry *dentry, int mode,
retry:
handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
- 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb));
+ EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
if (IS_ERR(handle))
return PTR_ERR(handle);
@@ -1817,7 +1803,7 @@ static int ext4_mknod(struct inode *dir, struct dentry *dentry,
retry:
handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
- 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb));
+ EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
if (IS_ERR(handle))
return PTR_ERR(handle);
@@ -1854,7 +1840,7 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode)
retry:
handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
- 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb));
+ EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
if (IS_ERR(handle))
return PTR_ERR(handle);
@@ -2076,7 +2062,8 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
struct ext4_iloc iloc;
int err = 0;
- if (!ext4_handle_valid(handle))
+ /* ext4_handle_valid() assumes a valid handle_t pointer */
+ if (handle && !ext4_handle_valid(handle))
return 0;
mutex_lock(&EXT4_SB(inode->i_sb)->s_orphan_lock);
@@ -2266,7 +2253,7 @@ static int ext4_symlink(struct inode *dir,
retry:
handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
EXT4_INDEX_EXTRA_TRANS_BLOCKS + 5 +
- 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb));
+ EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
if (IS_ERR(handle))
return PTR_ERR(handle);
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 3cfc343c41b5..3b2c5541d8a6 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -247,7 +247,7 @@ static int setup_new_group_blocks(struct super_block *sb,
goto exit_bh;
if (IS_ERR(gdb = bclean(handle, sb, block))) {
- err = PTR_ERR(bh);
+ err = PTR_ERR(gdb);
goto exit_bh;
}
ext4_handle_dirty_metadata(handle, NULL, gdb);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index df539ba27779..735c20d5fd56 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -50,13 +50,6 @@
#define CREATE_TRACE_POINTS
#include <trace/events/ext4.h>
-static int default_mb_history_length = 1000;
-
-module_param_named(default_mb_history_length, default_mb_history_length,
- int, 0644);
-MODULE_PARM_DESC(default_mb_history_length,
- "Default number of entries saved for mb_history");
-
struct proc_dir_entry *ext4_proc_root;
static struct kset *ext4_kset;
@@ -189,6 +182,36 @@ void ext4_itable_unused_set(struct super_block *sb,
bg->bg_itable_unused_hi = cpu_to_le16(count >> 16);
}
+
+/* Just increment the non-pointer handle value */
+static handle_t *ext4_get_nojournal(void)
+{
+ handle_t *handle = current->journal_info;
+ unsigned long ref_cnt = (unsigned long)handle;
+
+ BUG_ON(ref_cnt >= EXT4_NOJOURNAL_MAX_REF_COUNT);
+
+ ref_cnt++;
+ handle = (handle_t *)ref_cnt;
+
+ current->journal_info = handle;
+ return handle;
+}
+
+
+/* Decrement the non-pointer handle value */
+static void ext4_put_nojournal(handle_t *handle)
+{
+ unsigned long ref_cnt = (unsigned long)handle;
+
+ BUG_ON(ref_cnt == 0);
+
+ ref_cnt--;
+ handle = (handle_t *)ref_cnt;
+
+ current->journal_info = handle;
+}
+
/*
* Wrappers for jbd2_journal_start/end.
*
@@ -215,11 +238,7 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
}
return jbd2_journal_start(journal, nblocks);
}
- /*
- * We're not journaling, return the appropriate indication.
- */
- current->journal_info = EXT4_NOJOURNAL_HANDLE;
- return current->journal_info;
+ return ext4_get_nojournal();
}
/*
@@ -235,11 +254,7 @@ int __ext4_journal_stop(const char *where, handle_t *handle)
int rc;
if (!ext4_handle_valid(handle)) {
- /*
- * Do this here since we don't call jbd2_journal_stop() in
- * no-journal mode.
- */
- current->journal_info = NULL;
+ ext4_put_nojournal(handle);
return 0;
}
sb = handle->h_transaction->t_journal->j_private;
@@ -580,15 +595,14 @@ static void ext4_put_super(struct super_block *sb)
struct ext4_super_block *es = sbi->s_es;
int i, err;
+ flush_workqueue(sbi->dio_unwritten_wq);
+ destroy_workqueue(sbi->dio_unwritten_wq);
+
lock_super(sb);
lock_kernel();
if (sb->s_dirt)
ext4_commit_super(sb, 1);
- ext4_release_system_zone(sb);
- ext4_mb_release(sb);
- ext4_ext_release(sb);
- ext4_xattr_put_super(sb);
if (sbi->s_journal) {
err = jbd2_journal_destroy(sbi->s_journal);
sbi->s_journal = NULL;
@@ -596,6 +610,12 @@ static void ext4_put_super(struct super_block *sb)
ext4_abort(sb, __func__,
"Couldn't clean up the journal");
}
+
+ ext4_release_system_zone(sb);
+ ext4_mb_release(sb);
+ ext4_ext_release(sb);
+ ext4_xattr_put_super(sb);
+
if (!(sb->s_flags & MS_RDONLY)) {
EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
es->s_state = cpu_to_le16(sbi->s_mount_state);
@@ -682,8 +702,16 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
ei->i_reserved_data_blocks = 0;
ei->i_reserved_meta_blocks = 0;
ei->i_allocated_meta_blocks = 0;
+ ei->i_da_metadata_calc_len = 0;
ei->i_delalloc_reserved_flag = 0;
spin_lock_init(&(ei->i_block_reservation_lock));
+#ifdef CONFIG_QUOTA
+ ei->i_reserved_quota = 0;
+#endif
+ INIT_LIST_HEAD(&ei->i_aio_dio_complete_list);
+ ei->cur_aio_dio = NULL;
+ ei->i_sync_tid = 0;
+ ei->i_datasync_tid = 0;
return &ei->vfs_inode;
}
@@ -745,9 +773,22 @@ static inline void ext4_show_quota_options(struct seq_file *seq,
#if defined(CONFIG_QUOTA)
struct ext4_sb_info *sbi = EXT4_SB(sb);
- if (sbi->s_jquota_fmt)
- seq_printf(seq, ",jqfmt=%s",
- (sbi->s_jquota_fmt == QFMT_VFS_OLD) ? "vfsold" : "vfsv0");
+ if (sbi->s_jquota_fmt) {
+ char *fmtname = "";
+
+ switch (sbi->s_jquota_fmt) {
+ case QFMT_VFS_OLD:
+ fmtname = "vfsold";
+ break;
+ case QFMT_VFS_V0:
+ fmtname = "vfsv0";
+ break;
+ case QFMT_VFS_V1:
+ fmtname = "vfsv1";
+ break;
+ }
+ seq_printf(seq, ",jqfmt=%s", fmtname);
+ }
if (sbi->s_qf_names[USRQUOTA])
seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]);
@@ -879,6 +920,12 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
if (test_opt(sb, NO_AUTO_DA_ALLOC))
seq_puts(seq, ",noauto_da_alloc");
+ if (test_opt(sb, DISCARD))
+ seq_puts(seq, ",discard");
+
+ if (test_opt(sb, NOLOAD))
+ seq_puts(seq, ",norecovery");
+
ext4_show_quota_options(seq, sb);
return 0;
@@ -971,7 +1018,9 @@ static const struct dquot_operations ext4_quota_operations = {
.reserve_space = dquot_reserve_space,
.claim_space = dquot_claim_space,
.release_rsv = dquot_release_reserved_space,
+#ifdef CONFIG_QUOTA
.get_reserved_space = ext4_get_reserved_space,
+#endif
.alloc_inode = dquot_alloc_inode,
.free_space = dquot_free_space,
.free_inode = dquot_free_inode,
@@ -1052,14 +1101,15 @@ enum {
Opt_journal_update, Opt_journal_dev,
Opt_journal_checksum, Opt_journal_async_commit,
Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
- Opt_data_err_abort, Opt_data_err_ignore, Opt_mb_history_length,
+ Opt_data_err_abort, Opt_data_err_ignore,
Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
- Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
- Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, Opt_resize,
- Opt_usrquota, Opt_grpquota, Opt_i_version,
+ Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota,
+ Opt_noquota, Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err,
+ Opt_resize, Opt_usrquota, Opt_grpquota, Opt_i_version,
Opt_stripe, Opt_delalloc, Opt_nodelalloc,
Opt_block_validity, Opt_noblock_validity,
- Opt_inode_readahead_blks, Opt_journal_ioprio
+ Opt_inode_readahead_blks, Opt_journal_ioprio,
+ Opt_discard, Opt_nodiscard,
};
static const match_table_t tokens = {
@@ -1084,6 +1134,7 @@ static const match_table_t tokens = {
{Opt_acl, "acl"},
{Opt_noacl, "noacl"},
{Opt_noload, "noload"},
+ {Opt_noload, "norecovery"},
{Opt_nobh, "nobh"},
{Opt_bh, "bh"},
{Opt_commit, "commit=%u"},
@@ -1099,13 +1150,13 @@ static const match_table_t tokens = {
{Opt_data_writeback, "data=writeback"},
{Opt_data_err_abort, "data_err=abort"},
{Opt_data_err_ignore, "data_err=ignore"},
- {Opt_mb_history_length, "mb_history_length=%u"},
{Opt_offusrjquota, "usrjquota="},
{Opt_usrjquota, "usrjquota=%s"},
{Opt_offgrpjquota, "grpjquota="},
{Opt_grpjquota, "grpjquota=%s"},
{Opt_jqfmt_vfsold, "jqfmt=vfsold"},
{Opt_jqfmt_vfsv0, "jqfmt=vfsv0"},
+ {Opt_jqfmt_vfsv1, "jqfmt=vfsv1"},
{Opt_grpquota, "grpquota"},
{Opt_noquota, "noquota"},
{Opt_quota, "quota"},
@@ -1125,6 +1176,8 @@ static const match_table_t tokens = {
{Opt_auto_da_alloc, "auto_da_alloc=%u"},
{Opt_auto_da_alloc, "auto_da_alloc"},
{Opt_noauto_da_alloc, "noauto_da_alloc"},
+ {Opt_discard, "discard"},
+ {Opt_nodiscard, "nodiscard"},
{Opt_err, NULL},
};
@@ -1281,9 +1334,11 @@ static int parse_options(char *options, struct super_block *sb,
*journal_devnum = option;
break;
case Opt_journal_checksum:
- break; /* Kept for backwards compatibility */
+ set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM);
+ break;
case Opt_journal_async_commit:
set_opt(sbi->s_mount_opt, JOURNAL_ASYNC_COMMIT);
+ set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM);
break;
case Opt_noload:
set_opt(sbi->s_mount_opt, NOLOAD);
@@ -1340,13 +1395,6 @@ static int parse_options(char *options, struct super_block *sb,
case Opt_data_err_ignore:
clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT);
break;
- case Opt_mb_history_length:
- if (match_int(&args[0], &option))
- return 0;
- if (option < 0)
- return 0;
- sbi->s_mb_history_max = option;
- break;
#ifdef CONFIG_QUOTA
case Opt_usrjquota:
qtype = USRQUOTA;
@@ -1411,6 +1459,9 @@ clear_qf_name:
goto set_qf_format;
case Opt_jqfmt_vfsv0:
qfmt = QFMT_VFS_V0;
+ goto set_qf_format;
+ case Opt_jqfmt_vfsv1:
+ qfmt = QFMT_VFS_V1;
set_qf_format:
if (sb_any_quota_loaded(sb) &&
sbi->s_jquota_fmt != qfmt) {
@@ -1453,6 +1504,7 @@ set_qf_format:
case Opt_offgrpjquota:
case Opt_jqfmt_vfsold:
case Opt_jqfmt_vfsv0:
+ case Opt_jqfmt_vfsv1:
ext4_msg(sb, KERN_ERR,
"journaled quota options not supported");
break;
@@ -1551,6 +1603,12 @@ set_qf_format:
else
set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC);
break;
+ case Opt_discard:
+ set_opt(sbi->s_mount_opt, DISCARD);
+ break;
+ case Opt_nodiscard:
+ clear_opt(sbi->s_mount_opt, DISCARD);
+ break;
default:
ext4_msg(sb, KERN_ERR,
"Unrecognized mount option \"%s\" "
@@ -1646,13 +1704,6 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
EXT4_INODES_PER_GROUP(sb),
sbi->s_mount_opt);
- if (EXT4_SB(sb)->s_journal) {
- ext4_msg(sb, KERN_INFO, "%s journal on %s",
- EXT4_SB(sb)->s_journal->j_inode ? "internal" :
- "external", EXT4_SB(sb)->s_journal->j_devname);
- } else {
- ext4_msg(sb, KERN_INFO, "no journal");
- }
return res;
}
@@ -1666,14 +1717,14 @@ static int ext4_fill_flex_info(struct super_block *sb)
size_t size;
int i;
- if (!sbi->s_es->s_log_groups_per_flex) {
+ sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
+ groups_per_flex = 1 << sbi->s_log_groups_per_flex;
+
+ if (groups_per_flex < 2) {
sbi->s_log_groups_per_flex = 0;
return 1;
}
- sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
- groups_per_flex = 1 << sbi->s_log_groups_per_flex;
-
/* We allocate both existing and potentially added groups */
flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) +
((le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) + 1) <<
@@ -2092,11 +2143,8 @@ static int parse_strtoul(const char *buf,
{
char *endp;
- while (*buf && isspace(*buf))
- buf++;
- *value = simple_strtoul(buf, &endp, 0);
- while (*endp && isspace(*endp))
- endp++;
+ *value = simple_strtoul(skip_spaces(buf), &endp, 0);
+ endp = skip_spaces(endp);
if (*endp || *value > max)
return -EINVAL;
@@ -2127,9 +2175,9 @@ static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a,
struct super_block *sb = sbi->s_buddy_cache->i_sb;
return snprintf(buf, PAGE_SIZE, "%llu\n",
- sbi->s_kbytes_written +
+ (unsigned long long)(sbi->s_kbytes_written +
((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
- EXT4_SB(sb)->s_sectors_written_start) >> 1));
+ EXT4_SB(sb)->s_sectors_written_start) >> 1)));
}
static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
@@ -2197,6 +2245,7 @@ EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
+EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump);
static struct attribute *ext4_attrs[] = {
ATTR_LIST(delayed_allocation_blocks),
@@ -2210,6 +2259,7 @@ static struct attribute *ext4_attrs[] = {
ATTR_LIST(mb_order2_req),
ATTR_LIST(mb_stream_req),
ATTR_LIST(mb_group_prealloc),
+ ATTR_LIST(max_writeback_mb_bump),
NULL,
};
@@ -2413,7 +2463,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ;
sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
- sbi->s_mb_history_max = default_mb_history_length;
set_opt(sbi->s_mount_opt, BARRIER);
@@ -2679,6 +2728,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
}
sbi->s_stripe = ext4_get_stripe_size(sbi);
+ sbi->s_max_writeback_mb_bump = 128;
/*
* set up enough so that it can read an inode
@@ -2712,26 +2762,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) {
if (ext4_load_journal(sb, es, journal_devnum))
goto failed_mount3;
- if (!(sb->s_flags & MS_RDONLY) &&
- EXT4_SB(sb)->s_journal->j_failed_commit) {
- ext4_msg(sb, KERN_CRIT, "error: "
- "ext4_fill_super: Journal transaction "
- "%u is corrupt",
- EXT4_SB(sb)->s_journal->j_failed_commit);
- if (test_opt(sb, ERRORS_RO)) {
- ext4_msg(sb, KERN_CRIT,
- "Mounting filesystem read-only");
- sb->s_flags |= MS_RDONLY;
- EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
- es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
- }
- if (test_opt(sb, ERRORS_PANIC)) {
- EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
- es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
- ext4_commit_super(sb, 1);
- goto failed_mount4;
- }
- }
} else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) &&
EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) {
ext4_msg(sb, KERN_ERR, "required journal recovery "
@@ -2752,14 +2782,20 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
goto failed_mount4;
}
- jbd2_journal_set_features(sbi->s_journal,
- JBD2_FEATURE_COMPAT_CHECKSUM, 0, 0);
- if (test_opt(sb, JOURNAL_ASYNC_COMMIT))
- jbd2_journal_set_features(sbi->s_journal, 0, 0,
+ if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
+ jbd2_journal_set_features(sbi->s_journal,
+ JBD2_FEATURE_COMPAT_CHECKSUM, 0,
JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
- else
+ } else if (test_opt(sb, JOURNAL_CHECKSUM)) {
+ jbd2_journal_set_features(sbi->s_journal,
+ JBD2_FEATURE_COMPAT_CHECKSUM, 0, 0);
jbd2_journal_clear_features(sbi->s_journal, 0, 0,
JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
+ } else {
+ jbd2_journal_clear_features(sbi->s_journal,
+ JBD2_FEATURE_COMPAT_CHECKSUM, 0,
+ JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
+ }
/* We have now updated the journal if required, so we can
* validate the data journaling mode. */
@@ -2798,6 +2834,12 @@ no_journal:
clear_opt(sbi->s_mount_opt, NOBH);
}
}
+ EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten");
+ if (!EXT4_SB(sb)->dio_unwritten_wq) {
+ printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n");
+ goto failed_mount_wq;
+ }
+
/*
* The jbd2_journal_load will have done any necessary log recovery,
* so we can safely mount the rest of the filesystem now.
@@ -2849,12 +2891,12 @@ no_journal:
"available");
}
- if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
+ if (test_opt(sb, DELALLOC) &&
+ (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)) {
ext4_msg(sb, KERN_WARNING, "Ignoring delalloc option - "
"requested data journaling mode");
clear_opt(sbi->s_mount_opt, DELALLOC);
- } else if (test_opt(sb, DELALLOC))
- ext4_msg(sb, KERN_INFO, "delayed allocation enabled");
+ }
err = ext4_setup_system_zone(sb);
if (err) {
@@ -2910,6 +2952,8 @@ cantfind_ext4:
failed_mount4:
ext4_msg(sb, KERN_ERR, "mount failed");
+ destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq);
+failed_mount_wq:
ext4_release_system_zone(sb);
if (sbi->s_journal) {
jbd2_journal_destroy(sbi->s_journal);
@@ -3164,9 +3208,7 @@ static int ext4_load_journal(struct super_block *sb,
return -EINVAL;
}
- if (journal->j_flags & JBD2_BARRIER)
- ext4_msg(sb, KERN_INFO, "barriers enabled");
- else
+ if (!(journal->j_flags & JBD2_BARRIER))
ext4_msg(sb, KERN_INFO, "barriers disabled");
if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) {
@@ -3361,11 +3403,13 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
{
int ret = 0;
tid_t target;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
trace_ext4_sync_fs(sb, wait);
- if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) {
+ flush_workqueue(sbi->dio_unwritten_wq);
+ if (jbd2_journal_start_commit(sbi->s_journal, &target)) {
if (wait)
- jbd2_log_wait_commit(EXT4_SB(sb)->s_journal, target);
+ jbd2_log_wait_commit(sbi->s_journal, target);
}
return ret;
}
@@ -3645,13 +3689,11 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last;
buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) -
percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter);
- ext4_free_blocks_count_set(es, buf->f_bfree);
buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es);
if (buf->f_bfree < ext4_r_blocks_count(es))
buf->f_bavail = 0;
buf->f_files = le32_to_cpu(es->s_inodes_count);
buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter);
- es->s_free_inodes_count = cpu_to_le32(buf->f_ffree);
buf->f_namelen = EXT4_NAME_LEN;
fsid = le64_to_cpup((void *)es->s_uuid) ^
le64_to_cpup((void *)es->s_uuid + sizeof(u64));
@@ -3943,35 +3985,68 @@ static int ext4_get_sb(struct file_system_type *fs_type, int flags,
return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super,mnt);
}
-static struct file_system_type ext4_fs_type = {
+#if !defined(CONTIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
+static struct file_system_type ext2_fs_type = {
.owner = THIS_MODULE,
- .name = "ext4",
+ .name = "ext2",
.get_sb = ext4_get_sb,
.kill_sb = kill_block_super,
.fs_flags = FS_REQUIRES_DEV,
};
-#ifdef CONFIG_EXT4DEV_COMPAT
-static int ext4dev_get_sb(struct file_system_type *fs_type, int flags,
- const char *dev_name, void *data,struct vfsmount *mnt)
+static inline void register_as_ext2(void)
{
- printk(KERN_WARNING "EXT4-fs (%s): Update your userspace programs "
- "to mount using ext4\n", dev_name);
- printk(KERN_WARNING "EXT4-fs (%s): ext4dev backwards compatibility "
- "will go away by 2.6.31\n", dev_name);
- return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super,mnt);
+ int err = register_filesystem(&ext2_fs_type);
+ if (err)
+ printk(KERN_WARNING
+ "EXT4-fs: Unable to register as ext2 (%d)\n", err);
+}
+
+static inline void unregister_as_ext2(void)
+{
+ unregister_filesystem(&ext2_fs_type);
}
+MODULE_ALIAS("ext2");
+#else
+static inline void register_as_ext2(void) { }
+static inline void unregister_as_ext2(void) { }
+#endif
-static struct file_system_type ext4dev_fs_type = {
+#if !defined(CONTIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
+static struct file_system_type ext3_fs_type = {
.owner = THIS_MODULE,
- .name = "ext4dev",
- .get_sb = ext4dev_get_sb,
+ .name = "ext3",
+ .get_sb = ext4_get_sb,
.kill_sb = kill_block_super,
.fs_flags = FS_REQUIRES_DEV,
};
-MODULE_ALIAS("ext4dev");
+
+static inline void register_as_ext3(void)
+{
+ int err = register_filesystem(&ext3_fs_type);
+ if (err)
+ printk(KERN_WARNING
+ "EXT4-fs: Unable to register as ext3 (%d)\n", err);
+}
+
+static inline void unregister_as_ext3(void)
+{
+ unregister_filesystem(&ext3_fs_type);
+}
+MODULE_ALIAS("ext3");
+#else
+static inline void register_as_ext3(void) { }
+static inline void unregister_as_ext3(void) { }
#endif
+static struct file_system_type ext4_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "ext4",
+ .get_sb = ext4_get_sb,
+ .kill_sb = kill_block_super,
+ .fs_flags = FS_REQUIRES_DEV,
+};
+
static int __init init_ext4_fs(void)
{
int err;
@@ -3993,18 +4068,15 @@ static int __init init_ext4_fs(void)
err = init_inodecache();
if (err)
goto out1;
+ register_as_ext2();
+ register_as_ext3();
err = register_filesystem(&ext4_fs_type);
if (err)
goto out;
-#ifdef CONFIG_EXT4DEV_COMPAT
- err = register_filesystem(&ext4dev_fs_type);
- if (err) {
- unregister_filesystem(&ext4_fs_type);
- goto out;
- }
-#endif
return 0;
out:
+ unregister_as_ext2();
+ unregister_as_ext3();
destroy_inodecache();
out1:
exit_ext4_xattr();
@@ -4020,10 +4092,9 @@ out4:
static void __exit exit_ext4_fs(void)
{
+ unregister_as_ext2();
+ unregister_as_ext3();
unregister_filesystem(&ext4_fs_type);
-#ifdef CONFIG_EXT4DEV_COMPAT
- unregister_filesystem(&ext4dev_fs_type);
-#endif
destroy_inodecache();
exit_ext4_xattr();
exit_ext4_mballoc();
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index fed5b01d7a8d..f3a2f7ed45aa 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -92,7 +92,7 @@ static struct buffer_head *ext4_xattr_cache_find(struct inode *,
struct mb_cache_entry **);
static void ext4_xattr_rehash(struct ext4_xattr_header *,
struct ext4_xattr_entry *);
-static int ext4_xattr_list(struct inode *inode, char *buffer,
+static int ext4_xattr_list(struct dentry *dentry, char *buffer,
size_t buffer_size);
static struct mb_cache *ext4_xattr_cache;
@@ -140,7 +140,7 @@ ext4_xattr_handler(int name_index)
ssize_t
ext4_listxattr(struct dentry *dentry, char *buffer, size_t size)
{
- return ext4_xattr_list(dentry->d_inode, buffer, size);
+ return ext4_xattr_list(dentry, buffer, size);
}
static int
@@ -325,7 +325,7 @@ ext4_xattr_get(struct inode *inode, int name_index, const char *name,
}
static int
-ext4_xattr_list_entries(struct inode *inode, struct ext4_xattr_entry *entry,
+ext4_xattr_list_entries(struct dentry *dentry, struct ext4_xattr_entry *entry,
char *buffer, size_t buffer_size)
{
size_t rest = buffer_size;
@@ -335,9 +335,10 @@ ext4_xattr_list_entries(struct inode *inode, struct ext4_xattr_entry *entry,
ext4_xattr_handler(entry->e_name_index);
if (handler) {
- size_t size = handler->list(inode, buffer, rest,
+ size_t size = handler->list(dentry, buffer, rest,
entry->e_name,
- entry->e_name_len);
+ entry->e_name_len,
+ handler->flags);
if (buffer) {
if (size > rest)
return -ERANGE;
@@ -350,8 +351,9 @@ ext4_xattr_list_entries(struct inode *inode, struct ext4_xattr_entry *entry,
}
static int
-ext4_xattr_block_list(struct inode *inode, char *buffer, size_t buffer_size)
+ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size)
{
+ struct inode *inode = dentry->d_inode;
struct buffer_head *bh = NULL;
int error;
@@ -376,7 +378,7 @@ ext4_xattr_block_list(struct inode *inode, char *buffer, size_t buffer_size)
goto cleanup;
}
ext4_xattr_cache_insert(bh);
- error = ext4_xattr_list_entries(inode, BFIRST(bh), buffer, buffer_size);
+ error = ext4_xattr_list_entries(dentry, BFIRST(bh), buffer, buffer_size);
cleanup:
brelse(bh);
@@ -385,8 +387,9 @@ cleanup:
}
static int
-ext4_xattr_ibody_list(struct inode *inode, char *buffer, size_t buffer_size)
+ext4_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size)
{
+ struct inode *inode = dentry->d_inode;
struct ext4_xattr_ibody_header *header;
struct ext4_inode *raw_inode;
struct ext4_iloc iloc;
@@ -404,7 +407,7 @@ ext4_xattr_ibody_list(struct inode *inode, char *buffer, size_t buffer_size)
error = ext4_xattr_check_names(IFIRST(header), end);
if (error)
goto cleanup;
- error = ext4_xattr_list_entries(inode, IFIRST(header),
+ error = ext4_xattr_list_entries(dentry, IFIRST(header),
buffer, buffer_size);
cleanup:
@@ -423,12 +426,12 @@ cleanup:
* used / required on success.
*/
static int
-ext4_xattr_list(struct inode *inode, char *buffer, size_t buffer_size)
+ext4_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
{
int i_error, b_error;
- down_read(&EXT4_I(inode)->xattr_sem);
- i_error = ext4_xattr_ibody_list(inode, buffer, buffer_size);
+ down_read(&EXT4_I(dentry->d_inode)->xattr_sem);
+ i_error = ext4_xattr_ibody_list(dentry, buffer, buffer_size);
if (i_error < 0) {
b_error = 0;
} else {
@@ -436,11 +439,11 @@ ext4_xattr_list(struct inode *inode, char *buffer, size_t buffer_size)
buffer += i_error;
buffer_size -= i_error;
}
- b_error = ext4_xattr_block_list(inode, buffer, buffer_size);
+ b_error = ext4_xattr_block_list(dentry, buffer, buffer_size);
if (b_error < 0)
i_error = 0;
}
- up_read(&EXT4_I(inode)->xattr_sem);
+ up_read(&EXT4_I(dentry->d_inode)->xattr_sem);
return i_error + b_error;
}
@@ -482,9 +485,10 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
ea_bdebug(bh, "refcount now=0; freeing");
if (ce)
mb_cache_entry_free(ce);
- ext4_free_blocks(handle, inode, bh->b_blocknr, 1, 1);
get_bh(bh);
- ext4_forget(handle, 1, inode, bh, bh->b_blocknr);
+ ext4_free_blocks(handle, inode, bh, 0, 1,
+ EXT4_FREE_BLOCKS_METADATA |
+ EXT4_FREE_BLOCKS_FORGET);
} else {
le32_add_cpu(&BHDR(bh)->h_refcount, -1);
error = ext4_handle_dirty_metadata(handle, inode, bh);
@@ -832,7 +836,8 @@ inserted:
new_bh = sb_getblk(sb, block);
if (!new_bh) {
getblk_failed:
- ext4_free_blocks(handle, inode, block, 1, 1);
+ ext4_free_blocks(handle, inode, 0, block, 1,
+ EXT4_FREE_BLOCKS_METADATA);
error = -EIO;
goto cleanup;
}
@@ -988,6 +993,10 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
if (error)
goto cleanup;
+ error = ext4_journal_get_write_access(handle, is.iloc.bh);
+ if (error)
+ goto cleanup;
+
if (EXT4_I(inode)->i_state & EXT4_STATE_NEW) {
struct ext4_inode *raw_inode = ext4_raw_inode(&is.iloc);
memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
@@ -1013,9 +1022,6 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
if (flags & XATTR_CREATE)
goto cleanup;
}
- error = ext4_journal_get_write_access(handle, is.iloc.bh);
- if (error)
- goto cleanup;
if (!value) {
if (!is.s.not_found)
error = ext4_xattr_ibody_set(handle, inode, &i, &is);
@@ -1326,6 +1332,8 @@ retry:
goto cleanup;
kfree(b_entry_name);
kfree(buffer);
+ b_entry_name = NULL;
+ buffer = NULL;
brelse(is->iloc.bh);
kfree(is);
kfree(bs);
diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c
index ca5f89fc6cae..983c253999a7 100644
--- a/fs/ext4/xattr_security.c
+++ b/fs/ext4/xattr_security.c
@@ -12,8 +12,8 @@
#include "xattr.h"
static size_t
-ext4_xattr_security_list(struct inode *inode, char *list, size_t list_size,
- const char *name, size_t name_len)
+ext4_xattr_security_list(struct dentry *dentry, char *list, size_t list_size,
+ const char *name, size_t name_len, int type)
{
const size_t prefix_len = sizeof(XATTR_SECURITY_PREFIX)-1;
const size_t total_len = prefix_len + name_len + 1;
@@ -28,23 +28,23 @@ ext4_xattr_security_list(struct inode *inode, char *list, size_t list_size,
}
static int
-ext4_xattr_security_get(struct inode *inode, const char *name,
- void *buffer, size_t size)
+ext4_xattr_security_get(struct dentry *dentry, const char *name,
+ void *buffer, size_t size, int type)
{
if (strcmp(name, "") == 0)
return -EINVAL;
- return ext4_xattr_get(inode, EXT4_XATTR_INDEX_SECURITY, name,
- buffer, size);
+ return ext4_xattr_get(dentry->d_inode, EXT4_XATTR_INDEX_SECURITY,
+ name, buffer, size);
}
static int
-ext4_xattr_security_set(struct inode *inode, const char *name,
- const void *value, size_t size, int flags)
+ext4_xattr_security_set(struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags, int type)
{
if (strcmp(name, "") == 0)
return -EINVAL;
- return ext4_xattr_set(inode, EXT4_XATTR_INDEX_SECURITY, name,
- value, size, flags);
+ return ext4_xattr_set(dentry->d_inode, EXT4_XATTR_INDEX_SECURITY,
+ name, value, size, flags);
}
int
diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c
index ac1a52cf2a37..15b50edc6587 100644
--- a/fs/ext4/xattr_trusted.c
+++ b/fs/ext4/xattr_trusted.c
@@ -14,8 +14,8 @@
#include "xattr.h"
static size_t
-ext4_xattr_trusted_list(struct inode *inode, char *list, size_t list_size,
- const char *name, size_t name_len)
+ext4_xattr_trusted_list(struct dentry *dentry, char *list, size_t list_size,
+ const char *name, size_t name_len, int type)
{
const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
const size_t total_len = prefix_len + name_len + 1;
@@ -32,23 +32,23 @@ ext4_xattr_trusted_list(struct inode *inode, char *list, size_t list_size,
}
static int
-ext4_xattr_trusted_get(struct inode *inode, const char *name,
- void *buffer, size_t size)
+ext4_xattr_trusted_get(struct dentry *dentry, const char *name, void *buffer,
+ size_t size, int type)
{
if (strcmp(name, "") == 0)
return -EINVAL;
- return ext4_xattr_get(inode, EXT4_XATTR_INDEX_TRUSTED, name,
- buffer, size);
+ return ext4_xattr_get(dentry->d_inode, EXT4_XATTR_INDEX_TRUSTED,
+ name, buffer, size);
}
static int
-ext4_xattr_trusted_set(struct inode *inode, const char *name,
- const void *value, size_t size, int flags)
+ext4_xattr_trusted_set(struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags, int type)
{
if (strcmp(name, "") == 0)
return -EINVAL;
- return ext4_xattr_set(inode, EXT4_XATTR_INDEX_TRUSTED, name,
- value, size, flags);
+ return ext4_xattr_set(dentry->d_inode, EXT4_XATTR_INDEX_TRUSTED,
+ name, value, size, flags);
}
struct xattr_handler ext4_xattr_trusted_handler = {
diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c
index d91aa61b42aa..c4ce05746ce1 100644
--- a/fs/ext4/xattr_user.c
+++ b/fs/ext4/xattr_user.c
@@ -13,13 +13,13 @@
#include "xattr.h"
static size_t
-ext4_xattr_user_list(struct inode *inode, char *list, size_t list_size,
- const char *name, size_t name_len)
+ext4_xattr_user_list(struct dentry *dentry, char *list, size_t list_size,
+ const char *name, size_t name_len, int type)
{
const size_t prefix_len = XATTR_USER_PREFIX_LEN;
const size_t total_len = prefix_len + name_len + 1;
- if (!test_opt(inode->i_sb, XATTR_USER))
+ if (!test_opt(dentry->d_sb, XATTR_USER))
return 0;
if (list && total_len <= list_size) {
@@ -31,26 +31,27 @@ ext4_xattr_user_list(struct inode *inode, char *list, size_t list_size,
}
static int
-ext4_xattr_user_get(struct inode *inode, const char *name,
- void *buffer, size_t size)
+ext4_xattr_user_get(struct dentry *dentry, const char *name,
+ void *buffer, size_t size, int type)
{
if (strcmp(name, "") == 0)
return -EINVAL;
- if (!test_opt(inode->i_sb, XATTR_USER))
+ if (!test_opt(dentry->d_sb, XATTR_USER))
return -EOPNOTSUPP;
- return ext4_xattr_get(inode, EXT4_XATTR_INDEX_USER, name, buffer, size);
+ return ext4_xattr_get(dentry->d_inode, EXT4_XATTR_INDEX_USER,
+ name, buffer, size);
}
static int
-ext4_xattr_user_set(struct inode *inode, const char *name,
- const void *value, size_t size, int flags)
+ext4_xattr_user_set(struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags, int type)
{
if (strcmp(name, "") == 0)
return -EINVAL;
- if (!test_opt(inode->i_sb, XATTR_USER))
+ if (!test_opt(dentry->d_sb, XATTR_USER))
return -EOPNOTSUPP;
- return ext4_xattr_set(inode, EXT4_XATTR_INDEX_USER, name,
- value, size, flags);
+ return ext4_xattr_set(dentry->d_inode, EXT4_XATTR_INDEX_USER,
+ name, value, size, flags);
}
struct xattr_handler ext4_xattr_user_handler = {