diff options
author | Stephen Rothwell <sfr@canb.auug.org.au> | 2009-05-25 11:09:22 +1000 |
---|---|---|
committer | Stephen Rothwell <sfr@canb.auug.org.au> | 2009-05-25 11:09:22 +1000 |
commit | bc5f74e46a629318b694fdd810830a22edf6ed12 (patch) | |
tree | 1df2a251b04d70822529a3d2ed366fa924f9bc54 /fs | |
parent | f69c820220ffb703ff6cc5493ed689670ee0cd78 (diff) | |
parent | ea38fecb092cc63f115966d5a383c21373eef6d8 (diff) |
Merge commit 'ext4/next'
Diffstat (limited to 'fs')
-rw-r--r-- | fs/buffer.c | 2 | ||||
-rw-r--r-- | fs/ext2/super.c | 1 | ||||
-rw-r--r-- | fs/ext3/acl.c | 13 | ||||
-rw-r--r-- | fs/ext3/super.c | 1 | ||||
-rw-r--r-- | fs/ext4/Makefile | 4 | ||||
-rw-r--r-- | fs/ext4/acl.c | 13 | ||||
-rw-r--r-- | fs/ext4/balloc.c | 28 | ||||
-rw-r--r-- | fs/ext4/block_validity.c | 244 | ||||
-rw-r--r-- | fs/ext4/dir.c | 3 | ||||
-rw-r--r-- | fs/ext4/ext4.h | 354 | ||||
-rw-r--r-- | fs/ext4/ext4_i.h | 140 | ||||
-rw-r--r-- | fs/ext4/ext4_sb.h | 161 | ||||
-rw-r--r-- | fs/ext4/extents.c | 70 | ||||
-rw-r--r-- | fs/ext4/group.h | 29 | ||||
-rw-r--r-- | fs/ext4/ialloc.c | 70 | ||||
-rw-r--r-- | fs/ext4/inode.c | 406 | ||||
-rw-r--r-- | fs/ext4/mballoc.c | 166 | ||||
-rw-r--r-- | fs/ext4/mballoc.h | 1 | ||||
-rw-r--r-- | fs/ext4/namei.c | 22 | ||||
-rw-r--r-- | fs/ext4/namei.h | 8 | ||||
-rw-r--r-- | fs/ext4/resize.c | 36 | ||||
-rw-r--r-- | fs/ext4/super.c | 255 | ||||
-rw-r--r-- | fs/ioctl.c | 14 | ||||
-rw-r--r-- | fs/mpage.c | 6 |
24 files changed, 1169 insertions, 878 deletions
diff --git a/fs/buffer.c b/fs/buffer.c index aed297739eb0..ad0112900222 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2933,6 +2933,8 @@ int submit_bh(int rw, struct buffer_head * bh) BUG_ON(!buffer_locked(bh)); BUG_ON(!buffer_mapped(bh)); BUG_ON(!bh->b_end_io); + BUG_ON(buffer_delay(bh)); + BUG_ON(buffer_unwritten(bh)); /* * Mask in barrier bit for a write (could be either a WRITE or a diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 5c4afe652245..e3c748faf2db 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -1093,6 +1093,7 @@ failed_mount: brelse(bh); failed_sbi: sb->s_fs_info = NULL; + kfree(sbi->s_blockgroup_lock); kfree(sbi); return ret; } diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c index d81ef2fdb08e..e0c745451715 100644 --- a/fs/ext3/acl.c +++ b/fs/ext3/acl.c @@ -129,12 +129,15 @@ fail: static inline struct posix_acl * ext3_iget_acl(struct inode *inode, struct posix_acl **i_acl) { - struct posix_acl *acl = EXT3_ACL_NOT_CACHED; + struct posix_acl *acl = ACCESS_ONCE(*i_acl); - spin_lock(&inode->i_lock); - if (*i_acl != EXT3_ACL_NOT_CACHED) - acl = posix_acl_dup(*i_acl); - spin_unlock(&inode->i_lock); + if (acl) { + spin_lock(&inode->i_lock); + acl = *i_acl; + if (acl != EXT3_ACL_NOT_CACHED) + acl = posix_acl_dup(acl); + spin_unlock(&inode->i_lock); + } return acl; } diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 599dbfe504c3..d8b73d4abe3e 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -2021,6 +2021,7 @@ failed_mount: brelse(bh); out_fail: sb->s_fs_info = NULL; + kfree(sbi->s_blockgroup_lock); kfree(sbi); lock_kernel(); return ret; diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile index a8ff003a00f7..8a34710ecf40 100644 --- a/fs/ext4/Makefile +++ b/fs/ext4/Makefile @@ -5,8 +5,8 @@ obj-$(CONFIG_EXT4_FS) += ext4.o ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ - ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ - ext4_jbd2.o migrate.o mballoc.o + ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ + ext4_jbd2.o migrate.o mballoc.o block_validity.o ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index 647e0d65a284..605aeed96d68 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -129,12 +129,15 @@ fail: static inline struct posix_acl * ext4_iget_acl(struct inode *inode, struct posix_acl **i_acl) { - struct posix_acl *acl = EXT4_ACL_NOT_CACHED; + struct posix_acl *acl = ACCESS_ONCE(*i_acl); - spin_lock(&inode->i_lock); - if (*i_acl != EXT4_ACL_NOT_CACHED) - acl = posix_acl_dup(*i_acl); - spin_unlock(&inode->i_lock); + if (acl) { + spin_lock(&inode->i_lock); + acl = *i_acl; + if (acl != EXT4_ACL_NOT_CACHED) + acl = posix_acl_dup(acl); + spin_unlock(&inode->i_lock); + } return acl; } diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 53c72ad85877..e2126d70dff5 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -19,7 +19,6 @@ #include <linux/buffer_head.h> #include "ext4.h" #include "ext4_jbd2.h" -#include "group.h" #include "mballoc.h" /* @@ -88,6 +87,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, ext4_group_t block_group, struct ext4_group_desc *gdp) { int bit, bit_max; + ext4_group_t ngroups = ext4_get_groups_count(sb); unsigned free_blocks, group_blocks; struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -123,7 +123,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, bit_max += ext4_bg_num_gdb(sb, block_group); } - if (block_group == sbi->s_groups_count - 1) { + if (block_group == ngroups - 1) { /* * Even though mke2fs always initialize first and last group * if some other tool enabled the EXT4_BG_BLOCK_UNINIT we need @@ -131,7 +131,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, */ group_blocks = ext4_blocks_count(sbi->s_es) - le32_to_cpu(sbi->s_es->s_first_data_block) - - (EXT4_BLOCKS_PER_GROUP(sb) * (sbi->s_groups_count - 1)); + (EXT4_BLOCKS_PER_GROUP(sb) * (ngroups - 1)); } else { group_blocks = EXT4_BLOCKS_PER_GROUP(sb); } @@ -205,18 +205,18 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb, { unsigned int group_desc; unsigned int offset; + ext4_group_t ngroups = ext4_get_groups_count(sb); struct ext4_group_desc *desc; struct ext4_sb_info *sbi = EXT4_SB(sb); - if (block_group >= sbi->s_groups_count) { + if (block_group >= ngroups) { ext4_error(sb, "ext4_get_group_desc", "block_group >= groups_count - " "block_group = %u, groups_count = %u", - block_group, sbi->s_groups_count); + block_group, ngroups); return NULL; } - smp_rmb(); group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb); offset = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1); @@ -326,16 +326,16 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) unlock_buffer(bh); return bh; } - spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group)); + ext4_lock_group(sb, block_group); if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { ext4_init_block_bitmap(sb, bh, block_group, desc); set_bitmap_uptodate(bh); set_buffer_uptodate(bh); - spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); + ext4_unlock_group(sb, block_group); unlock_buffer(bh); return bh; } - spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); + ext4_unlock_group(sb, block_group); if (buffer_uptodate(bh)) { /* * if not uninit if bh is uptodate, @@ -451,7 +451,7 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, down_write(&grp->alloc_sem); for (i = 0, blocks_freed = 0; i < count; i++) { BUFFER_TRACE(bitmap_bh, "clear bit"); - if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group), + if (!ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group), bit + i, bitmap_bh->b_data)) { ext4_error(sb, __func__, "bit already cleared for block %llu", @@ -461,11 +461,11 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, blocks_freed++; } } - spin_lock(sb_bgl_lock(sbi, block_group)); + ext4_lock_group(sb, block_group); blk_free_count = blocks_freed + ext4_free_blks_count(sb, desc); ext4_free_blks_set(sb, desc, blk_free_count); desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc); - spin_unlock(sb_bgl_lock(sbi, block_group)); + ext4_unlock_group(sb, block_group); percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed); if (sbi->s_log_groups_per_flex) { @@ -665,7 +665,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb) ext4_fsblk_t desc_count; struct ext4_group_desc *gdp; ext4_group_t i; - ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; + ext4_group_t ngroups = ext4_get_groups_count(sb); #ifdef EXT4FS_DEBUG struct ext4_super_block *es; ext4_fsblk_t bitmap_count; @@ -677,7 +677,6 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb) bitmap_count = 0; gdp = NULL; - smp_rmb(); for (i = 0; i < ngroups; i++) { gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) @@ -700,7 +699,6 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb) return bitmap_count; #else desc_count = 0; - smp_rmb(); for (i = 0; i < ngroups; i++) { gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c new file mode 100644 index 000000000000..50784ef07563 --- /dev/null +++ b/fs/ext4/block_validity.c @@ -0,0 +1,244 @@ +/* + * linux/fs/ext4/block_validity.c + * + * Copyright (C) 2009 + * Theodore Ts'o (tytso@mit.edu) + * + * Track which blocks in the filesystem are metadata blocks that + * should never be used as data blocks by files or directories. + */ + +#include <linux/time.h> +#include <linux/fs.h> +#include <linux/namei.h> +#include <linux/quotaops.h> +#include <linux/buffer_head.h> +#include <linux/module.h> +#include <linux/swap.h> +#include <linux/pagemap.h> +#include <linux/version.h> +#include <linux/blkdev.h> +#include <linux/mutex.h> +#include "ext4.h" + +struct ext4_system_zone { + struct rb_node node; + ext4_fsblk_t start_blk; + unsigned int count; +}; + +static struct kmem_cache *ext4_system_zone_cachep; + +int __init init_ext4_system_zone(void) +{ + ext4_system_zone_cachep = KMEM_CACHE(ext4_system_zone, + SLAB_RECLAIM_ACCOUNT); + if (ext4_system_zone_cachep == NULL) + return -ENOMEM; + return 0; +} + +void exit_ext4_system_zone(void) +{ + kmem_cache_destroy(ext4_system_zone_cachep); +} + +static inline int can_merge(struct ext4_system_zone *entry1, + struct ext4_system_zone *entry2) +{ + if ((entry1->start_blk + entry1->count) == entry2->start_blk) + return 1; + return 0; +} + +/* + * Mark a range of blocks as belonging to the "system zone" --- that + * is, filesystem metadata blocks which should never be used by + * inodes. + */ +static int add_system_zone(struct ext4_sb_info *sbi, + ext4_fsblk_t start_blk, + unsigned int count) +{ + struct ext4_system_zone *new_entry = NULL, *entry; + struct rb_node **n = &sbi->system_blks.rb_node, *node; + struct rb_node *parent = NULL, *new_node = NULL; + + while (*n) { + parent = *n; + entry = rb_entry(parent, struct ext4_system_zone, node); + if (start_blk < entry->start_blk) + n = &(*n)->rb_left; + else if (start_blk >= (entry->start_blk + entry->count)) + n = &(*n)->rb_right; + else { + if (start_blk + count > (entry->start_blk + + entry->count)) + entry->count = (start_blk + count - + entry->start_blk); + new_node = *n; + new_entry = rb_entry(new_node, struct ext4_system_zone, + node); + break; + } + } + + if (!new_entry) { + new_entry = kmem_cache_alloc(ext4_system_zone_cachep, + GFP_KERNEL); + if (!new_entry) + return -ENOMEM; + new_entry->start_blk = start_blk; + new_entry->count = count; + new_node = &new_entry->node; + + rb_link_node(new_node, parent, n); + rb_insert_color(new_node, &sbi->system_blks); + } + + /* Can we merge to the left? */ + node = rb_prev(new_node); + if (node) { + entry = rb_entry(node, struct ext4_system_zone, node); + if (can_merge(entry, new_entry)) { + new_entry->start_blk = entry->start_blk; + new_entry->count += entry->count; + rb_erase(node, &sbi->system_blks); + kmem_cache_free(ext4_system_zone_cachep, entry); + } + } + + /* Can we merge to the right? */ + node = rb_next(new_node); + if (node) { + entry = rb_entry(node, struct ext4_system_zone, node); + if (can_merge(new_entry, entry)) { + new_entry->count += entry->count; + rb_erase(node, &sbi->system_blks); + kmem_cache_free(ext4_system_zone_cachep, entry); + } + } + return 0; +} + +static void debug_print_tree(struct ext4_sb_info *sbi) +{ + struct rb_node *node; + struct ext4_system_zone *entry; + int first = 1; + + printk(KERN_INFO "System zones: "); + node = rb_first(&sbi->system_blks); + while (node) { + entry = rb_entry(node, struct ext4_system_zone, node); + printk("%s%llu-%llu", first ? "" : ", ", + entry->start_blk, entry->start_blk + entry->count - 1); + first = 0; + node = rb_next(node); + } + printk("\n"); +} + +int ext4_setup_system_zone(struct super_block *sb) +{ + ext4_group_t ngroups = ext4_get_groups_count(sb); + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_group_desc *gdp; + ext4_group_t i; + int flex_size = ext4_flex_bg_size(sbi); + int ret; + + if (!test_opt(sb, BLOCK_VALIDITY)) { + if (EXT4_SB(sb)->system_blks.rb_node) + ext4_release_system_zone(sb); + return 0; + } + if (EXT4_SB(sb)->system_blks.rb_node) + return 0; + + for (i=0; i < ngroups; i++) { + if (ext4_bg_has_super(sb, i) && + ((i < 5) || ((i % flex_size) == 0))) + add_system_zone(sbi, ext4_group_first_block_no(sb, i), + sbi->s_gdb_count + 1); + gdp = ext4_get_group_desc(sb, i, NULL); + ret = add_system_zone(sbi, ext4_block_bitmap(sb, gdp), 1); + if (ret) + return ret; + ret = add_system_zone(sbi, ext4_inode_bitmap(sb, gdp), 1); + if (ret) + return ret; + ret = add_system_zone(sbi, ext4_inode_table(sb, gdp), + sbi->s_itb_per_group); + if (ret) + return ret; + } + + if (test_opt(sb, DEBUG)) + debug_print_tree(EXT4_SB(sb)); + return 0; +} + +/* Called when the filesystem is unmounted */ +void ext4_release_system_zone(struct super_block *sb) +{ + struct rb_node *n = EXT4_SB(sb)->system_blks.rb_node; + struct rb_node *parent; + struct ext4_system_zone *entry; + + while (n) { + /* Do the node's children first */ + if (n->rb_left) { + n = n->rb_left; + continue; + } + if (n->rb_right) { + n = n->rb_right; + continue; + } + /* + * The node has no children; free it, and then zero + * out parent's link to it. Finally go to the + * beginning of the loop and try to free the parent + * node. + */ + parent = rb_parent(n); + entry = rb_entry(n, struct ext4_system_zone, node); + kmem_cache_free(ext4_system_zone_cachep, entry); + if (!parent) + EXT4_SB(sb)->system_blks.rb_node = NULL; + else if (parent->rb_left == n) + parent->rb_left = NULL; + else if (parent->rb_right == n) + parent->rb_right = NULL; + n = parent; + } + EXT4_SB(sb)->system_blks.rb_node = NULL; +} + +/* + * Returns 1 if the passed-in block region (start_blk, + * start_blk+count) is valid; 0 if some part of the block region + * overlaps with filesystem metadata blocks. + */ +int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk, + unsigned int count) +{ + struct ext4_system_zone *entry; + struct rb_node *n = sbi->system_blks.rb_node; + + if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) || + (start_blk + count > ext4_blocks_count(sbi->s_es))) + return 0; + while (n) { + entry = rb_entry(n, struct ext4_system_zone, node); + if (start_blk + count - 1 < entry->start_blk) + n = n->rb_left; + else if (start_blk >= (entry->start_blk + entry->count)) + n = n->rb_right; + else + return 0; + } + return 1; +} + diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index b64789929a65..9dc93168e262 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -131,8 +131,7 @@ static int ext4_readdir(struct file *filp, struct buffer_head *bh = NULL; map_bh.b_state = 0; - err = ext4_get_blocks_wrap(NULL, inode, blk, 1, &map_bh, - 0, 0, 0); + err = ext4_get_blocks(NULL, inode, blk, 1, &map_bh, 0); if (err > 0) { pgoff_t index = map_bh.b_blocknr >> (PAGE_CACHE_SHIFT - inode->i_blkbits); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index d0f15ef56de1..4311cc85b534 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -21,7 +21,14 @@ #include <linux/magic.h> #include <linux/jbd2.h> #include <linux/quota.h> -#include "ext4_i.h" +#include <linux/rwsem.h> +#include <linux/rbtree.h> +#include <linux/seqlock.h> +#include <linux/mutex.h> +#include <linux/timer.h> +#include <linux/wait.h> +#include <linux/blockgroup_lock.h> +#include <linux/percpu_counter.h> /* * The fourth extended filesystem constants/structures @@ -46,6 +53,19 @@ #define ext4_debug(f, a...) do {} while (0) #endif +/* data type for block offset of block group */ +typedef int ext4_grpblk_t; + +/* data type for filesystem-wide blocks number */ +typedef unsigned long long ext4_fsblk_t; + +/* data type for file logical block number */ +typedef __u32 ext4_lblk_t; + +/* data type for block group number */ +typedef unsigned int ext4_group_t; + + /* prefer goal again. length */ #define EXT4_MB_HINT_MERGE 1 /* blocks already reserved */ @@ -179,9 +199,6 @@ struct flex_groups { #define EXT4_BG_BLOCK_UNINIT 0x0002 /* Block bitmap not in use */ #define EXT4_BG_INODE_ZEROED 0x0004 /* On-disk itable initialized to zero */ -#ifdef __KERNEL__ -#include "ext4_sb.h" -#endif /* * Macro-instructions used to manage group descriptors */ @@ -297,10 +314,25 @@ struct ext4_new_group_data { }; /* - * Following is used by preallocation code to tell get_blocks() that we - * want uninitialzed extents. + * Flags used by ext4_get_blocks() */ -#define EXT4_CREATE_UNINITIALIZED_EXT 2 + /* Allocate any needed blocks and/or convert an unitialized + extent to be an initialized ext4 */ +#define EXT4_GET_BLOCKS_CREATE 0x0001 + /* Request the creation of an unitialized extent */ +#define EXT4_GET_BLOCKS_UNINIT_EXT 0x0002 +#define EXT4_GET_BLOCKS_CREATE_UNINIT_EXT (EXT4_GET_BLOCKS_UNINIT_EXT|\ + EXT4_GET_BLOCKS_CREATE) + /* Update the ext4_inode_info i_disksize field */ +#define EXT4_GET_BLOCKS_EXTEND_DISKSIZE 0x0004 + /* Caller is from the delayed allocation writeout path, + so set the magic i_delalloc_reserve_flag after taking the + inode allocation semaphore for */ +#define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0008 + /* Call ext4_da_update_reserve_space() after successfully + allocating the blocks */ +#define EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE 0x0010 + /* * ioctl commands @@ -516,6 +548,110 @@ do { \ #endif /* defined(__KERNEL__) || defined(__linux__) */ /* + * storage for cached extent + */ +struct ext4_ext_cache { + ext4_fsblk_t ec_start; + ext4_lblk_t ec_block; + __u32 ec_len; /* must be 32bit to return holes */ + __u32 ec_type; +}; + +/* + * fourth extended file system inode data in memory + */ +struct ext4_inode_info { + __le32 i_data[15]; /* unconverted */ + __u32 i_flags; + ext4_fsblk_t i_file_acl; + __u32 i_dtime; + + /* + * i_block_group is the number of the block group which contains + * this file's inode. Constant across the lifetime of the inode, + * it is ued for making block allocation decisions - we try to + * place a file's data blocks near its inode block, and new inodes + * near to their parent directory's inode. + */ + ext4_group_t i_block_group; + __u32 i_state; /* Dynamic state flags for ext4 */ + + ext4_lblk_t i_dir_start_lookup; +#ifdef CONFIG_EXT4_FS_XATTR + /* + * Extended attributes can be read independently of the main file + * data. Taking i_mutex even when reading would cause contention + * between readers of EAs and writers of regular file data, so + * instead we synchronize on xattr_sem when reading or changing + * EAs. + */ + struct rw_semaphore xattr_sem; +#endif +#ifdef CONFIG_EXT4_FS_POSIX_ACL + struct posix_acl *i_acl; + struct posix_acl *i_default_acl; +#endif + + struct list_head i_orphan; /* unlinked but open inodes */ + + /* + * i_disksize keeps track of what the inode size is ON DISK, not + * in memory. During truncate, i_size is set to the new size by + * the VFS prior to calling ext4_truncate(), but the filesystem won't + * set i_disksize to 0 until the truncate is actually under way. + * + * The intent is that i_disksize always represents the blocks which + * are used by this file. This allows recovery to restart truncate + * on orphans if we crash during truncate. We actually write i_disksize + * into the on-disk inode when writing inodes out, instead of i_size. + * + * The only time when i_disksize and i_size may be different is when + * a truncate is in progress. The only things which change i_disksize + * are ext4_get_block (growth) and ext4_truncate (shrinkth). + */ + loff_t i_disksize; + + /* + * i_data_sem is for serialising ext4_truncate() against + * ext4_getblock(). In the 2.4 ext2 design, great chunks of inode's + * data tree are chopped off during truncate. We can't do that in + * ext4 because whenever we perform intermediate commits during + * truncate, the inode and all the metadata blocks *must* be in a + * consistent state which allows truncation of the orphans to restart + * during recovery. Hence we must fix the get_block-vs-truncate race + * by other means, so we have i_data_sem. + */ + struct rw_semaphore i_data_sem; + struct inode vfs_inode; + struct jbd2_inode jinode; + + struct ext4_ext_cache i_cached_extent; + /* + * File creation time. Its function is same as that of + * struct timespec i_{a,c,m}time in the generic inode. + */ + struct timespec i_crtime; + + /* mballoc */ + struct list_head i_prealloc_list; + spinlock_t i_prealloc_lock; + + /* ialloc */ + ext4_group_t i_last_alloc_group; + + /* allocation reservation info for delalloc */ + unsigned int i_reserved_data_blocks; + unsigned int i_reserved_meta_blocks; + unsigned int i_allocated_meta_blocks; + unsigned short i_delalloc_reserved_flag; + + /* on-disk additional length */ + __u16 i_extra_isize; + + spinlock_t i_block_reservation_lock; +}; + +/* * File system states */ #define EXT4_VALID_FS 0x0001 /* Unmounted cleanly */ @@ -560,6 +696,7 @@ do { \ #define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ #define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ #define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */ +#define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */ /* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */ #ifndef _LINUX_EXT2_FS_H @@ -689,6 +826,137 @@ struct ext4_super_block { }; #ifdef __KERNEL__ +/* + * fourth extended-fs super-block data in memory + */ +struct ext4_sb_info { + unsigned long s_desc_size; /* Size of a group descriptor in bytes */ + unsigned long s_inodes_per_block;/* Number of inodes per block */ + unsigned long s_blocks_per_group;/* Number of blocks in a group */ + unsigned long s_inodes_per_group;/* Number of inodes in a group */ + unsigned long s_itb_per_group; /* Number of inode table blocks per group */ + unsigned long s_gdb_count; /* Number of group descriptor blocks */ + unsigned long s_desc_per_block; /* Number of group descriptors per block */ + ext4_group_t s_groups_count; /* Number of groups in the fs */ + unsigned long s_overhead_last; /* Last calculated overhead */ + unsigned long s_blocks_last; /* Last seen block count */ + loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */ + struct buffer_head * s_sbh; /* Buffer containing the super block */ + struct ext4_super_block *s_es; /* Pointer to the super block in the buffer */ + struct buffer_head **s_group_desc; + unsigned long s_mount_opt; + ext4_fsblk_t s_sb_block; + uid_t s_resuid; + gid_t s_resgid; + unsigned short s_mount_state; + unsigned short s_pad; + int s_addr_per_block_bits; + int s_desc_per_block_bits; + int s_inode_size; + int s_first_ino; + unsigned int s_inode_readahead_blks; + spinlock_t s_next_gen_lock; + u32 s_next_generation; + u32 s_hash_seed[4]; + int s_def_hash_version; + int s_hash_unsigned; /* 3 if hash should be signed, 0 if not */ + struct percpu_counter s_freeblocks_counter; + struct percpu_counter s_freeinodes_counter; + struct percpu_counter s_dirs_counter; + struct percpu_counter s_dirtyblocks_counter; + struct blockgroup_lock *s_blockgroup_lock; + struct proc_dir_entry *s_proc; + struct kobject s_kobj; + struct completion s_kobj_unregister; + + /* Journaling */ + struct inode *s_journal_inode; + struct journal_s *s_journal; + struct list_head s_orphan; + struct mutex s_orphan_lock; + struct mutex s_resize_lock; + unsigned long s_commit_interval; + u32 s_max_batch_time; + u32 s_min_batch_time; + struct block_device *journal_bdev; +#ifdef CONFIG_JBD2_DEBUG + struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */ + wait_queue_head_t ro_wait_queue; /* For people waiting for the fs to go read-only */ +#endif +#ifdef CONFIG_QUOTA + char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */ + int s_jquota_fmt; /* Format of quota to use */ +#endif + unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */ + struct rb_root system_blks; + +#ifdef EXTENTS_STATS + /* ext4 extents stats */ + unsigned long s_ext_min; + unsigned long s_ext_max; + unsigned long s_depth_max; + spinlock_t s_ext_stats_lock; + unsigned long s_ext_blocks; + unsigned long s_ext_extents; +#endif + + /* for buddy allocator */ + struct ext4_group_info ***s_group_info; + struct inode *s_buddy_cache; + long s_blocks_reserved; + spinlock_t s_reserve_lock; + spinlock_t s_md_lock; + tid_t s_last_transaction; + unsigned short *s_mb_offsets; + unsigned int *s_mb_maxs; + + /* tunables */ + unsigned long s_stripe; + unsigned int s_mb_stream_request; + unsigned int s_mb_max_to_scan; + unsigned int s_mb_min_to_scan; + unsigned int s_mb_stats; + unsigned int s_mb_order2_reqs; + unsigned int s_mb_group_prealloc; + /* where last allocation was done - for stream allocation */ + unsigned long s_mb_last_group; + unsigned long s_mb_last_start; + + /* history to debug policy */ + struct ext4_mb_history *s_mb_history; + int s_mb_history_cur; + int s_mb_history_max; + int s_mb_history_num; + spinlock_t s_mb_history_lock; + int s_mb_history_filter; + + /* stats for buddy allocator */ + spinlock_t s_mb_pa_lock; + atomic_t s_bal_reqs; /* number of reqs with len > 1 */ + atomic_t s_bal_success; /* we found long enough chunks */ + atomic_t s_bal_allocated; /* in blocks */ + atomic_t s_bal_ex_scanned; /* total extents scanned */ + atomic_t s_bal_goals; /* goal hits */ + atomic_t s_bal_breaks; /* too long searches */ + atomic_t s_bal_2orders; /* 2^order hits */ + spinlock_t s_bal_lock; + unsigned long s_mb_buddies_generated; + unsigned long long s_mb_generation_time; + atomic_t s_mb_lost_chunks; + atomic_t s_mb_preallocated; + atomic_t s_mb_discarded; + + /* locality groups */ + struct ext4_locality_group *s_locality_groups; + + /* for write statistics */ + unsigned long s_sectors_written_start; + u64 s_kbytes_written; + + unsigned int s_log_groups_per_flex; + struct flex_groups *s_flex_groups; +}; + static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) { return sb->s_fs_info; @@ -704,7 +972,6 @@ static inline struct timespec ext4_current_time(struct inode *inode) current_fs_time(inode->i_sb) : CURRENT_TIME_SEC; } - static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) { return ino == EXT4_ROOT_INO || @@ -1014,6 +1281,14 @@ extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, ext4_group_t block_group, struct buffer_head ** bh); extern int ext4_should_retry_alloc(struct super_block *sb, int *retries); +struct buffer_head *ext4_read_block_bitmap(struct super_block *sb, + ext4_group_t block_group); +extern unsigned ext4_init_block_bitmap(struct super_block *sb, + struct buffer_head *bh, + ext4_group_t group, + struct ext4_group_desc *desc); +#define ext4_free_blocks_after_init(sb, group, desc) \ + ext4_init_block_bitmap(sb, NULL, group, desc) /* dir.c */ extern int ext4_check_dir_entry(const char *, struct inode *, @@ -1038,6 +1313,11 @@ extern struct inode * ext4_orphan_get(struct super_block *, unsigned long); extern unsigned long ext4_count_free_inodes(struct super_block *); extern unsigned long ext4_count_dirs(struct super_block *); extern void ext4_check_inodes_bitmap(struct super_block *); +extern unsigned ext4_init_inode_bitmap(struct super_block *sb, + struct buffer_head *bh, + ext4_group_t group, + struct ext4_group_desc *desc); +extern void mark_bitmap_end(int start_bit, int end_bit, char *bitmap); /* mballoc.c */ extern long ext4_mb_stats; @@ -1161,6 +1441,10 @@ extern void ext4_used_dirs_set(struct super_block *sb, struct ext4_group_desc *bg, __u32 count); extern void ext4_itable_unused_set(struct super_block *sb, struct ext4_group_desc *bg, __u32 count); +extern __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 group, + struct ext4_group_desc *gdp); +extern int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 group, + struct ext4_group_desc *gdp); static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es) { @@ -1228,6 +1512,18 @@ struct ext4_group_info *ext4_get_group_info(struct super_block *sb, return grp_info[indexv][indexh]; } +/* + * Reading s_groups_count requires using smp_rmb() afterwards. See + * the locking protocol documented in the comments of ext4_group_add() + * in resize.c + */ +static inline ext4_group_t ext4_get_groups_count(struct super_block *sb) +{ + ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; + + smp_rmb(); + return ngroups; +} static inline ext4_group_t ext4_flex_group(struct ext4_sb_info *sbi, ext4_group_t block_group) @@ -1283,33 +1579,25 @@ struct ext4_group_info { }; #define EXT4_GROUP_INFO_NEED_INIT_BIT 0 -#define EXT4_GROUP_INFO_LOCKED_BIT 1 #define EXT4_MB_GRP_NEED_INIT(grp) \ (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state))) -static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group) +static inline spinlock_t *ext4_group_lock_ptr(struct super_block *sb, + ext4_group_t group) { - struct ext4_group_info *grinfo = ext4_get_group_info(sb, group); - - bit_spin_lock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state)); + return bgl_lock_ptr(EXT4_SB(sb)->s_blockgroup_lock, group); } -static inline void ext4_unlock_group(struct super_block *sb, - ext4_group_t group) +static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group) { - struct ext4_group_info *grinfo = ext4_get_group_info(sb, group); - - bit_spin_unlock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state)); + spin_lock(ext4_group_lock_ptr(sb, group)); } -static inline int ext4_is_group_locked(struct super_block *sb, +static inline void ext4_unlock_group(struct super_block *sb, ext4_group_t group) { - struct ext4_group_info *grinfo = ext4_get_group_info(sb, group); - - return bit_spin_is_locked(EXT4_GROUP_INFO_LOCKED_BIT, - &(grinfo->bb_state)); + spin_unlock(ext4_group_lock_ptr(sb, group)); } /* @@ -1326,11 +1614,21 @@ extern const struct file_operations ext4_file_operations; /* namei.c */ extern const struct inode_operations ext4_dir_inode_operations; extern const struct inode_operations ext4_special_inode_operations; +extern struct dentry *ext4_get_parent(struct dentry *child); /* symlink.c */ extern const struct inode_operations ext4_symlink_inode_operations; extern const struct inode_operations ext4_fast_symlink_inode_operations; +/* block_validity */ +extern void ext4_release_system_zone(struct super_block *sb); +extern int ext4_setup_system_zone(struct super_block *sb); +extern int __init init_ext4_system_zone(void); +extern void exit_ext4_system_zone(void); +extern int ext4_data_block_valid(struct ext4_sb_info *sbi, + ext4_fsblk_t start_blk, + unsigned int count); + /* extents.c */ extern int ext4_ext_tree_init(handle_t *handle, struct inode *); extern int ext4_ext_writepage_trans_blocks(struct inode *, int); @@ -1338,17 +1636,15 @@ extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk); extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, ext4_lblk_t iblock, unsigned int max_blocks, - struct buffer_head *bh_result, - int create, int extend_disksize); + struct buffer_head *bh_result, int flags); extern void ext4_ext_truncate(struct inode *); extern void ext4_ext_init(struct super_block *); extern void ext4_ext_release(struct super_block *); extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len); -extern int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, - sector_t block, unsigned int max_blocks, - struct buffer_head *bh, int create, - int extend_disksize, int flag); +extern int ext4_get_blocks(handle_t *handle, struct inode *inode, + sector_t block, unsigned int max_blocks, + struct buffer_head *bh, int flags); extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h deleted file mode 100644 index 4ce2187123aa..000000000000 --- a/fs/ext4/ext4_i.h +++ /dev/null @@ -1,140 +0,0 @@ -/* - * ext4_i.h - * - * Copyright (C) 1992, 1993, 1994, 1995 - * Remy Card (card@masi.ibp.fr) - * Laboratoire MASI - Institut Blaise Pascal - * Universite Pierre et Marie Curie (Paris VI) - * - * from - * - * linux/include/linux/minix_fs_i.h - * - * Copyright (C) 1991, 1992 Linus Torvalds - */ - -#ifndef _EXT4_I -#define _EXT4_I - -#include <linux/rwsem.h> -#include <linux/rbtree.h> -#include <linux/seqlock.h> -#include <linux/mutex.h> - -/* data type for block offset of block group */ -typedef int ext4_grpblk_t; - -/* data type for filesystem-wide blocks number */ -typedef unsigned long long ext4_fsblk_t; - -/* data type for file logical block number */ -typedef __u32 ext4_lblk_t; - -/* data type for block group number */ -typedef unsigned int ext4_group_t; - -/* - * storage for cached extent - */ -struct ext4_ext_cache { - ext4_fsblk_t ec_start; - ext4_lblk_t ec_block; - __u32 ec_len; /* must be 32bit to return holes */ - __u32 ec_type; -}; - -/* - * fourth extended file system inode data in memory - */ -struct ext4_inode_info { - __le32 i_data[15]; /* unconverted */ - __u32 i_flags; - ext4_fsblk_t i_file_acl; - __u32 i_dtime; - - /* - * i_block_group is the number of the block group which contains - * this file's inode. Constant across the lifetime of the inode, - * it is ued for making block allocation decisions - we try to - * place a file's data blocks near its inode block, and new inodes - * near to their parent directory's inode. - */ - ext4_group_t i_block_group; - __u32 i_state; /* Dynamic state flags for ext4 */ - - ext4_lblk_t i_dir_start_lookup; -#ifdef CONFIG_EXT4_FS_XATTR - /* - * Extended attributes can be read independently of the main file - * data. Taking i_mutex even when reading would cause contention - * between readers of EAs and writers of regular file data, so - * instead we synchronize on xattr_sem when reading or changing - * EAs. - */ - struct rw_semaphore xattr_sem; -#endif -#ifdef CONFIG_EXT4_FS_POSIX_ACL - struct posix_acl *i_acl; - struct posix_acl *i_default_acl; -#endif - - struct list_head i_orphan; /* unlinked but open inodes */ - - /* - * i_disksize keeps track of what the inode size is ON DISK, not - * in memory. During truncate, i_size is set to the new size by - * the VFS prior to calling ext4_truncate(), but the filesystem won't - * set i_disksize to 0 until the truncate is actually under way. - * - * The intent is that i_disksize always represents the blocks which - * are used by this file. This allows recovery to restart truncate - * on orphans if we crash during truncate. We actually write i_disksize - * into the on-disk inode when writing inodes out, instead of i_size. - * - * The only time when i_disksize and i_size may be different is when - * a truncate is in progress. The only things which change i_disksize - * are ext4_get_block (growth) and ext4_truncate (shrinkth). - */ - loff_t i_disksize; - - /* - * i_data_sem is for serialising ext4_truncate() against - * ext4_getblock(). In the 2.4 ext2 design, great chunks of inode's - * data tree are chopped off during truncate. We can't do that in - * ext4 because whenever we perform intermediate commits during - * truncate, the inode and all the metadata blocks *must* be in a - * consistent state which allows truncation of the orphans to restart - * during recovery. Hence we must fix the get_block-vs-truncate race - * by other means, so we have i_data_sem. - */ - struct rw_semaphore i_data_sem; - struct inode vfs_inode; - struct jbd2_inode jinode; - - struct ext4_ext_cache i_cached_extent; - /* - * File creation time. Its function is same as that of - * struct timespec i_{a,c,m}time in the generic inode. - */ - struct timespec i_crtime; - - /* mballoc */ - struct list_head i_prealloc_list; - spinlock_t i_prealloc_lock; - - /* ialloc */ - ext4_group_t i_last_alloc_group; - - /* allocation reservation info for delalloc */ - unsigned int i_reserved_data_blocks; - unsigned int i_reserved_meta_blocks; - unsigned int i_allocated_meta_blocks; - unsigned short i_delalloc_reserved_flag; - - /* on-disk additional length */ - __u16 i_extra_isize; - - spinlock_t i_block_reservation_lock; -}; - -#endif /* _EXT4_I */ diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h deleted file mode 100644 index 57b71fefbccf..000000000000 --- a/fs/ext4/ext4_sb.h +++ /dev/null @@ -1,161 +0,0 @@ -/* - * ext4_sb.h - * - * Copyright (C) 1992, 1993, 1994, 1995 - * Remy Card (card@masi.ibp.fr) - * Laboratoire MASI - Institut Blaise Pascal - * Universite Pierre et Marie Curie (Paris VI) - * - * from - * - * linux/include/linux/minix_fs_sb.h - * - * Copyright (C) 1991, 1992 Linus Torvalds - */ - -#ifndef _EXT4_SB -#define _EXT4_SB - -#ifdef __KERNEL__ -#include <linux/timer.h> -#include <linux/wait.h> -#include <linux/blockgroup_lock.h> -#include <linux/percpu_counter.h> -#endif -#include <linux/rbtree.h> - -/* - * fourth extended-fs super-block data in memory - */ -struct ext4_sb_info { - unsigned long s_desc_size; /* Size of a group descriptor in bytes */ - unsigned long s_inodes_per_block;/* Number of inodes per block */ - unsigned long s_blocks_per_group;/* Number of blocks in a group */ - unsigned long s_inodes_per_group;/* Number of inodes in a group */ - unsigned long s_itb_per_group; /* Number of inode table blocks per group */ - unsigned long s_gdb_count; /* Number of group descriptor blocks */ - unsigned long s_desc_per_block; /* Number of group descriptors per block */ - ext4_group_t s_groups_count; /* Number of groups in the fs */ - unsigned long s_overhead_last; /* Last calculated overhead */ - unsigned long s_blocks_last; /* Last seen block count */ - loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */ - struct buffer_head * s_sbh; /* Buffer containing the super block */ - struct ext4_super_block *s_es; /* Pointer to the super block in the buffer */ - struct buffer_head **s_group_desc; - unsigned long s_mount_opt; - ext4_fsblk_t s_sb_block; - uid_t s_resuid; - gid_t s_resgid; - unsigned short s_mount_state; - unsigned short s_pad; - int s_addr_per_block_bits; - int s_desc_per_block_bits; - int s_inode_size; - int s_first_ino; - unsigned int s_inode_readahead_blks; - spinlock_t s_next_gen_lock; - u32 s_next_generation; - u32 s_hash_seed[4]; - int s_def_hash_version; - int s_hash_unsigned; /* 3 if hash should be signed, 0 if not */ - struct percpu_counter s_freeblocks_counter; - struct percpu_counter s_freeinodes_counter; - struct percpu_counter s_dirs_counter; - struct percpu_counter s_dirtyblocks_counter; - struct blockgroup_lock *s_blockgroup_lock; - struct proc_dir_entry *s_proc; - struct kobject s_kobj; - struct completion s_kobj_unregister; - - /* Journaling */ - struct inode *s_journal_inode; - struct journal_s *s_journal; - struct list_head s_orphan; - unsigned long s_commit_interval; - u32 s_max_batch_time; - u32 s_min_batch_time; - struct block_device *journal_bdev; -#ifdef CONFIG_JBD2_DEBUG - struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */ - wait_queue_head_t ro_wait_queue; /* For people waiting for the fs to go read-only */ -#endif -#ifdef CONFIG_QUOTA - char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */ - int s_jquota_fmt; /* Format of quota to use */ -#endif - unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */ - -#ifdef EXTENTS_STATS - /* ext4 extents stats */ - unsigned long s_ext_min; - unsigned long s_ext_max; - unsigned long s_depth_max; - spinlock_t s_ext_stats_lock; - unsigned long s_ext_blocks; - unsigned long s_ext_extents; -#endif - - /* for buddy allocator */ - struct ext4_group_info ***s_group_info; - struct inode *s_buddy_cache; - long s_blocks_reserved; - spinlock_t s_reserve_lock; - spinlock_t s_md_lock; - tid_t s_last_transaction; - unsigned short *s_mb_offsets; - unsigned int *s_mb_maxs; - - /* tunables */ - unsigned long s_stripe; - unsigned int s_mb_stream_request; - unsigned int s_mb_max_to_scan; - unsigned int s_mb_min_to_scan; - unsigned int s_mb_stats; - unsigned int s_mb_order2_reqs; - unsigned int s_mb_group_prealloc; - /* where last allocation was done - for stream allocation */ - unsigned long s_mb_last_group; - unsigned long s_mb_last_start; - - /* history to debug policy */ - struct ext4_mb_history *s_mb_history; - int s_mb_history_cur; - int s_mb_history_max; - int s_mb_history_num; - spinlock_t s_mb_history_lock; - int s_mb_history_filter; - - /* stats for buddy allocator */ - spinlock_t s_mb_pa_lock; - atomic_t s_bal_reqs; /* number of reqs with len > 1 */ - atomic_t s_bal_success; /* we found long enough chunks */ - atomic_t s_bal_allocated; /* in blocks */ - atomic_t s_bal_ex_scanned; /* total extents scanned */ - atomic_t s_bal_goals; /* goal hits */ - atomic_t s_bal_breaks; /* too long searches */ - atomic_t s_bal_2orders; /* 2^order hits */ - spinlock_t s_bal_lock; - unsigned long s_mb_buddies_generated; - unsigned long long s_mb_generation_time; - atomic_t s_mb_lost_chunks; - atomic_t s_mb_preallocated; - atomic_t s_mb_discarded; - - /* locality groups */ - struct ext4_locality_group *s_locality_groups; - - /* for write statistics */ - unsigned long s_sectors_written_start; - u64 s_kbytes_written; - - unsigned int s_log_groups_per_flex; - struct flex_groups *s_flex_groups; -}; - -static inline spinlock_t * -sb_bgl_lock(struct ext4_sb_info *sbi, unsigned int block_group) -{ - return bgl_lock_ptr(sbi->s_blockgroup_lock, block_group); -} - -#endif /* _EXT4_SB */ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index e3a55eb8b26a..d4e99e96fddb 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -326,32 +326,18 @@ ext4_ext_max_entries(struct inode *inode, int depth) static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext) { - ext4_fsblk_t block = ext_pblock(ext), valid_block; + ext4_fsblk_t block = ext_pblock(ext); int len = ext4_ext_get_actual_len(ext); - struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; - valid_block = le32_to_cpu(es->s_first_data_block) + - EXT4_SB(inode->i_sb)->s_gdb_count; - if (unlikely(block <= valid_block || - ((block + len) > ext4_blocks_count(es)))) - return 0; - else - return 1; + return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len); } static int ext4_valid_extent_idx(struct inode *inode, struct ext4_extent_idx *ext_idx) { - ext4_fsblk_t block = idx_pblock(ext_idx), valid_block; - struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; + ext4_fsblk_t block = idx_pblock(ext_idx); - valid_block = le32_to_cpu(es->s_first_data_block) + - EXT4_SB(inode->i_sb)->s_gdb_count; - if (unlikely(block <= valid_block || - (block >= ext4_blocks_count(es)))) - return 0; - else - return 1; + return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, 1); } static int ext4_valid_extent_entries(struct inode *inode, @@ -2784,7 +2770,7 @@ fix_extent_len: int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, ext4_lblk_t iblock, unsigned int max_blocks, struct buffer_head *bh_result, - int create, int extend_disksize) + int flags) { struct ext4_ext_path *path = NULL; struct ext4_extent_header *eh; @@ -2803,7 +2789,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, cache_type = ext4_ext_in_cache(inode, iblock, &newex); if (cache_type) { if (cache_type == EXT4_EXT_CACHE_GAP) { - if (!create) { + if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { /* * block isn't allocated yet and * user doesn't want to allocate it @@ -2869,9 +2855,11 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, EXT4_EXT_CACHE_EXTENT); goto out; } - if (create == EXT4_CREATE_UNINITIALIZED_EXT) + if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) goto out; - if (!create) { + if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { + if (allocated > max_blocks) + allocated = max_blocks; /* * We have blocks reserved already. We * return allocated blocks so that delalloc @@ -2879,8 +2867,6 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, * the buffer head will be unmapped so that * a read from the block returns 0s. */ - if (allocated > max_blocks) - allocated = max_blocks; set_buffer_unwritten(bh_result); bh_result->b_bdev = inode->i_sb->s_bdev; bh_result->b_blocknr = newblock; @@ -2903,7 +2889,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, * requested block isn't allocated yet; * we couldn't try to create block if create flag is zero */ - if (!create) { + if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { /* * put just found gap into cache to speed up * subsequent requests @@ -2932,10 +2918,10 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, * EXT_UNINIT_MAX_LEN. */ if (max_blocks > EXT_INIT_MAX_LEN && - create != EXT4_CREATE_UNINITIALIZED_EXT) + !(flags & EXT4_GET_BLOCKS_UNINIT_EXT)) max_blocks = EXT_INIT_MAX_LEN; else if (max_blocks > EXT_UNINIT_MAX_LEN && - create == EXT4_CREATE_UNINITIALIZED_EXT) + (flags & EXT4_GET_BLOCKS_UNINIT_EXT)) max_blocks = EXT_UNINIT_MAX_LEN; /* Check if we can really insert (iblock)::(iblock+max_blocks) extent */ @@ -2966,7 +2952,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, /* try to insert new extent into found leaf and return */ ext4_ext_store_pblock(&newex, newblock); newex.ee_len = cpu_to_le16(ar.len); - if (create == EXT4_CREATE_UNINITIALIZED_EXT) /* Mark uninitialized */ + if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) /* Mark uninitialized */ ext4_ext_mark_uninitialized(&newex); err = ext4_ext_insert_extent(handle, inode, path, &newex); if (err) { @@ -2983,7 +2969,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, newblock = ext_pblock(&newex); allocated = ext4_ext_get_actual_len(&newex); outnew: - if (extend_disksize) { + if (flags & EXT4_GET_BLOCKS_EXTEND_DISKSIZE) { disksize = ((loff_t) iblock + ar.len) << inode->i_blkbits; if (disksize > i_size_read(inode)) disksize = i_size_read(inode); @@ -2994,7 +2980,7 @@ outnew: set_buffer_new(bh_result); /* Cache only when it is _not_ an uninitialized extent */ - if (create != EXT4_CREATE_UNINITIALIZED_EXT) + if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) ext4_ext_put_in_cache(inode, iblock, allocated, newblock, EXT4_EXT_CACHE_EXTENT); out: @@ -3150,9 +3136,10 @@ retry: ret = PTR_ERR(handle); break; } - ret = ext4_get_blocks_wrap(handle, inode, block, - max_blocks, &map_bh, - EXT4_CREATE_UNINITIALIZED_EXT, 0, 0); + map_bh.b_state = 0; + ret = ext4_get_blocks(handle, inode, block, + max_blocks, &map_bh, + EXT4_GET_BLOCKS_CREATE_UNINIT_EXT); if (ret <= 0) { #ifdef EXT4FS_DEBUG WARN_ON(ret <= 0); @@ -3195,7 +3182,7 @@ static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path, void *data) { struct fiemap_extent_info *fieinfo = data; - unsigned long blksize_bits = inode->i_sb->s_blocksize_bits; + unsigned char blksize_bits = inode->i_sb->s_blocksize_bits; __u64 logical; __u64 physical; __u64 length; @@ -3242,9 +3229,16 @@ static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path, * * XXX this might miss a single-block extent at EXT_MAX_BLOCK */ - if (logical + length - 1 == EXT_MAX_BLOCK || - ext4_ext_next_allocated_block(path) == EXT_MAX_BLOCK) + if (ext4_ext_next_allocated_block(path) == EXT_MAX_BLOCK || + newex->ec_block + newex->ec_len - 1 == EXT_MAX_BLOCK) { + loff_t size = i_size_read(inode); + loff_t bs = EXT4_BLOCK_SIZE(inode->i_sb); + flags |= FIEMAP_EXTENT_LAST; + if ((flags & FIEMAP_EXTENT_DELALLOC) && + logical+length > size) + length = (size - logical + bs - 1) & ~(bs-1); + } error = fiemap_fill_next_extent(fieinfo, logical, physical, length, flags); @@ -3318,10 +3312,10 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, * Walk the extent tree gathering extent information. * ext4_ext_fiemap_cb will push extents back to user. */ - down_write(&EXT4_I(inode)->i_data_sem); + down_read(&EXT4_I(inode)->i_data_sem); error = ext4_ext_walk_space(inode, start_blk, len_blks, ext4_ext_fiemap_cb, fieinfo); - up_write(&EXT4_I(inode)->i_data_sem); + up_read(&EXT4_I(inode)->i_data_sem); } return error; diff --git a/fs/ext4/group.h b/fs/ext4/group.h deleted file mode 100644 index c2c0a8d06d0e..000000000000 --- a/fs/ext4/group.h +++ /dev/null @@ -1,29 +0,0 @@ -/* - * linux/fs/ext4/group.h - * - * Copyright (C) 2007 Cluster File Systems, Inc - * - * Author: Andreas Dilger <adilger@clusterfs.com> - */ - -#ifndef _LINUX_EXT4_GROUP_H -#define _LINUX_EXT4_GROUP_H - -extern __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 group, - struct ext4_group_desc *gdp); -extern int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 group, - struct ext4_group_desc *gdp); -struct buffer_head *ext4_read_block_bitmap(struct super_block *sb, - ext4_group_t block_group); -extern unsigned ext4_init_block_bitmap(struct super_block *sb, - struct buffer_head *bh, - ext4_group_t group, - struct ext4_group_desc *desc); -#define ext4_free_blocks_after_init(sb, group, desc) \ - ext4_init_block_bitmap(sb, NULL, group, desc) -extern unsigned ext4_init_inode_bitmap(struct super_block *sb, - struct buffer_head *bh, - ext4_group_t group, - struct ext4_group_desc *desc); -extern void mark_bitmap_end(int start_bit, int end_bit, char *bitmap); -#endif /* _LINUX_EXT4_GROUP_H */ diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index f18e0a08a6b5..82f7d1d7eae0 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -27,7 +27,6 @@ #include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" -#include "group.h" /* * ialloc.c contains the inodes allocation and deallocation routines @@ -123,16 +122,16 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) unlock_buffer(bh); return bh; } - spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group)); + ext4_lock_group(sb, block_group); if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { ext4_init_inode_bitmap(sb, bh, block_group, desc); set_bitmap_uptodate(bh); set_buffer_uptodate(bh); - spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); + ext4_unlock_group(sb, block_group); unlock_buffer(bh); return bh; } - spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); + ext4_unlock_group(sb, block_group); if (buffer_uptodate(bh)) { /* * if not uninit if bh is uptodate, @@ -247,9 +246,8 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) goto error_return; /* Ok, now we can actually update the inode bitmaps.. */ - spin_lock(sb_bgl_lock(sbi, block_group)); - cleared = ext4_clear_bit(bit, bitmap_bh->b_data); - spin_unlock(sb_bgl_lock(sbi, block_group)); + cleared = ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group), + bit, bitmap_bh->b_data); if (!cleared) ext4_error(sb, "ext4_free_inode", "bit already cleared for inode %lu", ino); @@ -261,7 +259,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) if (fatal) goto error_return; if (gdp) { - spin_lock(sb_bgl_lock(sbi, block_group)); + ext4_lock_group(sb, block_group); count = ext4_free_inodes_count(sb, gdp) + 1; ext4_free_inodes_set(sb, gdp, count); if (is_directory) { @@ -277,7 +275,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) } gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp); - spin_unlock(sb_bgl_lock(sbi, block_group)); + ext4_unlock_group(sb, block_group); percpu_counter_inc(&sbi->s_freeinodes_counter); if (is_directory) percpu_counter_dec(&sbi->s_dirs_counter); @@ -316,7 +314,7 @@ error_return: static int find_group_dir(struct super_block *sb, struct inode *parent, ext4_group_t *best_group) { - ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; + ext4_group_t ngroups = ext4_get_groups_count(sb); unsigned int freei, avefreei; struct ext4_group_desc *desc, *best_desc = NULL; ext4_group_t group; @@ -353,7 +351,7 @@ static int find_group_flex(struct super_block *sb, struct inode *parent, struct flex_groups *flex_group = sbi->s_flex_groups; ext4_group_t parent_group = EXT4_I(parent)->i_block_group; ext4_group_t parent_fbg_group = ext4_flex_group(sbi, parent_group); - ext4_group_t ngroups = sbi->s_groups_count; + ext4_group_t ngroups = ext4_get_groups_count(sb); int flex_size = ext4_flex_bg_size(sbi); ext4_group_t best_flex = parent_fbg_group; int blocks_per_flex = sbi->s_blocks_per_group * flex_size; @@ -362,7 +360,7 @@ static int find_group_flex(struct super_block *sb, struct inode *parent, ext4_group_t n_fbg_groups; ext4_group_t i; - n_fbg_groups = (sbi->s_groups_count + flex_size - 1) >> + n_fbg_groups = (ngroups + flex_size - 1) >> sbi->s_log_groups_per_flex; find_close_to_parent: @@ -478,20 +476,21 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, { ext4_group_t parent_group = EXT4_I(parent)->i_block_group; struct ext4_sb_info *sbi = EXT4_SB(sb); - ext4_group_t ngroups = sbi->s_groups_count; + ext4_group_t real_ngroups = ext4_get_groups_count(sb); int inodes_per_group = EXT4_INODES_PER_GROUP(sb); unsigned int freei, avefreei; ext4_fsblk_t freeb, avefreeb; unsigned int ndirs; int max_dirs, min_inodes; ext4_grpblk_t min_blocks; - ext4_group_t i, grp, g; + ext4_group_t i, grp, g, ngroups; struct ext4_group_desc *desc; struct orlov_stats stats; int flex_size = ext4_flex_bg_size(sbi); + ngroups = real_ngroups; if (flex_size > 1) { - ngroups = (ngroups + flex_size - 1) >> + ngroups = (real_ngroups + flex_size - 1) >> sbi->s_log_groups_per_flex; parent_group >>= sbi->s_log_groups_per_flex; } @@ -543,7 +542,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, */ grp *= flex_size; for (i = 0; i < flex_size; i++) { - if (grp+i >= sbi->s_groups_count) + if (grp+i >= real_ngroups) break; desc = ext4_get_group_desc(sb, grp+i, NULL); if (desc && ext4_free_inodes_count(sb, desc)) { @@ -583,7 +582,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, } fallback: - ngroups = sbi->s_groups_count; + ngroups = real_ngroups; avefreei = freei / ngroups; fallback_retry: parent_group = EXT4_I(parent)->i_block_group; @@ -613,9 +612,8 @@ static int find_group_other(struct super_block *sb, struct inode *parent, ext4_group_t *group, int mode) { ext4_group_t parent_group = EXT4_I(parent)->i_block_group; - ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; + ext4_group_t i, last, ngroups = ext4_get_groups_count(sb); struct ext4_group_desc *desc; - ext4_group_t i, last; int flex_size = ext4_flex_bg_size(EXT4_SB(sb)); /* @@ -708,10 +706,10 @@ static int find_group_other(struct super_block *sb, struct inode *parent, /* * claim the inode from the inode bitmap. If the group - * is uninit we need to take the groups's sb_bgl_lock + * is uninit we need to take the groups's ext4_group_lock * and clear the uninit flag. The inode bitmap update * and group desc uninit flag clear should be done - * after holding sb_bgl_lock so that ext4_read_inode_bitmap + * after holding ext4_group_lock so that ext4_read_inode_bitmap * doesn't race with the ext4_claim_inode */ static int ext4_claim_inode(struct super_block *sb, @@ -722,7 +720,7 @@ static int ext4_claim_inode(struct super_block *sb, struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL); - spin_lock(sb_bgl_lock(sbi, group)); + ext4_lock_group(sb, group); if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) { /* not a free inode */ retval = 1; @@ -731,7 +729,7 @@ static int ext4_claim_inode(struct super_block *sb, ino++; if ((group == 0 && ino < EXT4_FIRST_INO(sb)) || ino > EXT4_INODES_PER_GROUP(sb)) { - spin_unlock(sb_bgl_lock(sbi, group)); + ext4_unlock_group(sb, group); ext4_error(sb, __func__, "reserved inode or inode > inodes count - " "block_group = %u, inode=%lu", group, @@ -780,7 +778,7 @@ static int ext4_claim_inode(struct super_block *sb, } gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); err_ret: - spin_unlock(sb_bgl_lock(sbi, group)); + ext4_unlock_group(sb, group); return retval; } @@ -799,11 +797,10 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode) struct super_block *sb; struct buffer_head *inode_bitmap_bh = NULL; struct buffer_head *group_desc_bh; - ext4_group_t group = 0; + ext4_group_t ngroups, group = 0; unsigned long ino = 0; struct inode *inode; struct ext4_group_desc *gdp = NULL; - struct ext4_super_block *es; struct ext4_inode_info *ei; struct ext4_sb_info *sbi; int ret2, err = 0; @@ -818,15 +815,14 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode) return ERR_PTR(-EPERM); sb = dir->i_sb; + ngroups = ext4_get_groups_count(sb); trace_mark(ext4_request_inode, "dev %s dir %lu mode %d", sb->s_id, dir->i_ino, mode); inode = new_inode(sb); if (!inode) return ERR_PTR(-ENOMEM); ei = EXT4_I(inode); - sbi = EXT4_SB(sb); - es = sbi->s_es; if (sbi->s_log_groups_per_flex && test_opt(sb, OLDALLOC)) { ret2 = find_group_flex(sb, dir, &group); @@ -856,7 +852,7 @@ got_group: if (ret2 == -1) goto out; - for (i = 0; i < sbi->s_groups_count; i++) { + for (i = 0; i < ngroups; i++) { err = -EIO; gdp = ext4_get_group_desc(sb, group, &group_desc_bh); @@ -917,7 +913,7 @@ repeat_in_this_group: * group descriptor metadata has not yet been updated. * So we just go onto the next blockgroup. */ - if (++group == sbi->s_groups_count) + if (++group == ngroups) group = 0; } err = -ENOSPC; @@ -938,7 +934,7 @@ got: } free = 0; - spin_lock(sb_bgl_lock(sbi, group)); + ext4_lock_group(sb, group); /* recheck and clear flag under lock if we still need to */ if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { free = ext4_free_blocks_after_init(sb, group, gdp); @@ -947,7 +943,7 @@ got: gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); } - spin_unlock(sb_bgl_lock(sbi, group)); + ext4_unlock_group(sb, group); /* Don't need to dirty bitmap block if we didn't change it */ if (free) { @@ -1158,7 +1154,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb) { unsigned long desc_count; struct ext4_group_desc *gdp; - ext4_group_t i; + ext4_group_t i, ngroups = ext4_get_groups_count(sb); #ifdef EXT4FS_DEBUG struct ext4_super_block *es; unsigned long bitmap_count, x; @@ -1168,7 +1164,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb) desc_count = 0; bitmap_count = 0; gdp = NULL; - for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) { + for (i = 0; i < ngroups; i++) { gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) continue; @@ -1190,7 +1186,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb) return desc_count; #else desc_count = 0; - for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) { + for (i = 0; i < ngroups; i++) { gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) continue; @@ -1205,9 +1201,9 @@ unsigned long ext4_count_free_inodes(struct super_block *sb) unsigned long ext4_count_dirs(struct super_block * sb) { unsigned long count = 0; - ext4_group_t i; + ext4_group_t i, ngroups = ext4_get_groups_count(sb); - for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) { + for (i = 0; i < ngroups; i++) { struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) continue; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 2a9ffd528dd1..dadd3f995db5 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -372,20 +372,21 @@ static int ext4_block_to_path(struct inode *inode, } static int __ext4_check_blockref(const char *function, struct inode *inode, - __le32 *p, unsigned int max) { - - unsigned int maxblocks = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es); + __le32 *p, unsigned int max) +{ __le32 *bref = p; + unsigned int blk; + while (bref < p+max) { - if (unlikely(le32_to_cpu(*bref) >= maxblocks)) { + blk = le32_to_cpu(*bref++); + if (blk && + unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb), + blk, 1))) { ext4_error(inode->i_sb, function, - "block reference %u >= max (%u) " - "in inode #%lu, offset=%d", - le32_to_cpu(*bref), maxblocks, - inode->i_ino, (int)(bref-p)); + "invalid block reference %u " + "in inode #%lu", blk, inode->i_ino); return -EIO; } - bref++; } return 0; } @@ -892,6 +893,10 @@ err_out: } /* + * The ext4_ind_get_blocks() function handles non-extents inodes + * (i.e., using the traditional indirect/double-indirect i_blocks + * scheme) for ext4_get_blocks(). + * * Allocation strategy is simple: if we have to allocate something, we will * have to go the whole way to leaf. So let's do it before attaching anything * to tree, set linkage between the newborn blocks, write them if sync is @@ -909,15 +914,16 @@ err_out: * return = 0, if plain lookup failed. * return < 0, error case. * - * - * Need to be called with - * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block - * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem) + * The ext4_ind_get_blocks() function should be called with + * down_write(&EXT4_I(inode)->i_data_sem) if allocating filesystem + * blocks (i.e., flags has EXT4_GET_BLOCKS_CREATE set) or + * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system + * blocks. */ -static int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, +static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode, ext4_lblk_t iblock, unsigned int maxblocks, struct buffer_head *bh_result, - int create, int extend_disksize) + int flags) { int err = -EIO; ext4_lblk_t offsets[4]; @@ -934,7 +940,7 @@ static int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)); - J_ASSERT(handle != NULL || create == 0); + J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0); depth = ext4_block_to_path(inode, iblock, offsets, &blocks_to_boundary); @@ -963,7 +969,7 @@ static int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, } /* Next simple case - plain lookup or failed read of indirect block */ - if (!create || err == -EIO) + if ((flags & EXT4_GET_BLOCKS_CREATE) == 0 || err == -EIO) goto cleanup; /* @@ -1002,7 +1008,7 @@ static int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, * protect it if you're about to implement concurrent * ext4_get_block() -bzzz */ - if (!err && extend_disksize) { + if (!err && (flags & EXT4_GET_BLOCKS_EXTEND_DISKSIZE)) { disksize = ((loff_t) iblock + count) << inode->i_blkbits; if (disksize > i_size_read(inode)) disksize = i_size_read(inode); @@ -1120,8 +1126,23 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used) ext4_discard_preallocations(inode); } +static int check_block_validity(struct inode *inode, sector_t logical, + sector_t phys, int len) +{ + if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), phys, len)) { + ext4_error(inode->i_sb, "check_block_validity", + "inode #%lu logical block %llu mapped to %llu " + "(size %d)", inode->i_ino, + (unsigned long long) logical, + (unsigned long long) phys, len); + WARN_ON(1); + return -EIO; + } + return 0; +} + /* - * The ext4_get_blocks_wrap() function try to look up the requested blocks, + * The ext4_get_blocks() function tries to look up the requested blocks, * and returns if the blocks are already mapped. * * Otherwise it takes the write lock of the i_data_sem and allocate blocks @@ -1129,7 +1150,7 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used) * mapped. * * If file type is extents based, it will call ext4_ext_get_blocks(), - * Otherwise, call with ext4_get_blocks_handle() to handle indirect mapping + * Otherwise, call with ext4_ind_get_blocks() to handle indirect mapping * based files * * On success, it returns the number of blocks being mapped or allocate. @@ -1142,9 +1163,9 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used) * * It returns the error in case of allocation failure. */ -int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, - unsigned int max_blocks, struct buffer_head *bh, - int create, int extend_disksize, int flag) +int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, + unsigned int max_blocks, struct buffer_head *bh, + int flags) { int retval; @@ -1152,21 +1173,28 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, clear_buffer_unwritten(bh); /* - * Try to see if we can get the block without requesting - * for new file system block. + * Try to see if we can get the block without requesting a new + * file system block. */ down_read((&EXT4_I(inode)->i_data_sem)); if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { retval = ext4_ext_get_blocks(handle, inode, block, max_blocks, - bh, 0, 0); + bh, 0); } else { - retval = ext4_get_blocks_handle(handle, - inode, block, max_blocks, bh, 0, 0); + retval = ext4_ind_get_blocks(handle, inode, block, max_blocks, + bh, 0); } up_read((&EXT4_I(inode)->i_data_sem)); + if (retval > 0 && buffer_mapped(bh)) { + int ret = check_block_validity(inode, block, + bh->b_blocknr, retval); + if (ret != 0) + return ret; + } + /* If it is only a block(s) look up */ - if (!create) + if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) return retval; /* @@ -1205,7 +1233,7 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, * let the underlying get_block() function know to * avoid double accounting */ - if (flag) + if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) EXT4_I(inode)->i_delalloc_reserved_flag = 1; /* * We need to check for EXT4 here because migrate @@ -1213,10 +1241,10 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, */ if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { retval = ext4_ext_get_blocks(handle, inode, block, max_blocks, - bh, create, extend_disksize); + bh, flags); } else { - retval = ext4_get_blocks_handle(handle, inode, block, - max_blocks, bh, create, extend_disksize); + retval = ext4_ind_get_blocks(handle, inode, block, + max_blocks, bh, flags); if (retval > 0 && buffer_new(bh)) { /* @@ -1229,18 +1257,23 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, } } - if (flag) { + if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) EXT4_I(inode)->i_delalloc_reserved_flag = 0; - /* - * Update reserved blocks/metadata blocks - * after successful block allocation - * which were deferred till now - */ - if ((retval > 0) && buffer_delay(bh)) - ext4_da_update_reserve_space(inode, retval); - } + + /* + * Update reserved blocks/metadata blocks after successful + * block allocation which had been deferred till now. + */ + if ((retval > 0) && (flags & EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE)) + ext4_da_update_reserve_space(inode, retval); up_write((&EXT4_I(inode)->i_data_sem)); + if (retval > 0 && buffer_mapped(bh)) { + int ret = check_block_validity(inode, block, + bh->b_blocknr, retval); + if (ret != 0) + return ret; + } return retval; } @@ -1268,8 +1301,8 @@ int ext4_get_block(struct inode *inode, sector_t iblock, started = 1; } - ret = ext4_get_blocks_wrap(handle, inode, iblock, - max_blocks, bh_result, create, 0, 0); + ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result, + create ? EXT4_GET_BLOCKS_CREATE : 0); if (ret > 0) { bh_result->b_size = (ret << inode->i_blkbits); ret = 0; @@ -1288,17 +1321,19 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, { struct buffer_head dummy; int fatal = 0, err; + int flags = EXT4_GET_BLOCKS_EXTEND_DISKSIZE; J_ASSERT(handle != NULL || create == 0); dummy.b_state = 0; dummy.b_blocknr = -1000; buffer_trace_init(&dummy.b_history); - err = ext4_get_blocks_wrap(handle, inode, block, 1, - &dummy, create, 1, 0); + if (create) + flags |= EXT4_GET_BLOCKS_CREATE; + err = ext4_get_blocks(handle, inode, block, 1, &dummy, flags); /* - * ext4_get_blocks_handle() returns number of blocks - * mapped. 0 in case of a HOLE. + * ext4_get_blocks() returns number of blocks mapped. 0 in + * case of a HOLE. */ if (err > 0) { if (err > 1) @@ -1852,7 +1887,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd) * @logical - first logical block to start assignment with * * the function goes through all passed space and put actual disk - * block numbers into buffer heads, dropping BH_Delay + * block numbers into buffer heads, dropping BH_Delay and BH_Unwritten */ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical, struct buffer_head *exbh) @@ -1902,16 +1937,24 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical, do { if (cur_logical >= logical + blocks) break; - if (buffer_delay(bh)) { - bh->b_blocknr = pblock; - clear_buffer_delay(bh); - bh->b_bdev = inode->i_sb->s_bdev; - } else if (buffer_unwritten(bh)) { - bh->b_blocknr = pblock; - clear_buffer_unwritten(bh); - set_buffer_mapped(bh); - set_buffer_new(bh); - bh->b_bdev = inode->i_sb->s_bdev; + + if (buffer_delay(bh) || + buffer_unwritten(bh)) { + + BUG_ON(bh->b_bdev != inode->i_sb->s_bdev); + + if (buffer_delay(bh)) { + clear_buffer_delay(bh); + bh->b_blocknr = pblock; + } else { + /* + * unwritten already should have + * blocknr assigned. Verify that + */ + clear_buffer_unwritten(bh); + BUG_ON(bh->b_blocknr != pblock); + } + } else if (buffer_mapped(bh)) BUG_ON(bh->b_blocknr != pblock); @@ -1990,51 +2033,6 @@ static void ext4_print_free_blocks(struct inode *inode) return; } -#define EXT4_DELALLOC_RSVED 1 -static int ext4_da_get_block_write(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) -{ - int ret; - unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; - loff_t disksize = EXT4_I(inode)->i_disksize; - handle_t *handle = NULL; - - handle = ext4_journal_current_handle(); - BUG_ON(!handle); - ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks, - bh_result, create, 0, EXT4_DELALLOC_RSVED); - if (ret <= 0) - return ret; - - bh_result->b_size = (ret << inode->i_blkbits); - - if (ext4_should_order_data(inode)) { - int retval; - retval = ext4_jbd2_file_inode(handle, inode); - if (retval) - /* - * Failed to add inode for ordered mode. Don't - * update file size - */ - return retval; - } - - /* - * Update on-disk size along with block allocation we don't - * use 'extend_disksize' as size may change within already - * allocated block -bzzz - */ - disksize = ((loff_t) iblock + ret) << inode->i_blkbits; - if (disksize > i_size_read(inode)) - disksize = i_size_read(inode); - if (disksize > EXT4_I(inode)->i_disksize) { - ext4_update_i_disksize(inode, disksize); - ret = ext4_mark_inode_dirty(handle, inode); - return ret; - } - return 0; -} - /* * mpage_da_map_blocks - go through given space * @@ -2045,29 +2043,57 @@ static int ext4_da_get_block_write(struct inode *inode, sector_t iblock, */ static int mpage_da_map_blocks(struct mpage_da_data *mpd) { - int err = 0; + int err, blks, get_blocks_flags; struct buffer_head new; - sector_t next; + sector_t next = mpd->b_blocknr; + unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits; + loff_t disksize = EXT4_I(mpd->inode)->i_disksize; + handle_t *handle = NULL; /* * We consider only non-mapped and non-allocated blocks */ if ((mpd->b_state & (1 << BH_Mapped)) && - !(mpd->b_state & (1 << BH_Delay))) + !(mpd->b_state & (1 << BH_Delay)) && + !(mpd->b_state & (1 << BH_Unwritten))) return 0; - new.b_state = mpd->b_state; - new.b_blocknr = 0; - new.b_size = mpd->b_size; - next = mpd->b_blocknr; + /* - * If we didn't accumulate anything - * to write simply return + * If we didn't accumulate anything to write simply return */ - if (!new.b_size) + if (!mpd->b_size) return 0; - err = ext4_da_get_block_write(mpd->inode, next, &new, 1); - if (err) { + handle = ext4_journal_current_handle(); + BUG_ON(!handle); + + /* + * Call ext4_get_blocks() to allocate any delayed allocation + * blocks, or to convert an uninitialized extent to be + * initialized (in the case where we have written into + * one or more preallocated blocks). + * + * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE to + * indicate that we are on the delayed allocation path. This + * affects functions in many different parts of the allocation + * call path. This flag exists primarily because we don't + * want to change *many* call functions, so ext4_get_blocks() + * will set the magic i_delalloc_reserved_flag once the + * inode's allocation semaphore is taken. + * + * If the blocks in questions were delalloc blocks, set + * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting + * variables are updated after the blocks have been allocated. + */ + new.b_state = 0; + get_blocks_flags = (EXT4_GET_BLOCKS_CREATE | + EXT4_GET_BLOCKS_DELALLOC_RESERVE); + if (mpd->b_state & (1 << BH_Delay)) + get_blocks_flags |= EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE; + blks = ext4_get_blocks(handle, mpd->inode, next, max_blocks, + &new, get_blocks_flags); + if (blks < 0) { + err = blks; /* * If get block returns with error we simply * return. Later writepage will redirty the page and @@ -2100,12 +2126,14 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) if (err == -ENOSPC) { ext4_print_free_blocks(mpd->inode); } - /* invlaidate all the pages */ + /* invalidate all the pages */ ext4_da_block_invalidatepages(mpd, next, mpd->b_size >> mpd->inode->i_blkbits); return err; } - BUG_ON(new.b_size == 0); + BUG_ON(blks == 0); + + new.b_size = (blks << mpd->inode->i_blkbits); if (buffer_new(&new)) __unmap_underlying_blocks(mpd->inode, &new); @@ -2118,6 +2146,25 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) (mpd->b_state & (1 << BH_Unwritten))) mpage_put_bnr_to_bhs(mpd, next, &new); + if (ext4_should_order_data(mpd->inode)) { + err = ext4_jbd2_file_inode(handle, mpd->inode); + if (err) + return err; + } + + /* + * Update on-disk size along with block allocation we don't + * use EXT4_GET_BLOCKS_EXTEND_DISKSIZE as size may change + * within already allocated block -bzzz + */ + disksize = ((loff_t) next + blks) << mpd->inode->i_blkbits; + if (disksize > i_size_read(mpd->inode)) + disksize = i_size_read(mpd->inode); + if (disksize > EXT4_I(mpd->inode)->i_disksize) { + ext4_update_i_disksize(mpd->inode, disksize); + return ext4_mark_inode_dirty(handle, mpd->inode); + } + return 0; } @@ -2192,6 +2239,17 @@ flush_it: return; } +static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh) +{ + /* + * unmapped buffer is possible for holes. + * delay buffer is possible with delayed allocation. + * We also need to consider unwritten buffer as unmapped. + */ + return (!buffer_mapped(bh) || buffer_delay(bh) || + buffer_unwritten(bh)) && buffer_dirty(bh); +} + /* * __mpage_da_writepage - finds extent of pages and blocks * @@ -2276,8 +2334,7 @@ static int __mpage_da_writepage(struct page *page, * Otherwise we won't make progress * with the page in ext4_da_writepage */ - if (buffer_dirty(bh) && - (!buffer_mapped(bh) || buffer_delay(bh))) { + if (ext4_bh_unmapped_or_delay(NULL, bh)) { mpage_add_bh_to_extent(mpd, logical, bh->b_size, bh->b_state); @@ -2303,8 +2360,16 @@ static int __mpage_da_writepage(struct page *page, } /* - * this is a special callback for ->write_begin() only - * it's intention is to return mapped block or reserve space + * This is a special get_blocks_t callback which is used by + * ext4_da_write_begin(). It will either return mapped block or + * reserve space for a single block. + * + * For delayed buffer_head we have BH_Mapped, BH_New, BH_Delay set. + * We also have b_blocknr = -1 and b_bdev initialized properly + * + * For unwritten buffer_head we have BH_Mapped, BH_New, BH_Unwritten set. + * We also have b_blocknr = physicalblock mapping unwritten extent and b_bdev + * initialized properly. */ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) @@ -2323,7 +2388,7 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, * preallocated blocks are unmapped but should treated * the same as allocated blocks. */ - ret = ext4_get_blocks_wrap(NULL, inode, iblock, 1, bh_result, 0, 0, 0); + ret = ext4_get_blocks(NULL, inode, iblock, 1, bh_result, 0); if ((ret == 0) && !buffer_delay(bh_result)) { /* the block isn't (pre)allocated yet, let's reserve space */ /* @@ -2340,40 +2405,53 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, set_buffer_delay(bh_result); } else if (ret > 0) { bh_result->b_size = (ret << inode->i_blkbits); - /* - * With sub-block writes into unwritten extents - * we also need to mark the buffer as new so that - * the unwritten parts of the buffer gets correctly zeroed. - */ - if (buffer_unwritten(bh_result)) + if (buffer_unwritten(bh_result)) { + /* A delayed write to unwritten bh should + * be marked new and mapped. Mapped ensures + * that we don't do get_block multiple times + * when we write to the same offset and new + * ensures that we do proper zero out for + * partial write. + */ set_buffer_new(bh_result); + set_buffer_mapped(bh_result); + } ret = 0; } return ret; } -static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh) -{ - /* - * unmapped buffer is possible for holes. - * delay buffer is possible with delayed allocation - */ - return ((!buffer_mapped(bh) || buffer_delay(bh)) && buffer_dirty(bh)); -} - -static int ext4_normal_get_block_write(struct inode *inode, sector_t iblock, +/* + * This function is used as a standard get_block_t calback function + * when there is no desire to allocate any blocks. It is used as a + * callback function for block_prepare_write(), nobh_writepage(), and + * block_write_full_page(). These functions should only try to map a + * single block at a time. + * + * Since this function doesn't do block allocations even if the caller + * requests it by passing in create=1, it is critically important that + * any caller checks to make sure that any buffer heads are returned + * by this function are either all already mapped or marked for + * delayed allocation before calling nobh_writepage() or + * block_write_full_page(). Otherwise, b_blocknr could be left + * unitialized, and the page write functions will be taken by + * surprise. + */ +static int noalloc_get_block_write(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { int ret = 0; unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; + BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize); + /* * we don't want to do block allocation in writepage * so call get_block_wrap with create = 0 */ - ret = ext4_get_blocks_wrap(NULL, inode, iblock, max_blocks, - bh_result, 0, 0, 0); + ret = ext4_get_blocks(NULL, inode, iblock, max_blocks, bh_result, 0); + BUG_ON(create && ret == 0); if (ret > 0) { bh_result->b_size = (ret << inode->i_blkbits); ret = 0; @@ -2382,10 +2460,11 @@ static int ext4_normal_get_block_write(struct inode *inode, sector_t iblock, } /* - * get called vi ext4_da_writepages after taking page lock (have journal handle) - * get called via journal_submit_inode_data_buffers (no journal handle) - * get called via shrink_page_list via pdflush (no journal handle) - * or grab_page_cache when doing write_begin (have journal handle) + * This function can get called via... + * - ext4_da_writepages after taking page lock (have journal handle) + * - journal_submit_inode_data_buffers (no journal handle) + * - shrink_page_list via pdflush (no journal handle) + * - grab_page_cache when doing write_begin (have journal handle) */ static int ext4_da_writepage(struct page *page, struct writeback_control *wbc) @@ -2436,7 +2515,7 @@ static int ext4_da_writepage(struct page *page, * do block allocation here. */ ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, - ext4_normal_get_block_write); + noalloc_get_block_write); if (!ret) { page_bufs = page_buffers(page); /* check whether all are mapped and non delay */ @@ -2461,11 +2540,10 @@ static int ext4_da_writepage(struct page *page, } if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode)) - ret = nobh_writepage(page, ext4_normal_get_block_write, wbc); + ret = nobh_writepage(page, noalloc_get_block_write, wbc); else - ret = block_write_full_page(page, - ext4_normal_get_block_write, - wbc); + ret = block_write_full_page(page, noalloc_get_block_write, + wbc); return ret; } @@ -2777,7 +2855,7 @@ retry: *pagep = page; ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, - ext4_da_get_block_prep); + ext4_da_get_block_prep); if (ret < 0) { unlock_page(page); ext4_journal_stop(handle); @@ -2815,7 +2893,7 @@ static int ext4_da_should_update_i_disksize(struct page *page, for (i = 0; i < idx; i++) bh = bh->b_this_page; - if (!buffer_mapped(bh) || (buffer_delay(bh))) + if (!buffer_mapped(bh) || (buffer_delay(bh)) || buffer_unwritten(bh)) return 0; return 1; } @@ -3085,12 +3163,10 @@ static int __ext4_normal_writepage(struct page *page, struct inode *inode = page->mapping->host; if (test_opt(inode->i_sb, NOBH)) - return nobh_writepage(page, - ext4_normal_get_block_write, wbc); + return nobh_writepage(page, noalloc_get_block_write, wbc); else - return block_write_full_page(page, - ext4_normal_get_block_write, - wbc); + return block_write_full_page(page, noalloc_get_block_write, + wbc); } static int ext4_normal_writepage(struct page *page, @@ -3142,7 +3218,7 @@ static int __ext4_journalled_writepage(struct page *page, int err; ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, - ext4_normal_get_block_write); + noalloc_get_block_write); if (ret != 0) goto out_unlock; @@ -3227,9 +3303,8 @@ static int ext4_journalled_writepage(struct page *page, * really know unless we go poke around in the buffer_heads. * But block_write_full_page will do the right thing. */ - return block_write_full_page(page, - ext4_normal_get_block_write, - wbc); + return block_write_full_page(page, noalloc_get_block_write, + wbc); } no_write: redirty_page_for_writepage(wbc, page); @@ -4930,7 +5005,8 @@ static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) */ int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk) { - int groups, gdpblocks; + ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb); + int gdpblocks; int idxblocks; int ret = 0; @@ -4957,8 +5033,8 @@ int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk) groups += nrblocks; gdpblocks = groups; - if (groups > EXT4_SB(inode->i_sb)->s_groups_count) - groups = EXT4_SB(inode->i_sb)->s_groups_count; + if (groups > ngroups) + groups = ngroups; if (groups > EXT4_SB(inode->i_sb)->s_gdb_count) gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count; @@ -4998,7 +5074,7 @@ int ext4_writepage_trans_blocks(struct inode *inode) * Calculate the journal credits for a chunk of data modification. * * This is called from DIO, fallocate or whoever calling - * ext4_get_blocks_wrap() to map/allocate a chunk of contigous disk blocks. + * ext4_get_blocks() to map/allocate a chunk of contigous disk blocks. * * journal buffers for data blocks are not included here, as DIO * and fallocate do no need to journal data buffers. diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index f871677a7984..ed8482e22c0e 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -372,24 +372,12 @@ static inline void mb_set_bit(int bit, void *addr) ext4_set_bit(bit, addr); } -static inline void mb_set_bit_atomic(spinlock_t *lock, int bit, void *addr) -{ - addr = mb_correct_addr_and_bit(&bit, addr); - ext4_set_bit_atomic(lock, bit, addr); -} - static inline void mb_clear_bit(int bit, void *addr) { addr = mb_correct_addr_and_bit(&bit, addr); ext4_clear_bit(bit, addr); } -static inline void mb_clear_bit_atomic(spinlock_t *lock, int bit, void *addr) -{ - addr = mb_correct_addr_and_bit(&bit, addr); - ext4_clear_bit_atomic(lock, bit, addr); -} - static inline int mb_find_next_zero_bit(void *addr, int max, int start) { int fix = 0, ret, tmpmax; @@ -448,7 +436,7 @@ static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b, if (unlikely(e4b->bd_info->bb_bitmap == NULL)) return; - BUG_ON(!ext4_is_group_locked(sb, e4b->bd_group)); + assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group)); for (i = 0; i < count; i++) { if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) { ext4_fsblk_t blocknr; @@ -472,7 +460,7 @@ static void mb_mark_used_double(struct ext4_buddy *e4b, int first, int count) if (unlikely(e4b->bd_info->bb_bitmap == NULL)) return; - BUG_ON(!ext4_is_group_locked(e4b->bd_sb, e4b->bd_group)); + assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); for (i = 0; i < count; i++) { BUG_ON(mb_test_bit(first + i, e4b->bd_info->bb_bitmap)); mb_set_bit(first + i, e4b->bd_info->bb_bitmap); @@ -739,6 +727,7 @@ static void ext4_mb_generate_buddy(struct super_block *sb, static int ext4_mb_init_cache(struct page *page, char *incore) { + ext4_group_t ngroups; int blocksize; int blocks_per_page; int groups_per_page; @@ -757,6 +746,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore) inode = page->mapping->host; sb = inode->i_sb; + ngroups = ext4_get_groups_count(sb); blocksize = 1 << inode->i_blkbits; blocks_per_page = PAGE_CACHE_SIZE / blocksize; @@ -780,7 +770,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore) for (i = 0; i < groups_per_page; i++) { struct ext4_group_desc *desc; - if (first_group + i >= EXT4_SB(sb)->s_groups_count) + if (first_group + i >= ngroups) break; err = -EIO; @@ -801,17 +791,17 @@ static int ext4_mb_init_cache(struct page *page, char *incore) unlock_buffer(bh[i]); continue; } - spin_lock(sb_bgl_lock(EXT4_SB(sb), first_group + i)); + ext4_lock_group(sb, first_group + i); if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { ext4_init_block_bitmap(sb, bh[i], first_group + i, desc); set_bitmap_uptodate(bh[i]); set_buffer_uptodate(bh[i]); - spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i)); + ext4_unlock_group(sb, first_group + i); unlock_buffer(bh[i]); continue; } - spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i)); + ext4_unlock_group(sb, first_group + i); if (buffer_uptodate(bh[i])) { /* * if not uninit if bh is uptodate, @@ -852,7 +842,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore) struct ext4_group_info *grinfo; group = (first_block + i) >> 1; - if (group >= EXT4_SB(sb)->s_groups_count) + if (group >= ngroups) break; /* @@ -1078,7 +1068,7 @@ static int mb_find_order_for_block(struct ext4_buddy *e4b, int block) return 0; } -static void mb_clear_bits(spinlock_t *lock, void *bm, int cur, int len) +static void mb_clear_bits(void *bm, int cur, int len) { __u32 *addr; @@ -1091,15 +1081,12 @@ static void mb_clear_bits(spinlock_t *lock, void *bm, int cur, int len) cur += 32; continue; } - if (lock) - mb_clear_bit_atomic(lock, cur, bm); - else - mb_clear_bit(cur, bm); + mb_clear_bit(cur, bm); cur++; } } -static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len) +static void mb_set_bits(void *bm, int cur, int len) { __u32 *addr; @@ -1112,10 +1099,7 @@ static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len) cur += 32; continue; } - if (lock) - mb_set_bit_atomic(lock, cur, bm); - else - mb_set_bit(cur, bm); + mb_set_bit(cur, bm); cur++; } } @@ -1131,7 +1115,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, struct super_block *sb = e4b->bd_sb; BUG_ON(first + count > (sb->s_blocksize << 3)); - BUG_ON(!ext4_is_group_locked(sb, e4b->bd_group)); + assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group)); mb_check_buddy(e4b); mb_free_blocks_double(inode, e4b, first, count); @@ -1212,7 +1196,7 @@ static int mb_find_extent(struct ext4_buddy *e4b, int order, int block, int ord; void *buddy; - BUG_ON(!ext4_is_group_locked(e4b->bd_sb, e4b->bd_group)); + assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); BUG_ON(ex == NULL); buddy = mb_find_buddy(e4b, order, &max); @@ -1276,7 +1260,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3)); BUG_ON(e4b->bd_group != ex->fe_group); - BUG_ON(!ext4_is_group_locked(e4b->bd_sb, e4b->bd_group)); + assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); mb_check_buddy(e4b); mb_mark_used_double(e4b, start, len); @@ -1330,8 +1314,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) e4b->bd_info->bb_counters[ord]++; } - mb_set_bits(sb_bgl_lock(EXT4_SB(e4b->bd_sb), ex->fe_group), - EXT4_MB_BITMAP(e4b), ex->fe_start, len0); + mb_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0); mb_check_buddy(e4b); return ret; @@ -1726,7 +1709,6 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac, unsigned free, fragments; unsigned i, bits; int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb)); - struct ext4_group_desc *desc; struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); BUG_ON(cr < 0 || cr >= 4); @@ -1742,10 +1724,6 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac, switch (cr) { case 0: BUG_ON(ac->ac_2order == 0); - /* If this group is uninitialized, skip it initially */ - desc = ext4_get_group_desc(ac->ac_sb, group, NULL); - if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) - return 0; /* Avoid using the first bg of a flexgroup for data files */ if ((ac->ac_flags & EXT4_MB_HINT_DATA) && @@ -1788,6 +1766,7 @@ int ext4_mb_get_buddy_cache_lock(struct super_block *sb, ext4_group_t group) int block, pnum; int blocks_per_page; int groups_per_page; + ext4_group_t ngroups = ext4_get_groups_count(sb); ext4_group_t first_group; struct ext4_group_info *grp; @@ -1807,7 +1786,7 @@ int ext4_mb_get_buddy_cache_lock(struct super_block *sb, ext4_group_t group) /* read all groups the page covers into the cache */ for (i = 0; i < groups_per_page; i++) { - if ((first_group + i) >= EXT4_SB(sb)->s_groups_count) + if ((first_group + i) >= ngroups) break; grp = ext4_get_group_info(sb, first_group + i); /* take all groups write allocation @@ -1945,8 +1924,7 @@ err: static noinline_for_stack int ext4_mb_regular_allocator(struct ext4_allocation_context *ac) { - ext4_group_t group; - ext4_group_t i; + ext4_group_t ngroups, group, i; int cr; int err = 0; int bsbits; @@ -1957,6 +1935,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) sb = ac->ac_sb; sbi = EXT4_SB(sb); + ngroups = ext4_get_groups_count(sb); BUG_ON(ac->ac_status == AC_STATUS_FOUND); /* first, try the goal */ @@ -2017,11 +1996,11 @@ repeat: */ group = ac->ac_g_ex.fe_group; - for (i = 0; i < EXT4_SB(sb)->s_groups_count; group++, i++) { + for (i = 0; i < ngroups; group++, i++) { struct ext4_group_info *grp; struct ext4_group_desc *desc; - if (group == EXT4_SB(sb)->s_groups_count) + if (group == ngroups) group = 0; /* quick check to skip empty groups */ @@ -2064,9 +2043,7 @@ repeat: ac->ac_groups_scanned++; desc = ext4_get_group_desc(sb, group, NULL); - if (cr == 0 || (desc->bg_flags & - cpu_to_le16(EXT4_BG_BLOCK_UNINIT) && - ac->ac_2order != 0)) + if (cr == 0) ext4_mb_simple_scan_group(ac, &e4b); else if (cr == 1 && ac->ac_g_ex.fe_len == sbi->s_stripe) @@ -2315,12 +2292,10 @@ static struct file_operations ext4_mb_seq_history_fops = { static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) { struct super_block *sb = seq->private; - struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_group_t group; - if (*pos < 0 || *pos >= sbi->s_groups_count) + if (*pos < 0 || *pos >= ext4_get_groups_count(sb)) return NULL; - group = *pos + 1; return (void *) ((unsigned long) group); } @@ -2328,11 +2303,10 @@ static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos) { struct super_block *sb = seq->private; - struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_group_t group; ++*pos; - if (*pos < 0 || *pos >= sbi->s_groups_count) + if (*pos < 0 || *pos >= ext4_get_groups_count(sb)) return NULL; group = *pos + 1; return (void *) ((unsigned long) group); @@ -2420,7 +2394,8 @@ static void ext4_mb_history_release(struct super_block *sb) if (sbi->s_proc != NULL) { remove_proc_entry("mb_groups", sbi->s_proc); - remove_proc_entry("mb_history", sbi->s_proc); + if (sbi->s_mb_history_max) + remove_proc_entry("mb_history", sbi->s_proc); } kfree(sbi->s_mb_history); } @@ -2431,17 +2406,17 @@ static void ext4_mb_history_init(struct super_block *sb) int i; if (sbi->s_proc != NULL) { - proc_create_data("mb_history", S_IRUGO, sbi->s_proc, - &ext4_mb_seq_history_fops, sb); + if (sbi->s_mb_history_max) + proc_create_data("mb_history", S_IRUGO, sbi->s_proc, + &ext4_mb_seq_history_fops, sb); proc_create_data("mb_groups", S_IRUGO, sbi->s_proc, &ext4_mb_seq_groups_fops, sb); } - sbi->s_mb_history_max = 1000; sbi->s_mb_history_cur = 0; spin_lock_init(&sbi->s_mb_history_lock); i = sbi->s_mb_history_max * sizeof(struct ext4_mb_history); - sbi->s_mb_history = kzalloc(i, GFP_KERNEL); + sbi->s_mb_history = i ? kzalloc(i, GFP_KERNEL) : NULL; /* if we can't allocate history, then we simple won't use it */ } @@ -2451,7 +2426,7 @@ ext4_mb_store_history(struct ext4_allocation_context *ac) struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_mb_history h; - if (unlikely(sbi->s_mb_history == NULL)) + if (sbi->s_mb_history == NULL) return; if (!(ac->ac_op & sbi->s_mb_history_filter)) @@ -2587,6 +2562,7 @@ void ext4_mb_update_group_info(struct ext4_group_info *grp, ext4_grpblk_t add) static int ext4_mb_init_backend(struct super_block *sb) { + ext4_group_t ngroups = ext4_get_groups_count(sb); ext4_group_t i; int metalen; struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -2598,7 +2574,7 @@ static int ext4_mb_init_backend(struct super_block *sb) struct ext4_group_desc *desc; /* This is the number of blocks used by GDT */ - num_meta_group_infos = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - + num_meta_group_infos = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 1) >> EXT4_DESC_PER_BLOCK_BITS(sb); /* @@ -2644,7 +2620,7 @@ static int ext4_mb_init_backend(struct super_block *sb) for (i = 0; i < num_meta_group_infos; i++) { if ((i + 1) == num_meta_group_infos) metalen = sizeof(*meta_group_info) * - (sbi->s_groups_count - + (ngroups - (i << EXT4_DESC_PER_BLOCK_BITS(sb))); meta_group_info = kmalloc(metalen, GFP_KERNEL); if (meta_group_info == NULL) { @@ -2655,7 +2631,7 @@ static int ext4_mb_init_backend(struct super_block *sb) sbi->s_group_info[i] = meta_group_info; } - for (i = 0; i < sbi->s_groups_count; i++) { + for (i = 0; i < ngroups; i++) { desc = ext4_get_group_desc(sb, i, NULL); if (desc == NULL) { printk(KERN_ERR @@ -2761,7 +2737,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) return 0; } -/* need to called with ext4 group lock (ext4_lock_group) */ +/* need to called with the ext4 group lock held */ static void ext4_mb_cleanup_pa(struct ext4_group_info *grp) { struct ext4_prealloc_space *pa; @@ -2781,13 +2757,14 @@ static void ext4_mb_cleanup_pa(struct ext4_group_info *grp) int ext4_mb_release(struct super_block *sb) { + ext4_group_t ngroups = ext4_get_groups_count(sb); ext4_group_t i; int num_meta_group_infos; struct ext4_group_info *grinfo; struct ext4_sb_info *sbi = EXT4_SB(sb); if (sbi->s_group_info) { - for (i = 0; i < sbi->s_groups_count; i++) { + for (i = 0; i < ngroups; i++) { grinfo = ext4_get_group_info(sb, i); #ifdef DOUBLE_CHECK kfree(grinfo->bb_bitmap); @@ -2797,7 +2774,7 @@ int ext4_mb_release(struct super_block *sb) ext4_unlock_group(sb, i); kfree(grinfo); } - num_meta_group_infos = (sbi->s_groups_count + + num_meta_group_infos = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 1) >> EXT4_DESC_PER_BLOCK_BITS(sb); for (i = 0; i < num_meta_group_infos; i++) @@ -2984,27 +2961,25 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, + le32_to_cpu(es->s_first_data_block); len = ac->ac_b_ex.fe_len; - if (in_range(ext4_block_bitmap(sb, gdp), block, len) || - in_range(ext4_inode_bitmap(sb, gdp), block, len) || - in_range(block, ext4_inode_table(sb, gdp), - EXT4_SB(sb)->s_itb_per_group) || - in_range(block + len - 1, ext4_inode_table(sb, gdp), - EXT4_SB(sb)->s_itb_per_group)) { + if (!ext4_data_block_valid(sbi, block, len)) { ext4_error(sb, __func__, - "Allocating block %llu in system zone of %d group\n", - block, ac->ac_b_ex.fe_group); + "Allocating blocks %llu-%llu which overlap " + "fs metadata\n", block, block+len); /* File system mounted not to panic on error * Fix the bitmap and repeat the block allocation * We leak some of the blocks here. */ - mb_set_bits(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group), - bitmap_bh->b_data, ac->ac_b_ex.fe_start, - ac->ac_b_ex.fe_len); + ext4_lock_group(sb, ac->ac_b_ex.fe_group); + mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start, + ac->ac_b_ex.fe_len); + ext4_unlock_group(sb, ac->ac_b_ex.fe_group); err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); if (!err) err = -EAGAIN; goto out_err; } + + ext4_lock_group(sb, ac->ac_b_ex.fe_group); #ifdef AGGRESSIVE_CHECK { int i; @@ -3014,9 +2989,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, } } #endif - spin_lock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); - mb_set_bits(NULL, bitmap_bh->b_data, - ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len); + mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,ac->ac_b_ex.fe_len); if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); ext4_free_blks_set(sb, gdp, @@ -3026,7 +2999,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, len = ext4_free_blks_count(sb, gdp) - ac->ac_b_ex.fe_len; ext4_free_blks_set(sb, gdp, len); gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp); - spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); + + ext4_unlock_group(sb, ac->ac_b_ex.fe_group); percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len); /* * Now reduce the dirty block count also. Should not go negative @@ -3459,7 +3433,7 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) * the function goes through all block freed in the group * but not yet committed and marks them used in in-core bitmap. * buddy must be generated from this bitmap - * Need to be called with ext4 group lock (ext4_lock_group) + * Need to be called with the ext4 group lock held */ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, ext4_group_t group) @@ -3473,9 +3447,7 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, while (n) { entry = rb_entry(n, struct ext4_free_data, node); - mb_set_bits(sb_bgl_lock(EXT4_SB(sb), group), - bitmap, entry->start_blk, - entry->count); + mb_set_bits(bitmap, entry->start_blk, entry->count); n = rb_next(n); } return; @@ -3484,7 +3456,7 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, /* * the function goes through all preallocation in this group and marks them * used in in-core bitmap. buddy must be generated from this bitmap - * Need to be called with ext4 group lock (ext4_lock_group) + * Need to be called with ext4 group lock held */ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ext4_group_t group) @@ -3516,8 +3488,7 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, if (unlikely(len == 0)) continue; BUG_ON(groupnr != group); - mb_set_bits(sb_bgl_lock(EXT4_SB(sb), group), - bitmap, start, len); + mb_set_bits(bitmap, start, len); preallocated += len; count++; } @@ -4121,7 +4092,7 @@ static void ext4_mb_return_to_preallocation(struct inode *inode, static void ext4_mb_show_ac(struct ext4_allocation_context *ac) { struct super_block *sb = ac->ac_sb; - ext4_group_t i; + ext4_group_t ngroups, i; printk(KERN_ERR "EXT4-fs: Can't allocate:" " Allocation context details:\n"); @@ -4145,7 +4116,8 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) printk(KERN_ERR "EXT4-fs: %lu scanned, %d found\n", ac->ac_ex_scanned, ac->ac_found); printk(KERN_ERR "EXT4-fs: groups: \n"); - for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) { + ngroups = ext4_get_groups_count(sb); + for (i = 0; i < ngroups; i++) { struct ext4_group_info *grp = ext4_get_group_info(sb, i); struct ext4_prealloc_space *pa; ext4_grpblk_t start; @@ -4469,13 +4441,13 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac) static int ext4_mb_discard_preallocations(struct super_block *sb, int needed) { - ext4_group_t i; + ext4_group_t i, ngroups = ext4_get_groups_count(sb); int ret; int freed = 0; trace_mark(ext4_mb_discard_preallocations, "dev %s needed %d", sb->s_id, needed); - for (i = 0; i < EXT4_SB(sb)->s_groups_count && needed > 0; i++) { + for (i = 0; i < ngroups && needed > 0; i++) { ret = ext4_mb_discard_group_preallocations(sb, i, needed); freed += ret; needed -= ret; @@ -4859,29 +4831,25 @@ do_more: new_entry->group = block_group; new_entry->count = count; new_entry->t_tid = handle->h_transaction->t_tid; + ext4_lock_group(sb, block_group); - mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data, - bit, count); + mb_clear_bits(bitmap_bh->b_data, bit, count); ext4_mb_free_metadata(handle, &e4b, new_entry); - ext4_unlock_group(sb, block_group); } else { - ext4_lock_group(sb, block_group); /* need to update group_info->bb_free and bitmap * with group lock held. generate_buddy look at * them with group lock_held */ - mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data, - bit, count); + ext4_lock_group(sb, block_group); + mb_clear_bits(bitmap_bh->b_data, bit, count); mb_free_blocks(inode, &e4b, bit, count); ext4_mb_return_to_preallocation(inode, &e4b, block, count); - ext4_unlock_group(sb, block_group); } - spin_lock(sb_bgl_lock(sbi, block_group)); ret = ext4_free_blks_count(sb, gdp) + count; ext4_free_blks_set(sb, gdp, ret); gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp); - spin_unlock(sb_bgl_lock(sbi, block_group)); + ext4_unlock_group(sb, block_group); percpu_counter_add(&sbi->s_freeblocks_counter, count); if (sbi->s_log_groups_per_flex) { diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index dd9e6cd5f6cf..75e34f69215b 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -23,7 +23,6 @@ #include <linux/mutex.h> #include "ext4_jbd2.h" #include "ext4.h" -#include "group.h" /* * with AGGRESSIVE_CHECK allocator runs consistency checks over diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 22098e1cd085..f2bc160463b7 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -37,7 +37,6 @@ #include "ext4.h" #include "ext4_jbd2.h" -#include "namei.h" #include "xattr.h" #include "acl.h" @@ -1997,7 +1996,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) if (!ext4_handle_valid(handle)) return 0; - lock_super(sb); + mutex_lock(&EXT4_SB(sb)->s_orphan_lock); if (!list_empty(&EXT4_I(inode)->i_orphan)) goto out_unlock; @@ -2006,9 +2005,13 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) /* @@@ FIXME: Observation from aviro: * I think I can trigger J_ASSERT in ext4_orphan_add(). We block - * here (on lock_super()), so race with ext4_link() which might bump + * here (on s_orphan_lock), so race with ext4_link() which might bump * ->i_nlink. For, say it, character device. Not a regular file, * not a directory, not a symlink and ->i_nlink > 0. + * + * tytso, 4/25/2009: I'm not sure how that could happen; + * shouldn't the fs core protect us from these sort of + * unlink()/link() races? */ J_ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) || inode->i_nlink == 0); @@ -2045,7 +2048,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) jbd_debug(4, "orphan inode %lu will point to %d\n", inode->i_ino, NEXT_ORPHAN(inode)); out_unlock: - unlock_super(sb); + mutex_unlock(&EXT4_SB(sb)->s_orphan_lock); ext4_std_error(inode->i_sb, err); return err; } @@ -2066,11 +2069,9 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode) if (!ext4_handle_valid(handle)) return 0; - lock_super(inode->i_sb); - if (list_empty(&ei->i_orphan)) { - unlock_super(inode->i_sb); - return 0; - } + mutex_lock(&EXT4_SB(inode->i_sb)->s_orphan_lock); + if (list_empty(&ei->i_orphan)) + goto out; ino_next = NEXT_ORPHAN(inode); prev = ei->i_orphan.prev; @@ -2120,7 +2121,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode) out_err: ext4_std_error(inode->i_sb, err); out: - unlock_super(inode->i_sb); + mutex_unlock(&EXT4_SB(inode->i_sb)->s_orphan_lock); return err; out_brelse: @@ -2533,6 +2534,7 @@ const struct inode_operations ext4_dir_inode_operations = { .removexattr = generic_removexattr, #endif .permission = ext4_permission, + .fiemap = ext4_fiemap, }; const struct inode_operations ext4_special_inode_operations = { diff --git a/fs/ext4/namei.h b/fs/ext4/namei.h deleted file mode 100644 index 5e4dfff36a00..000000000000 --- a/fs/ext4/namei.h +++ /dev/null @@ -1,8 +0,0 @@ -/* linux/fs/ext4/namei.h - * - * Copyright (C) 2005 Simtec Electronics - * Ben Dooks <ben@simtec.co.uk> - * -*/ - -extern struct dentry *ext4_get_parent(struct dentry *child); diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 546c7dd869e1..27eb289eea37 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -15,7 +15,6 @@ #include <linux/slab.h> #include "ext4_jbd2.h" -#include "group.h" #define outside(b, first, last) ((b) < (first) || (b) >= (last)) #define inside(b, first, last) ((b) >= (first) && (b) < (last)) @@ -193,7 +192,7 @@ static int setup_new_group_blocks(struct super_block *sb, if (IS_ERR(handle)) return PTR_ERR(handle); - lock_super(sb); + mutex_lock(&sbi->s_resize_lock); if (input->group != sbi->s_groups_count) { err = -EBUSY; goto exit_journal; @@ -302,7 +301,7 @@ exit_bh: brelse(bh); exit_journal: - unlock_super(sb); + mutex_unlock(&sbi->s_resize_lock); if ((err2 = ext4_journal_stop(handle)) && !err) err = err2; @@ -643,11 +642,12 @@ exit_free: * important part is that the new block and inode counts are in the backup * superblocks, and the location of the new group metadata in the GDT backups. * - * We do not need lock_super() for this, because these blocks are not - * otherwise touched by the filesystem code when it is mounted. We don't - * need to worry about last changing from sbi->s_groups_count, because the - * worst that can happen is that we do not copy the full number of backups - * at this time. The resize which changed s_groups_count will backup again. + * We do not need take the s_resize_lock for this, because these + * blocks are not otherwise touched by the filesystem code when it is + * mounted. We don't need to worry about last changing from + * sbi->s_groups_count, because the worst that can happen is that we + * do not copy the full number of backups at this time. The resize + * which changed s_groups_count will backup again. */ static void update_backups(struct super_block *sb, int blk_off, char *data, int size) @@ -809,7 +809,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) goto exit_put; } - lock_super(sb); + mutex_lock(&sbi->s_resize_lock); if (input->group != sbi->s_groups_count) { ext4_warning(sb, __func__, "multiple resizers run on filesystem!"); @@ -840,7 +840,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) /* * OK, now we've set up the new group. Time to make it active. * - * Current kernels don't lock all allocations via lock_super(), + * We do not lock all allocations via s_resize_lock * so we have to be safe wrt. concurrent accesses the group * data. So we need to be careful to set all of the relevant * group descriptor data etc. *before* we enable the group. @@ -900,12 +900,12 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) * * The precise rules we use are: * - * * Writers of s_groups_count *must* hold lock_super + * * Writers of s_groups_count *must* hold s_resize_lock * AND * * Writers must perform a smp_wmb() after updating all dependent * data and before modifying the groups count * - * * Readers must hold lock_super() over the access + * * Readers must hold s_resize_lock over the access * OR * * Readers must perform an smp_rmb() after reading the groups count * and before reading any dependent data. @@ -948,7 +948,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) sb->s_dirt = 1; exit_journal: - unlock_super(sb); + mutex_unlock(&sbi->s_resize_lock); if ((err2 = ext4_journal_stop(handle)) && !err) err = err2; if (!err) { @@ -986,7 +986,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, /* We don't need to worry about locking wrt other resizers just * yet: we're going to revalidate es->s_blocks_count after - * taking lock_super() below. */ + * taking the s_resize_lock below. */ o_blocks_count = ext4_blocks_count(es); o_groups_count = EXT4_SB(sb)->s_groups_count; @@ -1056,11 +1056,11 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, goto exit_put; } - lock_super(sb); + mutex_lock(&EXT4_SB(sb)->s_resize_lock); if (o_blocks_count != ext4_blocks_count(es)) { ext4_warning(sb, __func__, "multiple resizers run on filesystem!"); - unlock_super(sb); + mutex_unlock(&EXT4_SB(sb)->s_resize_lock); ext4_journal_stop(handle); err = -EBUSY; goto exit_put; @@ -1070,14 +1070,14 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, EXT4_SB(sb)->s_sbh))) { ext4_warning(sb, __func__, "error %d on journal write access", err); - unlock_super(sb); + mutex_unlock(&EXT4_SB(sb)->s_resize_lock); ext4_journal_stop(handle); goto exit_put; } ext4_blocks_count_set(es, o_blocks_count + add); ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); sb->s_dirt = 1; - unlock_super(sb); + mutex_unlock(&EXT4_SB(sb)->s_resize_lock); ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count, o_blocks_count + add); /* We add the blocks to the bitmap and set the group need init bit */ diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 2958f4e6f222..eca6c057b119 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -20,6 +20,7 @@ #include <linux/string.h> #include <linux/fs.h> #include <linux/time.h> +#include <linux/vmalloc.h> #include <linux/jbd2.h> #include <linux/slab.h> #include <linux/init.h> @@ -45,16 +46,20 @@ #include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" -#include "namei.h" -#include "group.h" + +static int default_mb_history_length = 1000; + +module_param_named(default_mb_history_length, default_mb_history_length, + int, 0644); +MODULE_PARM_DESC(default_mb_history_length, + "Default number of entries saved for mb_history"); struct proc_dir_entry *ext4_proc_root; static struct kset *ext4_kset; static int ext4_load_journal(struct super_block *, struct ext4_super_block *, unsigned long journal_devnum); -static int ext4_commit_super(struct super_block *sb, - struct ext4_super_block *es, int sync); +static int ext4_commit_super(struct super_block *sb, int sync); static void ext4_mark_recovery_complete(struct super_block *sb, struct ext4_super_block *es); static void ext4_clear_journal_err(struct super_block *sb, @@ -305,7 +310,7 @@ static void ext4_handle_error(struct super_block *sb) printk(KERN_CRIT "Remounting filesystem read-only\n"); sb->s_flags |= MS_RDONLY; } - ext4_commit_super(sb, es, 1); + ext4_commit_super(sb, 1); if (test_opt(sb, ERRORS_PANIC)) panic("EXT4-fs (device %s): panic forced after error\n", sb->s_id); @@ -447,7 +452,7 @@ __acquires(bitlock) if (test_opt(sb, ERRORS_CONT)) { EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; es->s_state |= cpu_to_le16(EXT4_ERROR_FS); - ext4_commit_super(sb, es, 0); + ext4_commit_super(sb, 0); return; } ext4_unlock_group(sb, grp); @@ -563,6 +568,7 @@ static void ext4_put_super(struct super_block *sb) struct ext4_super_block *es = sbi->s_es; int i, err; + ext4_release_system_zone(sb); ext4_mb_release(sb); ext4_ext_release(sb); ext4_xattr_put_super(sb); @@ -576,7 +582,7 @@ static void ext4_put_super(struct super_block *sb) if (!(sb->s_flags & MS_RDONLY)) { EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); es->s_state = cpu_to_le16(sbi->s_mount_state); - ext4_commit_super(sb, es, 1); + ext4_commit_super(sb, 1); } if (sbi->s_proc) { remove_proc_entry(sb->s_id, ext4_proc_root); @@ -586,7 +592,10 @@ static void ext4_put_super(struct super_block *sb) for (i = 0; i < sbi->s_gdb_count; i++) brelse(sbi->s_group_desc[i]); kfree(sbi->s_group_desc); - kfree(sbi->s_flex_groups); + if (is_vmalloc_addr(sbi->s_flex_groups)) + vfree(sbi->s_flex_groups); + else + kfree(sbi->s_flex_groups); percpu_counter_destroy(&sbi->s_freeblocks_counter); percpu_counter_destroy(&sbi->s_freeinodes_counter); percpu_counter_destroy(&sbi->s_dirs_counter); @@ -992,7 +1001,6 @@ static const struct super_operations ext4_sops = { .dirty_inode = ext4_dirty_inode, .delete_inode = ext4_delete_inode, .put_super = ext4_put_super, - .write_super = ext4_write_super, .sync_fs = ext4_sync_fs, .freeze_fs = ext4_freeze, .unfreeze_fs = ext4_unfreeze, @@ -1007,6 +1015,25 @@ static const struct super_operations ext4_sops = { .bdev_try_to_free_page = bdev_try_to_free_page, }; +static const struct super_operations ext4_nojournal_sops = { + .alloc_inode = ext4_alloc_inode, + .destroy_inode = ext4_destroy_inode, + .write_inode = ext4_write_inode, + .dirty_inode = ext4_dirty_inode, + .delete_inode = ext4_delete_inode, + .write_super = ext4_write_super, + .put_super = ext4_put_super, + .statfs = ext4_statfs, + .remount_fs = ext4_remount, + .clear_inode = ext4_clear_inode, + .show_options = ext4_show_options, +#ifdef CONFIG_QUOTA + .quota_read = ext4_quota_read, + .quota_write = ext4_quota_write, +#endif + .bdev_try_to_free_page = bdev_try_to_free_page, +}; + static const struct export_operations ext4_export_ops = { .fh_to_dentry = ext4_fh_to_dentry, .fh_to_parent = ext4_fh_to_parent, @@ -1023,12 +1050,13 @@ enum { Opt_journal_update, Opt_journal_dev, Opt_journal_checksum, Opt_journal_async_commit, Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, - Opt_data_err_abort, Opt_data_err_ignore, + Opt_data_err_abort, Opt_data_err_ignore, Opt_mb_history_length, Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, Opt_resize, Opt_usrquota, Opt_grpquota, Opt_i_version, Opt_stripe, Opt_delalloc, Opt_nodelalloc, + Opt_block_validity, Opt_noblock_validity, Opt_inode_readahead_blks, Opt_journal_ioprio }; @@ -1069,6 +1097,7 @@ static const match_table_t tokens = { {Opt_data_writeback, "data=writeback"}, {Opt_data_err_abort, "data_err=abort"}, {Opt_data_err_ignore, "data_err=ignore"}, + {Opt_mb_history_length, "mb_history_length=%u"}, {Opt_offusrjquota, "usrjquota="}, {Opt_usrjquota, "usrjquota=%s"}, {Opt_offgrpjquota, "grpjquota="}, @@ -1087,6 +1116,8 @@ static const match_table_t tokens = { {Opt_resize, "resize"}, {Opt_delalloc, "delalloc"}, {Opt_nodelalloc, "nodelalloc"}, + {Opt_block_validity, "block_validity"}, + {Opt_noblock_validity, "noblock_validity"}, {Opt_inode_readahead_blks, "inode_readahead_blks=%u"}, {Opt_journal_ioprio, "journal_ioprio=%u"}, {Opt_auto_da_alloc, "auto_da_alloc=%u"}, @@ -1310,6 +1341,13 @@ static int parse_options(char *options, struct super_block *sb, case Opt_data_err_ignore: clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT); break; + case Opt_mb_history_length: + if (match_int(&args[0], &option)) + return 0; + if (option < 0) + return 0; + sbi->s_mb_history_max = option; + break; #ifdef CONFIG_QUOTA case Opt_usrjquota: qtype = USRQUOTA; @@ -1474,12 +1512,18 @@ set_qf_format: case Opt_delalloc: set_opt(sbi->s_mount_opt, DELALLOC); break; + case Opt_block_validity: + set_opt(sbi->s_mount_opt, BLOCK_VALIDITY); + break; + case Opt_noblock_validity: + clear_opt(sbi->s_mount_opt, BLOCK_VALIDITY); + break; case Opt_inode_readahead_blks: if (match_int(&args[0], &option)) return 0; if (option < 0 || option > (1 << 30)) return 0; - if (option & (option - 1)) { + if (!is_power_of_2(option)) { printk(KERN_ERR "EXT4-fs: inode_readahead_blks" " must be a power of 2\n"); return 0; @@ -1592,7 +1636,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, if (sbi->s_journal) EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); - ext4_commit_super(sb, es, 1); + ext4_commit_super(sb, 1); if (test_opt(sb, DEBUG)) printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, " "bpg=%lu, ipg=%lu, mo=%04lx]\n", @@ -1620,6 +1664,7 @@ static int ext4_fill_flex_info(struct super_block *sb) ext4_group_t flex_group_count; ext4_group_t flex_group; int groups_per_flex = 0; + size_t size; int i; if (!sbi->s_es->s_log_groups_per_flex) { @@ -1634,8 +1679,13 @@ static int ext4_fill_flex_info(struct super_block *sb) flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) + ((le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) + 1) << EXT4_DESC_PER_BLOCK_BITS(sb))) / groups_per_flex; - sbi->s_flex_groups = kzalloc(flex_group_count * - sizeof(struct flex_groups), GFP_KERNEL); + size = flex_group_count * sizeof(struct flex_groups); + sbi->s_flex_groups = kzalloc(size, GFP_KERNEL); + if (sbi->s_flex_groups == NULL) { + sbi->s_flex_groups = vmalloc(size); + if (sbi->s_flex_groups) + memset(sbi->s_flex_groups, 0, size); + } if (sbi->s_flex_groups == NULL) { printk(KERN_ERR "EXT4-fs: not enough memory for " "%u flex groups\n", flex_group_count); @@ -1744,18 +1794,18 @@ static int ext4_check_descriptors(struct super_block *sb) "(block %llu)!\n", i, inode_table); return 0; } - spin_lock(sb_bgl_lock(sbi, i)); + ext4_lock_group(sb, i); if (!ext4_group_desc_csum_verify(sbi, i, gdp)) { printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " "Checksum for group %u failed (%u!=%u)\n", i, le16_to_cpu(ext4_group_desc_csum(sbi, i, gdp)), le16_to_cpu(gdp->bg_checksum)); if (!(sb->s_flags & MS_RDONLY)) { - spin_unlock(sb_bgl_lock(sbi, i)); + ext4_unlock_group(sb, i); return 0; } } - spin_unlock(sb_bgl_lock(sbi, i)); + ext4_unlock_group(sb, i); if (!flexbg_flag) first_block += EXT4_BLOCKS_PER_GROUP(sb); } @@ -2091,8 +2141,7 @@ static ssize_t inode_readahead_blks_store(struct ext4_attr *a, if (parse_strtoul(buf, 0x40000000, &t)) return -EINVAL; - /* inode_readahead_blks must be a power of 2 */ - if (t & (t-1)) + if (!is_power_of_2(t)) return -EINVAL; sbi->s_inode_readahead_blks = t; @@ -2321,6 +2370,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ; sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME; sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME; + sbi->s_mb_history_max = default_mb_history_length; set_opt(sbi->s_mount_opt, BARRIER); @@ -2607,7 +2657,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) /* * set up enough so that it can read an inode */ - sb->s_op = &ext4_sops; + if (!test_opt(sb, NOLOAD) && + EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) + sb->s_op = &ext4_sops; + else + sb->s_op = &ext4_nojournal_sops; sb->s_export_op = &ext4_export_ops; sb->s_xattr = ext4_xattr_handlers; #ifdef CONFIG_QUOTA @@ -2615,6 +2669,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sb->dq_op = &ext4_quota_operations; #endif INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ + mutex_init(&sbi->s_orphan_lock); + mutex_init(&sbi->s_resize_lock); sb->s_root = NULL; @@ -2646,7 +2702,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) if (test_opt(sb, ERRORS_PANIC)) { EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; es->s_state |= cpu_to_le16(EXT4_ERROR_FS); - ext4_commit_super(sb, es, 1); + ext4_commit_super(sb, 1); goto failed_mount4; } } @@ -2780,6 +2836,13 @@ no_journal: } else if (test_opt(sb, DELALLOC)) printk(KERN_INFO "EXT4-fs: delayed allocation enabled\n"); + err = ext4_setup_system_zone(sb); + if (err) { + printk(KERN_ERR "EXT4-fs: failed to initialize system " + "zone (%d)\n", err); + goto failed_mount4; + } + ext4_ext_init(sb); err = ext4_mb_init(sb, needs_recovery); if (err) { @@ -2798,14 +2861,6 @@ no_journal: goto failed_mount4; }; - /* - * akpm: core read_super() calls in here with the superblock locked. - * That deadlocks, because orphan cleanup needs to lock the superblock - * in numerous places. Here we just pop the lock - it's relatively - * harmless, because we are now ready to accept write_super() requests, - * and aviro says that's the only reason for hanging onto the - * superblock lock. - */ EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS; ext4_orphan_cleanup(sb, es); EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS; @@ -2837,11 +2892,18 @@ cantfind_ext4: failed_mount4: printk(KERN_ERR "EXT4-fs (device %s): mount failed\n", sb->s_id); + ext4_release_system_zone(sb); if (sbi->s_journal) { jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; } failed_mount3: + if (sbi->s_flex_groups) { + if (is_vmalloc_addr(sbi->s_flex_groups)) + vfree(sbi->s_flex_groups); + else + kfree(sbi->s_flex_groups); + } percpu_counter_destroy(&sbi->s_freeblocks_counter); percpu_counter_destroy(&sbi->s_freeinodes_counter); percpu_counter_destroy(&sbi->s_dirs_counter); @@ -2862,6 +2924,7 @@ failed_mount: brelse(bh); out_fail: sb->s_fs_info = NULL; + kfree(sbi->s_blockgroup_lock); kfree(sbi); lock_kernel(); return ret; @@ -3114,18 +3177,17 @@ static int ext4_load_journal(struct super_block *sb, if (journal_devnum && journal_devnum != le32_to_cpu(es->s_journal_dev)) { es->s_journal_dev = cpu_to_le32(journal_devnum); - sb->s_dirt = 1; /* Make sure we flush the recovery flag to disk. */ - ext4_commit_super(sb, es, 1); + ext4_commit_super(sb, 1); } return 0; } -static int ext4_commit_super(struct super_block *sb, - struct ext4_super_block *es, int sync) +static int ext4_commit_super(struct super_block *sb, int sync) { + struct ext4_super_block *es = EXT4_SB(sb)->s_es; struct buffer_head *sbh = EXT4_SB(sb)->s_sbh; int error = 0; @@ -3154,7 +3216,7 @@ static int ext4_commit_super(struct super_block *sb, &EXT4_SB(sb)->s_freeblocks_counter)); es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive( &EXT4_SB(sb)->s_freeinodes_counter)); - + sb->s_dirt = 0; BUFFER_TRACE(sbh, "marking dirty"); mark_buffer_dirty(sbh); if (sync) { @@ -3192,14 +3254,11 @@ static void ext4_mark_recovery_complete(struct super_block *sb, if (jbd2_journal_flush(journal) < 0) goto out; - lock_super(sb); if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER) && sb->s_flags & MS_RDONLY) { EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); - sb->s_dirt = 0; - ext4_commit_super(sb, es, 1); + ext4_commit_super(sb, 1); } - unlock_super(sb); out: jbd2_journal_unlock_updates(journal); @@ -3238,7 +3297,7 @@ static void ext4_clear_journal_err(struct super_block *sb, EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; es->s_state |= cpu_to_le16(EXT4_ERROR_FS); - ext4_commit_super(sb, es, 1); + ext4_commit_super(sb, 1); jbd2_journal_clear_err(journal); } @@ -3257,29 +3316,15 @@ int ext4_force_commit(struct super_block *sb) return 0; journal = EXT4_SB(sb)->s_journal; - if (journal) { - sb->s_dirt = 0; + if (journal) ret = ext4_journal_force_commit(journal); - } return ret; } -/* - * Ext4 always journals updates to the superblock itself, so we don't - * have to propagate any other updates to the superblock on disk at this - * point. (We can probably nuke this function altogether, and remove - * any mention to sb->s_dirt in all of fs/ext4; eventual cleanup...) - */ static void ext4_write_super(struct super_block *sb) { - if (EXT4_SB(sb)->s_journal) { - if (mutex_trylock(&sb->s_lock) != 0) - BUG(); - sb->s_dirt = 0; - } else { - ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1); - } + ext4_commit_super(sb, 1); } static int ext4_sync_fs(struct super_block *sb, int wait) @@ -3288,16 +3333,9 @@ static int ext4_sync_fs(struct super_block *sb, int wait) tid_t target; trace_mark(ext4_sync_fs, "dev %s wait %d", sb->s_id, wait); - sb->s_dirt = 0; - if (EXT4_SB(sb)->s_journal) { - if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, - &target)) { - if (wait) - jbd2_log_wait_commit(EXT4_SB(sb)->s_journal, - target); - } - } else { - ext4_commit_super(sb, EXT4_SB(sb)->s_es, wait); + if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) { + if (wait) + jbd2_log_wait_commit(EXT4_SB(sb)->s_journal, target); } return ret; } @@ -3310,34 +3348,32 @@ static int ext4_freeze(struct super_block *sb) { int error = 0; journal_t *journal; - sb->s_dirt = 0; - if (!(sb->s_flags & MS_RDONLY)) { - journal = EXT4_SB(sb)->s_journal; + if (sb->s_flags & MS_RDONLY) + return 0; - if (journal) { - /* Now we set up the journal barrier. */ - jbd2_journal_lock_updates(journal); + journal = EXT4_SB(sb)->s_journal; - /* - * We don't want to clear needs_recovery flag when we - * failed to flush the journal. - */ - error = jbd2_journal_flush(journal); - if (error < 0) - goto out; - } + /* Now we set up the journal barrier. */ + jbd2_journal_lock_updates(journal); - /* Journal blocked and flushed, clear needs_recovery flag. */ - EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); - error = ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1); - if (error) - goto out; + /* + * Don't clear the needs_recovery flag if we failed to flush + * the journal. + */ + error = jbd2_journal_flush(journal); + if (error < 0) { + out: + jbd2_journal_unlock_updates(journal); + return error; } + + /* Journal blocked and flushed, clear needs_recovery flag. */ + EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); + error = ext4_commit_super(sb, 1); + if (error) + goto out; return 0; -out: - jbd2_journal_unlock_updates(journal); - return error; } /* @@ -3346,14 +3382,15 @@ out: */ static int ext4_unfreeze(struct super_block *sb) { - if (EXT4_SB(sb)->s_journal && !(sb->s_flags & MS_RDONLY)) { - lock_super(sb); - /* Reser the needs_recovery flag before the fs is unlocked. */ - EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); - ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1); - unlock_super(sb); - jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); - } + if (sb->s_flags & MS_RDONLY) + return 0; + + lock_super(sb); + /* Reset the needs_recovery flag before the fs is unlocked. */ + EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); + ext4_commit_super(sb, 1); + unlock_super(sb); + jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); return 0; } @@ -3432,15 +3469,8 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) (sbi->s_mount_state & EXT4_VALID_FS)) es->s_state = cpu_to_le16(sbi->s_mount_state); - /* - * We have to unlock super so that we can wait for - * transactions. - */ - if (sbi->s_journal) { - unlock_super(sb); + if (sbi->s_journal) ext4_mark_recovery_complete(sb, es); - lock_super(sb); - } } else { int ret; if ((ret = EXT4_HAS_RO_COMPAT_FEATURE(sb, @@ -3504,8 +3534,9 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) sb->s_flags &= ~MS_RDONLY; } } + ext4_setup_system_zone(sb); if (sbi->s_journal == NULL) - ext4_commit_super(sb, es, 1); + ext4_commit_super(sb, 1); #ifdef CONFIG_QUOTA /* Release old quota file names */ @@ -3545,9 +3576,8 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) if (test_opt(sb, MINIX_DF)) { sbi->s_overhead_last = 0; } else if (sbi->s_blocks_last != ext4_blocks_count(es)) { - ext4_group_t ngroups = sbi->s_groups_count, i; + ext4_group_t i, ngroups = ext4_get_groups_count(sb); ext4_fsblk_t overhead = 0; - smp_rmb(); /* * Compute the overhead (FS structures). This is constant @@ -3917,13 +3947,16 @@ static int __init init_ext4_fs(void) { int err; + err = init_ext4_system_zone(); + if (err) + return err; ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj); if (!ext4_kset) - return -ENOMEM; + goto out4; ext4_proc_root = proc_mkdir("fs/ext4", NULL); err = init_ext4_mballoc(); if (err) - return err; + goto out3; err = init_ext4_xattr(); if (err) @@ -3948,6 +3981,11 @@ out1: exit_ext4_xattr(); out2: exit_ext4_mballoc(); +out3: + remove_proc_entry("fs/ext4", NULL); + kset_unregister(ext4_kset); +out4: + exit_ext4_system_zone(); return err; } @@ -3962,6 +4000,7 @@ static void __exit exit_ext4_fs(void) exit_ext4_mballoc(); remove_proc_entry("fs/ext4", NULL); kset_unregister(ext4_kset); + exit_ext4_system_zone(); } MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); diff --git a/fs/ioctl.c b/fs/ioctl.c index 82d9c42b8bac..286f38dfc6c0 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -414,10 +414,6 @@ static int file_ioctl(struct file *filp, unsigned int cmd, switch (cmd) { case FIBMAP: return ioctl_fibmap(filp, p); - case FS_IOC_FIEMAP: - return ioctl_fiemap(filp, arg); - case FIGETBSZ: - return put_user(inode->i_sb->s_blocksize, p); case FIONREAD: return put_user(i_size_read(inode) - filp->f_pos, p); } @@ -557,6 +553,16 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd, error = ioctl_fsthaw(filp); break; + case FS_IOC_FIEMAP: + return ioctl_fiemap(filp, arg); + + case FIGETBSZ: + { + struct inode *inode = filp->f_path.dentry->d_inode; + int __user *p = (int __user *)arg; + return put_user(inode->i_sb->s_blocksize, p); + } + default: if (S_ISREG(filp->f_path.dentry->d_inode->i_mode)) error = file_ioctl(filp, cmd, arg); diff --git a/fs/mpage.c b/fs/mpage.c index 680ba60863ff..42381bd6543b 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -379,7 +379,8 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages, struct buffer_head map_bh; unsigned long first_logical_block = 0; - clear_buffer_mapped(&map_bh); + map_bh.b_state = 0; + map_bh.b_size = 0; for (page_idx = 0; page_idx < nr_pages; page_idx++) { struct page *page = list_entry(pages->prev, struct page, lru); @@ -412,7 +413,8 @@ int mpage_readpage(struct page *page, get_block_t get_block) struct buffer_head map_bh; unsigned long first_logical_block = 0; - clear_buffer_mapped(&map_bh); + map_bh.b_state = 0; + map_bh.b_size = 0; bio = do_mpage_readpage(bio, page, 1, &last_block_in_bio, &map_bh, &first_logical_block, get_block); if (bio) |