summaryrefslogtreecommitdiff
path: root/fs/btrfs/disk-io.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs/disk-io.c')
-rw-r--r--fs/btrfs/disk-io.c574
1 files changed, 468 insertions, 106 deletions
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 1ac8db5dc0a3..102c176fc29c 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -100,38 +100,83 @@ struct async_submit_bio {
struct btrfs_work work;
};
-/* These are used to set the lockdep class on the extent buffer locks.
- * The class is set by the readpage_end_io_hook after the buffer has
- * passed csum validation but before the pages are unlocked.
+/*
+ * Lockdep class keys for extent_buffer->lock's in this root. For a given
+ * eb, the lockdep key is determined by the btrfs_root it belongs to and
+ * the level the eb occupies in the tree.
+ *
+ * Different roots are used for different purposes and may nest inside each
+ * other and they require separate keysets. As lockdep keys should be
+ * static, assign keysets according to the purpose of the root as indicated
+ * by btrfs_root->objectid. This ensures that all special purpose roots
+ * have separate keysets.
*
- * The lockdep class is also set by btrfs_init_new_buffer on freshly
- * allocated blocks.
+ * Lock-nesting across peer nodes is always done with the immediate parent
+ * node locked thus preventing deadlock. As lockdep doesn't know this, use
+ * subclass to avoid triggering lockdep warning in such cases.
*
- * The class is based on the level in the tree block, which allows lockdep
- * to know that lower nodes nest inside the locks of higher nodes.
+ * The key is set by the readpage_end_io_hook after the buffer has passed
+ * csum validation but before the pages are unlocked. It is also set by
+ * btrfs_init_new_buffer on freshly allocated blocks.
*
- * We also add a check to make sure the highest level of the tree is
- * the same as our lockdep setup here. If BTRFS_MAX_LEVEL changes, this
- * code needs update as well.
+ * We also add a check to make sure the highest level of the tree is the
+ * same as our lockdep setup here. If BTRFS_MAX_LEVEL changes, this code
+ * needs update as well.
*/
#ifdef CONFIG_DEBUG_LOCK_ALLOC
# if BTRFS_MAX_LEVEL != 8
# error
# endif
-static struct lock_class_key btrfs_eb_class[BTRFS_MAX_LEVEL + 1];
-static const char *btrfs_eb_name[BTRFS_MAX_LEVEL + 1] = {
- /* leaf */
- "btrfs-extent-00",
- "btrfs-extent-01",
- "btrfs-extent-02",
- "btrfs-extent-03",
- "btrfs-extent-04",
- "btrfs-extent-05",
- "btrfs-extent-06",
- "btrfs-extent-07",
- /* highest possible level */
- "btrfs-extent-08",
+
+static struct btrfs_lockdep_keyset {
+ u64 id; /* root objectid */
+ const char *name_stem; /* lock name stem */
+ char names[BTRFS_MAX_LEVEL + 1][20];
+ struct lock_class_key keys[BTRFS_MAX_LEVEL + 1];
+} btrfs_lockdep_keysets[] = {
+ { .id = BTRFS_ROOT_TREE_OBJECTID, .name_stem = "root" },
+ { .id = BTRFS_EXTENT_TREE_OBJECTID, .name_stem = "extent" },
+ { .id = BTRFS_CHUNK_TREE_OBJECTID, .name_stem = "chunk" },
+ { .id = BTRFS_DEV_TREE_OBJECTID, .name_stem = "dev" },
+ { .id = BTRFS_FS_TREE_OBJECTID, .name_stem = "fs" },
+ { .id = BTRFS_CSUM_TREE_OBJECTID, .name_stem = "csum" },
+ { .id = BTRFS_ORPHAN_OBJECTID, .name_stem = "orphan" },
+ { .id = BTRFS_TREE_LOG_OBJECTID, .name_stem = "log" },
+ { .id = BTRFS_TREE_RELOC_OBJECTID, .name_stem = "treloc" },
+ { .id = BTRFS_DATA_RELOC_TREE_OBJECTID, .name_stem = "dreloc" },
+ { .id = 0, .name_stem = "tree" },
};
+
+void __init btrfs_init_lockdep(void)
+{
+ int i, j;
+
+ /* initialize lockdep class names */
+ for (i = 0; i < ARRAY_SIZE(btrfs_lockdep_keysets); i++) {
+ struct btrfs_lockdep_keyset *ks = &btrfs_lockdep_keysets[i];
+
+ for (j = 0; j < ARRAY_SIZE(ks->names); j++)
+ snprintf(ks->names[j], sizeof(ks->names[j]),
+ "btrfs-%s-%02d", ks->name_stem, j);
+ }
+}
+
+void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb,
+ int level)
+{
+ struct btrfs_lockdep_keyset *ks;
+
+ BUG_ON(level >= ARRAY_SIZE(ks->keys));
+
+ /* find the matching keyset, id 0 is the default entry */
+ for (ks = btrfs_lockdep_keysets; ks->id; ks++)
+ if (ks->id == objectid)
+ break;
+
+ lockdep_set_class_and_name(&eb->lock,
+ &ks->keys[level], ks->names[level]);
+}
+
#endif
/*
@@ -211,13 +256,11 @@ void btrfs_csum_final(u32 crc, char *result)
static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
int verify)
{
- u16 csum_size =
- btrfs_super_csum_size(&root->fs_info->super_copy);
+ u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
char *result = NULL;
unsigned long len;
unsigned long cur_len;
unsigned long offset = BTRFS_CSUM_SIZE;
- char *map_token = NULL;
char *kaddr;
unsigned long map_start;
unsigned long map_len;
@@ -228,8 +271,7 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
len = buf->len - offset;
while (len > 0) {
err = map_private_extent_buffer(buf, offset, 32,
- &map_token, &kaddr,
- &map_start, &map_len, KM_USER0);
+ &kaddr, &map_start, &map_len);
if (err)
return 1;
cur_len = min(len, map_len - (offset - map_start));
@@ -237,7 +279,6 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
crc, cur_len);
len -= cur_len;
offset += cur_len;
- unmap_extent_buffer(buf, map_token, KM_USER0);
}
if (csum_size > sizeof(inline_result)) {
result = kzalloc(csum_size * sizeof(char), GFP_NOFS);
@@ -325,7 +366,8 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
while (1) {
- ret = read_extent_buffer_pages(io_tree, eb, start, 1,
+ ret = read_extent_buffer_pages(io_tree, eb, start,
+ WAIT_COMPLETE,
btree_get_extent, mirror_num);
if (!ret &&
!verify_parent_transid(io_tree, eb, parent_transid))
@@ -494,15 +536,6 @@ static noinline int check_leaf(struct btrfs_root *root,
return 0;
}
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level)
-{
- lockdep_set_class_and_name(&eb->lock,
- &btrfs_eb_class[level],
- btrfs_eb_name[level]);
-}
-#endif
-
static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
struct extent_state *state)
{
@@ -553,7 +586,8 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
}
found_level = btrfs_header_level(eb);
- btrfs_set_buffer_lockdep_class(eb, found_level);
+ btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb),
+ eb, found_level);
ret = csum_tree_block(root, eb, 1);
if (ret) {
@@ -574,11 +608,48 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
end = min_t(u64, eb->len, PAGE_CACHE_SIZE);
end = eb->start + end - 1;
err:
+ if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) {
+ clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags);
+ btree_readahead_hook(root, eb, eb->start, ret);
+ }
+
free_extent_buffer(eb);
out:
return ret;
}
+static int btree_io_failed_hook(struct bio *failed_bio,
+ struct page *page, u64 start, u64 end,
+ u64 mirror_num, struct extent_state *state)
+{
+ struct extent_io_tree *tree;
+ unsigned long len;
+ struct extent_buffer *eb;
+ struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
+
+ tree = &BTRFS_I(page->mapping->host)->io_tree;
+ if (page->private == EXTENT_PAGE_PRIVATE)
+ goto out;
+ if (!page->private)
+ goto out;
+
+ len = page->private >> 2;
+ WARN_ON(len == 0);
+
+ eb = alloc_extent_buffer(tree, start, len, page);
+ if (eb == NULL)
+ goto out;
+
+ if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) {
+ clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags);
+ btree_readahead_hook(root, eb, eb->start, -EIO);
+ }
+ free_extent_buffer(eb);
+
+out:
+ return -EIO; /* we fixed nothing */
+}
+
static void end_workqueue_bio(struct bio *bio, int err)
{
struct end_io_wq *end_io_wq = bio->bi_private;
@@ -874,7 +945,7 @@ static int btree_readpage(struct file *file, struct page *page)
{
struct extent_io_tree *tree;
tree = &BTRFS_I(page->mapping->host)->io_tree;
- return extent_read_full_page(tree, page, btree_get_extent);
+ return extent_read_full_page(tree, page, btree_get_extent, 0);
}
static int btree_releasepage(struct page *page, gfp_t gfp_flags)
@@ -940,11 +1011,43 @@ int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
if (!buf)
return 0;
read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
- buf, 0, 0, btree_get_extent, 0);
+ buf, 0, WAIT_NONE, btree_get_extent, 0);
free_extent_buffer(buf);
return ret;
}
+int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
+ int mirror_num, struct extent_buffer **eb)
+{
+ struct extent_buffer *buf = NULL;
+ struct inode *btree_inode = root->fs_info->btree_inode;
+ struct extent_io_tree *io_tree = &BTRFS_I(btree_inode)->io_tree;
+ int ret;
+
+ buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
+ if (!buf)
+ return 0;
+
+ set_bit(EXTENT_BUFFER_READAHEAD, &buf->bflags);
+
+ ret = read_extent_buffer_pages(io_tree, buf, 0, WAIT_PAGE_LOCK,
+ btree_get_extent, mirror_num);
+ if (ret) {
+ free_extent_buffer(buf);
+ return ret;
+ }
+
+ if (test_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags)) {
+ free_extent_buffer(buf);
+ return -EIO;
+ } else if (extent_buffer_uptodate(io_tree, buf, NULL)) {
+ *eb = buf;
+ } else {
+ free_extent_buffer(buf);
+ }
+ return 0;
+}
+
struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
u64 bytenr, u32 blocksize)
{
@@ -1077,12 +1180,7 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
init_completion(&root->kobj_unregister);
root->defrag_running = 0;
root->root_key.objectid = objectid;
- root->anon_super.s_root = NULL;
- root->anon_super.s_dev = 0;
- INIT_LIST_HEAD(&root->anon_super.s_list);
- INIT_LIST_HEAD(&root->anon_super.s_instances);
- init_rwsem(&root->anon_super.s_umount);
-
+ root->anon_dev = 0;
return 0;
}
@@ -1106,10 +1204,12 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
generation = btrfs_root_generation(&root->root_item);
blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
+ root->commit_root = NULL;
root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
blocksize, generation);
if (!root->node || !btrfs_buffer_uptodate(root->node, generation)) {
free_extent_buffer(root->node);
+ root->node = NULL;
return -EIO;
}
root->commit_root = btrfs_root_node(root);
@@ -1311,7 +1411,7 @@ again:
spin_lock_init(&root->cache_lock);
init_waitqueue_head(&root->cache_wait);
- ret = set_anon_super(&root->anon_super, NULL);
+ ret = get_anon_bdev(&root->anon_dev);
if (ret)
goto fail;
@@ -1548,6 +1648,235 @@ sleep:
return 0;
}
+/*
+ * this will find the highest generation in the array of
+ * root backups. The index of the highest array is returned,
+ * or -1 if we can't find anything.
+ *
+ * We check to make sure the array is valid by comparing the
+ * generation of the latest root in the array with the generation
+ * in the super block. If they don't match we pitch it.
+ */
+static int find_newest_super_backup(struct btrfs_fs_info *info, u64 newest_gen)
+{
+ u64 cur;
+ int newest_index = -1;
+ struct btrfs_root_backup *root_backup;
+ int i;
+
+ for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) {
+ root_backup = info->super_copy->super_roots + i;
+ cur = btrfs_backup_tree_root_gen(root_backup);
+ if (cur == newest_gen)
+ newest_index = i;
+ }
+
+ /* check to see if we actually wrapped around */
+ if (newest_index == BTRFS_NUM_BACKUP_ROOTS - 1) {
+ root_backup = info->super_copy->super_roots;
+ cur = btrfs_backup_tree_root_gen(root_backup);
+ if (cur == newest_gen)
+ newest_index = 0;
+ }
+ return newest_index;
+}
+
+
+/*
+ * find the oldest backup so we know where to store new entries
+ * in the backup array. This will set the backup_root_index
+ * field in the fs_info struct
+ */
+static void find_oldest_super_backup(struct btrfs_fs_info *info,
+ u64 newest_gen)
+{
+ int newest_index = -1;
+
+ newest_index = find_newest_super_backup(info, newest_gen);
+ /* if there was garbage in there, just move along */
+ if (newest_index == -1) {
+ info->backup_root_index = 0;
+ } else {
+ info->backup_root_index = (newest_index + 1) % BTRFS_NUM_BACKUP_ROOTS;
+ }
+}
+
+/*
+ * copy all the root pointers into the super backup array.
+ * this will bump the backup pointer by one when it is
+ * done
+ */
+static void backup_super_roots(struct btrfs_fs_info *info)
+{
+ int next_backup;
+ struct btrfs_root_backup *root_backup;
+ int last_backup;
+
+ next_backup = info->backup_root_index;
+ last_backup = (next_backup + BTRFS_NUM_BACKUP_ROOTS - 1) %
+ BTRFS_NUM_BACKUP_ROOTS;
+
+ /*
+ * just overwrite the last backup if we're at the same generation
+ * this happens only at umount
+ */
+ root_backup = info->super_for_commit->super_roots + last_backup;
+ if (btrfs_backup_tree_root_gen(root_backup) ==
+ btrfs_header_generation(info->tree_root->node))
+ next_backup = last_backup;
+
+ root_backup = info->super_for_commit->super_roots + next_backup;
+
+ /*
+ * make sure all of our padding and empty slots get zero filled
+ * regardless of which ones we use today
+ */
+ memset(root_backup, 0, sizeof(*root_backup));
+
+ info->backup_root_index = (next_backup + 1) % BTRFS_NUM_BACKUP_ROOTS;
+
+ btrfs_set_backup_tree_root(root_backup, info->tree_root->node->start);
+ btrfs_set_backup_tree_root_gen(root_backup,
+ btrfs_header_generation(info->tree_root->node));
+
+ btrfs_set_backup_tree_root_level(root_backup,
+ btrfs_header_level(info->tree_root->node));
+
+ btrfs_set_backup_chunk_root(root_backup, info->chunk_root->node->start);
+ btrfs_set_backup_chunk_root_gen(root_backup,
+ btrfs_header_generation(info->chunk_root->node));
+ btrfs_set_backup_chunk_root_level(root_backup,
+ btrfs_header_level(info->chunk_root->node));
+
+ btrfs_set_backup_extent_root(root_backup, info->extent_root->node->start);
+ btrfs_set_backup_extent_root_gen(root_backup,
+ btrfs_header_generation(info->extent_root->node));
+ btrfs_set_backup_extent_root_level(root_backup,
+ btrfs_header_level(info->extent_root->node));
+
+ /*
+ * we might commit during log recovery, which happens before we set
+ * the fs_root. Make sure it is valid before we fill it in.
+ */
+ if (info->fs_root && info->fs_root->node) {
+ btrfs_set_backup_fs_root(root_backup,
+ info->fs_root->node->start);
+ btrfs_set_backup_fs_root_gen(root_backup,
+ btrfs_header_generation(info->fs_root->node));
+ btrfs_set_backup_fs_root_level(root_backup,
+ btrfs_header_level(info->fs_root->node));
+ }
+
+ btrfs_set_backup_dev_root(root_backup, info->dev_root->node->start);
+ btrfs_set_backup_dev_root_gen(root_backup,
+ btrfs_header_generation(info->dev_root->node));
+ btrfs_set_backup_dev_root_level(root_backup,
+ btrfs_header_level(info->dev_root->node));
+
+ btrfs_set_backup_csum_root(root_backup, info->csum_root->node->start);
+ btrfs_set_backup_csum_root_gen(root_backup,
+ btrfs_header_generation(info->csum_root->node));
+ btrfs_set_backup_csum_root_level(root_backup,
+ btrfs_header_level(info->csum_root->node));
+
+ btrfs_set_backup_total_bytes(root_backup,
+ btrfs_super_total_bytes(info->super_copy));
+ btrfs_set_backup_bytes_used(root_backup,
+ btrfs_super_bytes_used(info->super_copy));
+ btrfs_set_backup_num_devices(root_backup,
+ btrfs_super_num_devices(info->super_copy));
+
+ /*
+ * if we don't copy this out to the super_copy, it won't get remembered
+ * for the next commit
+ */
+ memcpy(&info->super_copy->super_roots,
+ &info->super_for_commit->super_roots,
+ sizeof(*root_backup) * BTRFS_NUM_BACKUP_ROOTS);
+}
+
+/*
+ * this copies info out of the root backup array and back into
+ * the in-memory super block. It is meant to help iterate through
+ * the array, so you send it the number of backups you've already
+ * tried and the last backup index you used.
+ *
+ * this returns -1 when it has tried all the backups
+ */
+static noinline int next_root_backup(struct btrfs_fs_info *info,
+ struct btrfs_super_block *super,
+ int *num_backups_tried, int *backup_index)
+{
+ struct btrfs_root_backup *root_backup;
+ int newest = *backup_index;
+
+ if (*num_backups_tried == 0) {
+ u64 gen = btrfs_super_generation(super);
+
+ newest = find_newest_super_backup(info, gen);
+ if (newest == -1)
+ return -1;
+
+ *backup_index = newest;
+ *num_backups_tried = 1;
+ } else if (*num_backups_tried == BTRFS_NUM_BACKUP_ROOTS) {
+ /* we've tried all the backups, all done */
+ return -1;
+ } else {
+ /* jump to the next oldest backup */
+ newest = (*backup_index + BTRFS_NUM_BACKUP_ROOTS - 1) %
+ BTRFS_NUM_BACKUP_ROOTS;
+ *backup_index = newest;
+ *num_backups_tried += 1;
+ }
+ root_backup = super->super_roots + newest;
+
+ btrfs_set_super_generation(super,
+ btrfs_backup_tree_root_gen(root_backup));
+ btrfs_set_super_root(super, btrfs_backup_tree_root(root_backup));
+ btrfs_set_super_root_level(super,
+ btrfs_backup_tree_root_level(root_backup));
+ btrfs_set_super_bytes_used(super, btrfs_backup_bytes_used(root_backup));
+
+ /*
+ * fixme: the total bytes and num_devices need to match or we should
+ * need a fsck
+ */
+ btrfs_set_super_total_bytes(super, btrfs_backup_total_bytes(root_backup));
+ btrfs_set_super_num_devices(super, btrfs_backup_num_devices(root_backup));
+ return 0;
+}
+
+/* helper to cleanup tree roots */
+static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)
+{
+ free_extent_buffer(info->tree_root->node);
+ free_extent_buffer(info->tree_root->commit_root);
+ free_extent_buffer(info->dev_root->node);
+ free_extent_buffer(info->dev_root->commit_root);
+ free_extent_buffer(info->extent_root->node);
+ free_extent_buffer(info->extent_root->commit_root);
+ free_extent_buffer(info->csum_root->node);
+ free_extent_buffer(info->csum_root->commit_root);
+
+ info->tree_root->node = NULL;
+ info->tree_root->commit_root = NULL;
+ info->dev_root->node = NULL;
+ info->dev_root->commit_root = NULL;
+ info->extent_root->node = NULL;
+ info->extent_root->commit_root = NULL;
+ info->csum_root->node = NULL;
+ info->csum_root->commit_root = NULL;
+
+ if (chunk_root) {
+ free_extent_buffer(info->chunk_root->node);
+ free_extent_buffer(info->chunk_root->commit_root);
+ info->chunk_root->node = NULL;
+ info->chunk_root->commit_root = NULL;
+ }
+}
+
+
struct btrfs_root *open_ctree(struct super_block *sb,
struct btrfs_fs_devices *fs_devices,
char *options)
@@ -1575,6 +1904,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
int ret;
int err = -EINVAL;
+ int num_backups_tried = 0;
+ int backup_index = 0;
struct btrfs_super_block *disk_super;
@@ -1603,7 +1934,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
goto fail_bdi;
}
- fs_info->btree_inode->i_mapping->flags &= ~__GFP_FS;
+ mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
INIT_LIST_HEAD(&fs_info->trans_list);
@@ -1619,6 +1950,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
spin_lock_init(&fs_info->fs_roots_radix_lock);
spin_lock_init(&fs_info->delayed_iput_lock);
spin_lock_init(&fs_info->defrag_inodes_lock);
+ spin_lock_init(&fs_info->free_chunk_lock);
mutex_init(&fs_info->reloc_mutex);
init_completion(&fs_info->kobj_unregister);
@@ -1636,8 +1968,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
btrfs_init_block_rsv(&fs_info->trans_block_rsv);
btrfs_init_block_rsv(&fs_info->chunk_block_rsv);
btrfs_init_block_rsv(&fs_info->empty_block_rsv);
- INIT_LIST_HEAD(&fs_info->durable_block_rsv_list);
- mutex_init(&fs_info->durable_block_rsv_mutex);
+ btrfs_init_block_rsv(&fs_info->delayed_block_rsv);
atomic_set(&fs_info->nr_async_submits, 0);
atomic_set(&fs_info->async_delalloc_pages, 0);
atomic_set(&fs_info->async_submit_draining, 0);
@@ -1648,6 +1979,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
fs_info->metadata_ratio = 0;
fs_info->defrag_inodes = RB_ROOT;
fs_info->trans_no_join = 0;
+ fs_info->free_chunk_space = 0;
+
+ /* readahead state */
+ INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT);
+ spin_lock_init(&fs_info->reada_lock);
fs_info->thread_pool_size = min_t(unsigned long,
num_online_cpus() + 2, 8);
@@ -1676,7 +2012,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
sb->s_bdi = &fs_info->bdi;
fs_info->btree_inode->i_ino = BTRFS_BTREE_INODE_OBJECTID;
- fs_info->btree_inode->i_nlink = 1;
+ set_nlink(fs_info->btree_inode, 1);
/*
* we set the i_size on the btree inode to the max possible int.
* the real end of the address space is determined by all of
@@ -1737,14 +2073,14 @@ struct btrfs_root *open_ctree(struct super_block *sb,
goto fail_alloc;
}
- memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy));
- memcpy(&fs_info->super_for_commit, &fs_info->super_copy,
- sizeof(fs_info->super_for_commit));
+ memcpy(fs_info->super_copy, bh->b_data, sizeof(*fs_info->super_copy));
+ memcpy(fs_info->super_for_commit, fs_info->super_copy,
+ sizeof(*fs_info->super_for_commit));
brelse(bh);
- memcpy(fs_info->fsid, fs_info->super_copy.fsid, BTRFS_FSID_SIZE);
+ memcpy(fs_info->fsid, fs_info->super_copy->fsid, BTRFS_FSID_SIZE);
- disk_super = &fs_info->super_copy;
+ disk_super = fs_info->super_copy;
if (!btrfs_super_root(disk_super))
goto fail_alloc;
@@ -1754,6 +2090,13 @@ struct btrfs_root *open_ctree(struct super_block *sb,
btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY);
/*
+ * run through our array of backup supers and setup
+ * our ring pointer to the oldest one
+ */
+ generation = btrfs_super_generation(disk_super);
+ find_oldest_super_backup(fs_info, generation);
+
+ /*
* In the long term, we'll store the compression type in the super
* block, and it'll be used for per file compression control.
*/
@@ -1807,6 +2150,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
fs_info->thread_pool_size),
&fs_info->generic_worker);
+ btrfs_init_workers(&fs_info->caching_workers, "cache",
+ 2, &fs_info->generic_worker);
+
/* a higher idle thresh on the submit workers makes it much more
* likely that bios will be send down in a sane order to the
* devices
@@ -1838,6 +2184,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
btrfs_init_workers(&fs_info->delayed_workers, "delayed-meta",
fs_info->thread_pool_size,
&fs_info->generic_worker);
+ btrfs_init_workers(&fs_info->readahead_workers, "readahead",
+ fs_info->thread_pool_size,
+ &fs_info->generic_worker);
/*
* endios are largely parallel and should have a very
@@ -1848,6 +2197,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
fs_info->endio_write_workers.idle_thresh = 2;
fs_info->endio_meta_write_workers.idle_thresh = 2;
+ fs_info->readahead_workers.idle_thresh = 2;
btrfs_start_workers(&fs_info->workers, 1);
btrfs_start_workers(&fs_info->generic_worker, 1);
@@ -1860,6 +2210,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
btrfs_start_workers(&fs_info->endio_write_workers, 1);
btrfs_start_workers(&fs_info->endio_freespace_worker, 1);
btrfs_start_workers(&fs_info->delayed_workers, 1);
+ btrfs_start_workers(&fs_info->caching_workers, 1);
+ btrfs_start_workers(&fs_info->readahead_workers, 1);
fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
@@ -1906,7 +2258,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
if (!test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) {
printk(KERN_WARNING "btrfs: failed to read chunk root on %s\n",
sb->s_id);
- goto fail_chunk_root;
+ goto fail_tree_roots;
}
btrfs_set_root_node(&chunk_root->root_item, chunk_root->node);
chunk_root->commit_root = btrfs_root_node(chunk_root);
@@ -1921,11 +2273,12 @@ struct btrfs_root *open_ctree(struct super_block *sb,
if (ret) {
printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n",
sb->s_id);
- goto fail_chunk_root;
+ goto fail_tree_roots;
}
btrfs_close_extra_devices(fs_devices);
+retry_root_backup:
blocksize = btrfs_level_size(tree_root,
btrfs_super_root_level(disk_super));
generation = btrfs_super_generation(disk_super);
@@ -1933,32 +2286,33 @@ struct btrfs_root *open_ctree(struct super_block *sb,
tree_root->node = read_tree_block(tree_root,
btrfs_super_root(disk_super),
blocksize, generation);
- if (!tree_root->node)
- goto fail_chunk_root;
- if (!test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) {
+ if (!tree_root->node ||
+ !test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) {
printk(KERN_WARNING "btrfs: failed to read tree root on %s\n",
sb->s_id);
- goto fail_tree_root;
+
+ goto recovery_tree_root;
}
+
btrfs_set_root_node(&tree_root->root_item, tree_root->node);
tree_root->commit_root = btrfs_root_node(tree_root);
ret = find_and_setup_root(tree_root, fs_info,
BTRFS_EXTENT_TREE_OBJECTID, extent_root);
if (ret)
- goto fail_tree_root;
+ goto recovery_tree_root;
extent_root->track_dirty = 1;
ret = find_and_setup_root(tree_root, fs_info,
BTRFS_DEV_TREE_OBJECTID, dev_root);
if (ret)
- goto fail_extent_root;
+ goto recovery_tree_root;
dev_root->track_dirty = 1;
ret = find_and_setup_root(tree_root, fs_info,
BTRFS_CSUM_TREE_OBJECTID, csum_root);
if (ret)
- goto fail_dev_root;
+ goto recovery_tree_root;
csum_root->track_dirty = 1;
@@ -2091,22 +2445,13 @@ fail_cleaner:
fail_block_groups:
btrfs_free_block_groups(fs_info);
- free_extent_buffer(csum_root->node);
- free_extent_buffer(csum_root->commit_root);
-fail_dev_root:
- free_extent_buffer(dev_root->node);
- free_extent_buffer(dev_root->commit_root);
-fail_extent_root:
- free_extent_buffer(extent_root->node);
- free_extent_buffer(extent_root->commit_root);
-fail_tree_root:
- free_extent_buffer(tree_root->node);
- free_extent_buffer(tree_root->commit_root);
-fail_chunk_root:
- free_extent_buffer(chunk_root->node);
- free_extent_buffer(chunk_root->commit_root);
+
+fail_tree_roots:
+ free_root_pointers(fs_info, 1);
+
fail_sb_buffer:
btrfs_stop_workers(&fs_info->generic_worker);
+ btrfs_stop_workers(&fs_info->readahead_workers);
btrfs_stop_workers(&fs_info->fixup_workers);
btrfs_stop_workers(&fs_info->delalloc_workers);
btrfs_stop_workers(&fs_info->workers);
@@ -2117,8 +2462,8 @@ fail_sb_buffer:
btrfs_stop_workers(&fs_info->endio_freespace_worker);
btrfs_stop_workers(&fs_info->submit_workers);
btrfs_stop_workers(&fs_info->delayed_workers);
+ btrfs_stop_workers(&fs_info->caching_workers);
fail_alloc:
- kfree(fs_info->delayed_root);
fail_iput:
invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
iput(fs_info->btree_inode);
@@ -2130,13 +2475,27 @@ fail_bdi:
fail_srcu:
cleanup_srcu_struct(&fs_info->subvol_srcu);
fail:
- kfree(extent_root);
- kfree(tree_root);
- kfree(fs_info);
- kfree(chunk_root);
- kfree(dev_root);
- kfree(csum_root);
+ free_fs_info(fs_info);
return ERR_PTR(err);
+
+recovery_tree_root:
+
+ if (!btrfs_test_opt(tree_root, RECOVERY))
+ goto fail_tree_roots;
+
+ free_root_pointers(fs_info, 0);
+
+ /* don't use the log in recovery mode, it won't be valid */
+ btrfs_set_super_log_root(disk_super, 0);
+
+ /* we can't trust the free space cache either */
+ btrfs_set_opt(fs_info->mount_opt, CLEAR_CACHE);
+
+ ret = next_root_backup(fs_info, fs_info->super_copy,
+ &num_backups_tried, &backup_index);
+ if (ret == -1)
+ goto fail_block_groups;
+ goto retry_root_backup;
}
static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
@@ -2304,10 +2663,11 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
int total_errors = 0;
u64 flags;
- max_errors = btrfs_super_num_devices(&root->fs_info->super_copy) - 1;
+ max_errors = btrfs_super_num_devices(root->fs_info->super_copy) - 1;
do_barriers = !btrfs_test_opt(root, NOBARRIER);
+ backup_super_roots(root->fs_info);
- sb = &root->fs_info->super_for_commit;
+ sb = root->fs_info->super_for_commit;
dev_item = &sb->dev_item;
mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
@@ -2393,10 +2753,8 @@ static void free_fs_root(struct btrfs_root *root)
{
iput(root->cache_inode);
WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
- if (root->anon_super.s_dev) {
- down_write(&root->anon_super.s_umount);
- kill_anon_super(&root->anon_super);
- }
+ if (root->anon_dev)
+ free_anon_bdev(root->anon_dev);
free_extent_buffer(root->node);
free_extent_buffer(root->commit_root);
kfree(root->free_ino_ctl);
@@ -2513,8 +2871,6 @@ int close_ctree(struct btrfs_root *root)
/* clear out the rbtree of defraggable inodes */
btrfs_run_defrag_inodes(root->fs_info);
- btrfs_put_block_group_cache(fs_info);
-
/*
* Here come 2 situations when btrfs is broken to flip readonly:
*
@@ -2540,6 +2896,8 @@ int close_ctree(struct btrfs_root *root)
printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
}
+ btrfs_put_block_group_cache(fs_info);
+
kthread_stop(root->fs_info->transaction_kthread);
kthread_stop(root->fs_info->cleaner_kthread);
@@ -2571,7 +2929,6 @@ int close_ctree(struct btrfs_root *root)
del_fs_roots(fs_info);
iput(fs_info->btree_inode);
- kfree(fs_info->delayed_root);
btrfs_stop_workers(&fs_info->generic_worker);
btrfs_stop_workers(&fs_info->fixup_workers);
@@ -2584,6 +2941,8 @@ int close_ctree(struct btrfs_root *root)
btrfs_stop_workers(&fs_info->endio_freespace_worker);
btrfs_stop_workers(&fs_info->submit_workers);
btrfs_stop_workers(&fs_info->delayed_workers);
+ btrfs_stop_workers(&fs_info->caching_workers);
+ btrfs_stop_workers(&fs_info->readahead_workers);
btrfs_close_devices(fs_info->fs_devices);
btrfs_mapping_tree_free(&fs_info->mapping_tree);
@@ -2591,12 +2950,7 @@ int close_ctree(struct btrfs_root *root)
bdi_destroy(&fs_info->bdi);
cleanup_srcu_struct(&fs_info->subvol_srcu);
- kfree(fs_info->extent_root);
- kfree(fs_info->tree_root);
- kfree(fs_info->chunk_root);
- kfree(fs_info->dev_root);
- kfree(fs_info->csum_root);
- kfree(fs_info);
+ free_fs_info(fs_info);
return 0;
}
@@ -2702,7 +3056,8 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
return ret;
}
-int btree_lock_page_hook(struct page *page)
+static int btree_lock_page_hook(struct page *page, void *data,
+ void (*flush_fn)(void *))
{
struct inode *inode = page->mapping->host;
struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -2719,7 +3074,10 @@ int btree_lock_page_hook(struct page *page)
if (!eb)
goto out;
- btrfs_tree_lock(eb);
+ if (!btrfs_try_tree_write_lock(eb)) {
+ flush_fn(data);
+ btrfs_tree_lock(eb);
+ }
btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
@@ -2734,7 +3092,10 @@ int btree_lock_page_hook(struct page *page)
btrfs_tree_unlock(eb);
free_extent_buffer(eb);
out:
- lock_page(page);
+ if (!trylock_page(page)) {
+ flush_fn(data);
+ lock_page(page);
+ }
return 0;
}
@@ -3090,6 +3451,7 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root)
static struct extent_io_ops btree_extent_io_ops = {
.write_cache_pages_lock_hook = btree_lock_page_hook,
.readpage_end_io_hook = btree_readpage_end_io_hook,
+ .readpage_io_failed_hook = btree_io_failed_hook,
.submit_bio_hook = btree_submit_bio_hook,
/* note we're sharing with inode.c for the merge bio hook */
.merge_bio_hook = btrfs_merge_bio_hook,