diff options
Diffstat (limited to 'fs/btrfs/tree-log.c')
-rw-r--r-- | fs/btrfs/tree-log.c | 233 |
1 files changed, 152 insertions, 81 deletions
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 517d0ccb351e..528cae123dc9 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -27,6 +27,7 @@ #include "backref.h" #include "hash.h" #include "compression.h" +#include "qgroup.h" /* magic values for the inode_only field in btrfs_log_inode: * @@ -680,6 +681,21 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, ins.type = BTRFS_EXTENT_ITEM_KEY; offset = key->offset - btrfs_file_extent_offset(eb, item); + /* + * Manually record dirty extent, as here we did a shallow + * file extent item copy and skip normal backref update, + * but modifying extent tree all by ourselves. + * So need to manually record dirty extent for qgroup, + * as the owner of the file extent changed from log tree + * (doesn't affect qgroup) to fs/file tree(affects qgroup) + */ + ret = btrfs_qgroup_insert_dirty_extent(trans, root->fs_info, + btrfs_file_extent_disk_bytenr(eb, item), + btrfs_file_extent_disk_num_bytes(eb, item), + GFP_NOFS); + if (ret < 0) + goto out; + if (ins.objectid > 0) { u64 csum_start; u64 csum_end; @@ -2330,7 +2346,7 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, break; /* for regular files, make sure corresponding - * orhpan item exist. extents past the new EOF + * orphan item exist. extents past the new EOF * will be truncated later by orphan cleanup. */ if (S_ISREG(mode)) { @@ -2422,8 +2438,8 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, root_owner = btrfs_header_owner(parent); next = btrfs_find_create_tree_block(root, bytenr); - if (!next) - return -ENOMEM; + if (IS_ERR(next)) + return PTR_ERR(next); if (*level == 1) { ret = wc->process_func(root, next, wc, ptr_gen); @@ -2757,7 +2773,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, while (1) { int batch = atomic_read(&root->log_batch); /* when we're on an ssd, just kick the log commit out */ - if (!btrfs_test_opt(root, SSD) && + if (!btrfs_test_opt(root->fs_info, SSD) && test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) { mutex_unlock(&root->log_mutex); schedule_timeout_uninterruptible(1); @@ -2788,7 +2804,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark); if (ret) { blk_finish_plug(&plug); - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); btrfs_free_logged_extents(log, log_transid); btrfs_set_log_full_commit(root->fs_info, trans); mutex_unlock(&root->log_mutex); @@ -2807,7 +2823,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, */ mutex_unlock(&root->log_mutex); - btrfs_init_log_ctx(&root_log_ctx); + btrfs_init_log_ctx(&root_log_ctx, NULL); mutex_lock(&log_root_tree->log_mutex); atomic_inc(&log_root_tree->log_batch); @@ -2838,7 +2854,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, btrfs_set_log_full_commit(root->fs_info, trans); if (ret != -ENOSPC) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); mutex_unlock(&log_root_tree->log_mutex); goto out; } @@ -2851,6 +2867,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) { blk_finish_plug(&plug); + list_del_init(&root_log_ctx.list); mutex_unlock(&log_root_tree->log_mutex); ret = root_log_ctx.log_ret; goto out; @@ -2898,7 +2915,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, blk_finish_plug(&plug); if (ret) { btrfs_set_log_full_commit(root->fs_info, trans); - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); btrfs_free_logged_extents(log, log_transid); mutex_unlock(&log_root_tree->log_mutex); goto out_wake_log_root; @@ -2934,7 +2951,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, ret = write_ctree_super(trans, root->fs_info->tree_root, 1); if (ret) { btrfs_set_log_full_commit(root->fs_info, trans); - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out_wake_log_root; } @@ -2991,7 +3008,7 @@ static void free_log_tree(struct btrfs_trans_handle *trans, ret = walk_log_tree(trans, log, &wc); /* I don't think this can happen but just in case */ if (ret) - btrfs_abort_transaction(trans, log, ret); + btrfs_abort_transaction(trans, ret); while (1) { ret = find_first_extent_bit(&log->dirty_log_pages, @@ -3001,7 +3018,7 @@ static void free_log_tree(struct btrfs_trans_handle *trans, break; clear_extent_bits(&log->dirty_log_pages, start, end, - EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS); + EXTENT_DIRTY | EXTENT_NEW); } /* @@ -3160,7 +3177,7 @@ out_unlock: btrfs_set_log_full_commit(root->fs_info, trans); ret = 0; } else if (ret < 0) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); btrfs_end_log_trans(root); @@ -3193,7 +3210,7 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, btrfs_set_log_full_commit(root->fs_info, trans); ret = 0; } else if (ret < 0 && ret != -ENOENT) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); btrfs_end_log_trans(root); return ret; @@ -3944,7 +3961,7 @@ static int wait_ordered_extents(struct btrfs_trans_handle *trans, * i_mapping flags, so that the next fsync won't get * an outdated io error too. */ - btrfs_inode_check_errors(inode); + filemap_check_errors(inode->i_mapping); *ordered_io_error = true; break; } @@ -4141,6 +4158,7 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, INIT_LIST_HEAD(&extents); + down_write(&BTRFS_I(inode)->dio_sem); write_lock(&tree->lock); test_gen = root->fs_info->last_trans_committed; @@ -4169,13 +4187,20 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, } list_sort(NULL, &extents, extent_cmp); + btrfs_get_logged_extents(inode, logged_list, start, end); /* - * Collect any new ordered extents within the range. This is to - * prevent logging file extent items without waiting for the disk - * location they point to being written. We do this only to deal - * with races against concurrent lockless direct IO writes. + * Some ordered extents started by fsync might have completed + * before we could collect them into the list logged_list, which + * means they're gone, not in our logged_list nor in the inode's + * ordered tree. We want the application/user space to know an + * error happened while attempting to persist file data so that + * it can take proper action. If such error happened, we leave + * without writing to the log tree and the fsync must report the + * file data write error and not commit the current transaction. */ - btrfs_get_logged_extents(inode, logged_list, start, end); + ret = filemap_check_errors(inode->i_mapping); + if (ret) + ctx->io_err = ret; process: while (!list_empty(&extents)) { em = list_entry(extents.next, struct extent_map, list); @@ -4202,6 +4227,7 @@ process: } WARN_ON(!list_empty(&extents)); write_unlock(&tree->lock); + up_write(&BTRFS_I(inode)->dio_sem); btrfs_release_path(path); return ret; @@ -4460,7 +4486,8 @@ static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans, static int btrfs_check_ref_name_override(struct extent_buffer *eb, const int slot, const struct btrfs_key *key, - struct inode *inode) + struct inode *inode, + u64 *other_ino) { int ret; struct btrfs_path *search_path; @@ -4519,7 +4546,16 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb, search_path, parent, name, this_name_len, 0); if (di && !IS_ERR(di)) { - ret = 1; + struct btrfs_key di_key; + + btrfs_dir_item_key_to_cpu(search_path->nodes[0], + di, &di_key); + if (di_key.type == BTRFS_INODE_ITEM_KEY) { + ret = 1; + *other_ino = di_key.objectid; + } else { + ret = -EAGAIN; + } goto out; } else if (IS_ERR(di)) { ret = PTR_ERR(di); @@ -4623,23 +4659,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, mutex_lock(&BTRFS_I(inode)->log_mutex); /* - * Collect ordered extents only if we are logging data. This is to - * ensure a subsequent request to log this inode in LOG_INODE_ALL mode - * will process the ordered extents if they still exists at the time, - * because when we collect them we test and set for the flag - * BTRFS_ORDERED_LOGGED to prevent multiple log requests to process the - * same ordered extents. The consequence for the LOG_INODE_ALL log mode - * not processing the ordered extents is that we end up logging the - * corresponding file extent items, based on the extent maps in the - * inode's extent_map_tree's modified_list, without logging the - * respective checksums (since the may still be only attached to the - * ordered extents and have not been inserted in the csum tree by - * btrfs_finish_ordered_io() yet). - */ - if (inode_only == LOG_INODE_ALL) - btrfs_get_logged_extents(inode, &logged_list, start, end); - - /* * a brute force approach to making sure we get the most uptodate * copies of everything. */ @@ -4711,6 +4730,10 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, ins_nr = 0; ret = btrfs_search_forward(root, &min_key, path, trans->transid); + if (ret < 0) { + err = ret; + goto out_unlock; + } if (ret != 0) break; again: @@ -4726,16 +4749,72 @@ again: if ((min_key.type == BTRFS_INODE_REF_KEY || min_key.type == BTRFS_INODE_EXTREF_KEY) && BTRFS_I(inode)->generation == trans->transid) { + u64 other_ino = 0; + ret = btrfs_check_ref_name_override(path->nodes[0], path->slots[0], - &min_key, inode); + &min_key, inode, + &other_ino); if (ret < 0) { err = ret; goto out_unlock; - } else if (ret > 0) { - err = 1; - btrfs_set_log_full_commit(root->fs_info, trans); - goto out_unlock; + } else if (ret > 0 && ctx && + other_ino != btrfs_ino(ctx->inode)) { + struct btrfs_key inode_key; + struct inode *other_inode; + + if (ins_nr > 0) { + ins_nr++; + } else { + ins_nr = 1; + ins_start_slot = path->slots[0]; + } + ret = copy_items(trans, inode, dst_path, path, + &last_extent, ins_start_slot, + ins_nr, inode_only, + logged_isize); + if (ret < 0) { + err = ret; + goto out_unlock; + } + ins_nr = 0; + btrfs_release_path(path); + inode_key.objectid = other_ino; + inode_key.type = BTRFS_INODE_ITEM_KEY; + inode_key.offset = 0; + other_inode = btrfs_iget(root->fs_info->sb, + &inode_key, root, + NULL); + /* + * If the other inode that had a conflicting dir + * entry was deleted in the current transaction, + * we don't need to do more work nor fallback to + * a transaction commit. + */ + if (IS_ERR(other_inode) && + PTR_ERR(other_inode) == -ENOENT) { + goto next_key; + } else if (IS_ERR(other_inode)) { + err = PTR_ERR(other_inode); + goto out_unlock; + } + /* + * We are safe logging the other inode without + * acquiring its i_mutex as long as we log with + * the LOG_INODE_EXISTS mode. We're safe against + * concurrent renames of the other inode as well + * because during a rename we pin the log and + * update the log with the new name before we + * unpin it. + */ + err = btrfs_log_inode(trans, root, other_inode, + LOG_INODE_EXISTS, + 0, LLONG_MAX, ctx); + iput(other_inode); + if (err) + goto out_unlock; + else + goto next_key; } } @@ -4803,7 +4882,7 @@ next_slot: ins_nr = 0; } btrfs_release_path(path); - +next_key: if (min_key.offset < (u64)-1) { min_key.offset++; } else if (min_key.type < max_key.type) { @@ -4846,21 +4925,6 @@ log_extents: goto out_unlock; } if (fast_search) { - /* - * Some ordered extents started by fsync might have completed - * before we collected the ordered extents in logged_list, which - * means they're gone, not in our logged_list nor in the inode's - * ordered tree. We want the application/user space to know an - * error happened while attempting to persist file data so that - * it can take proper action. If such error happened, we leave - * without writing to the log tree and the fsync must report the - * file data write error and not commit the current transaction. - */ - err = btrfs_inode_check_errors(inode); - if (err) { - ctx->io_err = err; - goto out_unlock; - } ret = btrfs_log_changed_extents(trans, root, inode, dst_path, &logged_list, ctx, start, end); if (ret) { @@ -4937,7 +5001,7 @@ out_unlock: * the actual unlink operation, so if we do this check before a concurrent task * sets last_unlink_trans it means we've logged a consistent version/state of * all the inode items, otherwise we are not sure and must do a transaction - * commit (the concurrent task migth have only updated last_unlink_trans before + * commit (the concurrent task might have only updated last_unlink_trans before * we logged the inode or it might have also done the unlink). */ static bool btrfs_must_commit_transaction(struct btrfs_trans_handle *trans, @@ -4988,7 +5052,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, goto out; if (!S_ISDIR(inode->i_mode)) { - if (!parent || d_really_is_negative(parent) || sb != d_inode(parent)->i_sb) + if (!parent || d_really_is_negative(parent) || sb != parent->d_sb) goto out; inode = d_inode(parent); } @@ -4996,7 +5060,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, while (1) { /* * If we are logging a directory then we start with our inode, - * not our parents inode, so we need to skipp setting the + * not our parent's inode, so we need to skip setting the * logged_trans so that further down in the log code we don't * think this inode has already been logged. */ @@ -5009,11 +5073,15 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, break; } - if (!parent || d_really_is_negative(parent) || sb != d_inode(parent)->i_sb) + if (!parent || d_really_is_negative(parent) || sb != parent->d_sb) break; - if (IS_ROOT(parent)) + if (IS_ROOT(parent)) { + inode = d_inode(parent); + if (btrfs_must_commit_transaction(trans, inode)) + ret = 1; break; + } parent = dget_parent(parent); dput(old_parent); @@ -5158,7 +5226,7 @@ process_leaf: } ctx->log_new_dentries = false; - if (type == BTRFS_FT_DIR) + if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK) log_mode = LOG_INODE_ALL; btrfs_release_path(path); ret = btrfs_log_inode(trans, root, di_inode, @@ -5278,11 +5346,16 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, if (IS_ERR(dir_inode)) continue; + if (ctx) + ctx->log_new_dentries = false; ret = btrfs_log_inode(trans, root, dir_inode, LOG_INODE_ALL, 0, LLONG_MAX, ctx); if (!ret && btrfs_must_commit_transaction(trans, dir_inode)) ret = 1; + if (!ret && ctx && ctx->log_new_dentries) + ret = log_new_dir_dentries(trans, root, + dir_inode, ctx); iput(dir_inode); if (ret) goto out; @@ -5319,7 +5392,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, sb = inode->i_sb; - if (btrfs_test_opt(root, NOTREELOG)) { + if (btrfs_test_opt(root->fs_info, NOTREELOG)) { ret = 1; goto end_no_trans; } @@ -5375,7 +5448,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, log_dentries = true; /* - * On unlink we must make sure all our current and old parent directores + * On unlink we must make sure all our current and old parent directory * inodes are fully logged. This is to prevent leaving dangling * directory index entries in directories that were our parents but are * not anymore. Not doing this results in old parent directory being @@ -5422,7 +5495,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, } while (1) { - if (!parent || d_really_is_negative(parent) || sb != d_inode(parent)->i_sb) + if (!parent || d_really_is_negative(parent) || sb != parent->d_sb) break; inode = d_inode(parent); @@ -5506,7 +5579,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) if (!path) return -ENOMEM; - fs_info->log_root_recovering = 1; + set_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags); trans = btrfs_start_transaction(fs_info->tree_root, 0); if (IS_ERR(trans)) { @@ -5519,8 +5592,8 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) ret = walk_log_tree(trans, log_root_tree, &wc); if (ret) { - btrfs_std_error(fs_info, ret, "Failed to pin buffers while " - "recovering log root tree."); + btrfs_handle_fs_error(fs_info, ret, + "Failed to pin buffers while recovering log root tree."); goto error; } @@ -5533,7 +5606,7 @@ again: ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); if (ret < 0) { - btrfs_std_error(fs_info, ret, + btrfs_handle_fs_error(fs_info, ret, "Couldn't find tree log root."); goto error; } @@ -5551,7 +5624,7 @@ again: log = btrfs_read_fs_root(log_root_tree, &found_key); if (IS_ERR(log)) { ret = PTR_ERR(log); - btrfs_std_error(fs_info, ret, + btrfs_handle_fs_error(fs_info, ret, "Couldn't read tree log root."); goto error; } @@ -5566,8 +5639,8 @@ again: free_extent_buffer(log->node); free_extent_buffer(log->commit_root); kfree(log); - btrfs_std_error(fs_info, ret, "Couldn't read target root " - "for tree log recovery."); + btrfs_handle_fs_error(fs_info, ret, + "Couldn't read target root for tree log recovery."); goto error; } @@ -5616,7 +5689,7 @@ again: free_extent_buffer(log_root_tree->node); log_root_tree->log_root = NULL; - fs_info->log_root_recovering = 0; + clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags); kfree(log_root_tree); return 0; @@ -5652,11 +5725,9 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, * into the file. When the file is logged we check it and * don't log the parents if the file is fully on disk. */ - if (S_ISREG(inode->i_mode)) { - mutex_lock(&BTRFS_I(inode)->log_mutex); - BTRFS_I(inode)->last_unlink_trans = trans->transid; - mutex_unlock(&BTRFS_I(inode)->log_mutex); - } + mutex_lock(&BTRFS_I(inode)->log_mutex); + BTRFS_I(inode)->last_unlink_trans = trans->transid; + mutex_unlock(&BTRFS_I(inode)->log_mutex); /* * if this directory was already logged any new |