Diffstat (limited to 'fs')
557 files changed, 16302 insertions, 13273 deletions
diff --git a/fs/9p/acl.c b/fs/9p/acl.c
index eb3589edf485..0576eaeb60b9 100644
--- a/fs/9p/acl.c
+++ b/fs/9p/acl.c
@@ -239,13 +239,13 @@ static int v9fs_xattr_get_acl(const struct xattr_handler *handler,
 }
 
 static int v9fs_xattr_set_acl(const struct xattr_handler *handler,
-			      struct dentry *dentry, const char *name,
-			      const void *value, size_t size, int flags)
+			      struct dentry *dentry, struct inode *inode,
+			      const char *name, const void *value,
+			      size_t size, int flags)
 {
 	int retval;
 	struct posix_acl *acl;
 	struct v9fs_session_info *v9ses;
-	struct inode *inode = d_inode(dentry);
 
 	v9ses = v9fs_dentry2v9ses(dentry);
 	/*
diff --git a/fs/9p/fid.h b/fs/9p/fid.h
index 2b6787fcb626..12700df0bb51 100644
--- a/fs/9p/fid.h
+++ b/fs/9p/fid.h
@@ -24,6 +24,10 @@
 #include <linux/list.h>
 
 struct p9_fid *v9fs_fid_lookup(struct dentry *dentry);
+static inline struct p9_fid *v9fs_parent_fid(struct dentry *dentry)
+{
+	return v9fs_fid_lookup(dentry->d_parent);
+}
 struct p9_fid *v9fs_fid_clone(struct dentry *dentry);
 void v9fs_fid_add(struct dentry *dentry, struct p9_fid *fid);
 struct p9_fid *v9fs_writeback_fid(struct dentry *dentry);
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index c37fb9c08970..6181ad79e1a5 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -231,7 +231,6 @@ static int v9fs_launder_page(struct page *page)
 /**
  * v9fs_direct_IO - 9P address space operation for direct I/O
  * @iocb: target I/O control block
- * @pos: offset in file to begin the operation
  *
  * The presence of v9fs_direct_IO() in the address space ops vector
  * allowes open() O_DIRECT flags which would have failed otherwise.
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index b84c291ba1eb..d7b78d531e63 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -74,7 +74,7 @@ int v9fs_file_open(struct inode *inode, struct file *file)
 			    v9fs_proto_dotu(v9ses));
 	fid = file->private_data;
 	if (!fid) {
-		fid = v9fs_fid_clone(file->f_path.dentry);
+		fid = v9fs_fid_clone(file_dentry(file));
 		if (IS_ERR(fid))
 			return PTR_ERR(fid);
 
@@ -100,7 +100,7 @@ int v9fs_file_open(struct inode *inode, struct file *file)
 		 * because we want write after unlink usecase
 		 * to work.
 		 */
-		fid = v9fs_writeback_fid(file->f_path.dentry);
+		fid = v9fs_writeback_fid(file_dentry(file));
 		if (IS_ERR(fid)) {
 			err = PTR_ERR(fid);
 			mutex_unlock(&v9inode->v_mutex);
@@ -516,7 +516,7 @@ v9fs_mmap_file_mmap(struct file *filp, struct vm_area_struct *vma)
 		 * because we want write after unlink usecase
 		 * to work.
 		 */
-		fid = v9fs_writeback_fid(filp->f_path.dentry);
+		fid = v9fs_writeback_fid(file_dentry(filp));
 		if (IS_ERR(fid)) {
 			retval = PTR_ERR(fid);
 			mutex_unlock(&v9inode->v_mutex);
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index f4645c515262..7da9a8354fad 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -595,7 +595,7 @@ static int v9fs_remove(struct inode *dir, struct dentry *dentry, int flags)
 
 	v9ses = v9fs_inode2v9ses(dir);
 	inode = d_inode(dentry);
-	dfid = v9fs_fid_lookup(dentry->d_parent);
+	dfid = v9fs_parent_fid(dentry);
 	if (IS_ERR(dfid)) {
 		retval = PTR_ERR(dfid);
 		p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", retval);
@@ -653,7 +653,7 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
 	ofid = NULL;
 	fid = NULL;
 	name = (char *) dentry->d_name.name;
-	dfid = v9fs_fid_lookup(dentry->d_parent);
+	dfid = v9fs_parent_fid(dentry);
 	if (IS_ERR(dfid)) {
 		err = PTR_ERR(dfid);
 		p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
@@ -798,7 +798,7 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
 
 	v9ses = v9fs_inode2v9ses(dir);
 	/* We can walk d_parent because we hold the dir->i_mutex */
-	dfid = v9fs_fid_lookup(dentry->d_parent);
+	dfid = v9fs_parent_fid(dentry);
 	if (IS_ERR(dfid))
 		return ERR_CAST(dfid);
 
@@ -853,7 +853,7 @@ v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry,
 	struct p9_fid *fid, *inode_fid;
 	struct dentry *res = NULL;
 
-	if (d_unhashed(dentry)) {
+	if (d_in_lookup(dentry)) {
 		res = v9fs_vfs_lookup(dir, dentry, 0);
 		if (IS_ERR(res))
 			return PTR_ERR(res);
@@ -975,13 +975,13 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	if (IS_ERR(oldfid))
 		return PTR_ERR(oldfid);
 
-	olddirfid = v9fs_fid_clone(old_dentry->d_parent);
+	olddirfid = v9fs_parent_fid(old_dentry);
 	if (IS_ERR(olddirfid)) {
 		retval = PTR_ERR(olddirfid);
 		goto done;
 	}
 
-	newdirfid = v9fs_fid_clone(new_dentry->d_parent);
+	newdirfid = v9fs_parent_fid(new_dentry);
 	if (IS_ERR(newdirfid)) {
 		retval = PTR_ERR(newdirfid);
 		goto clunk_olddir;
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index a34702c998f5..2ed04c2fe7af 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -254,7 +254,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
 	struct posix_acl *pacl = NULL, *dacl = NULL;
 	struct dentry *res = NULL;
 
-	if (d_unhashed(dentry)) {
+	if (d_in_lookup(dentry)) {
 		res = v9fs_vfs_lookup(dir, dentry, 0);
 		if (IS_ERR(res))
 			return PTR_ERR(res);
@@ -273,7 +273,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
 	p9_debug(P9_DEBUG_VFS, "name:%s flags:0x%x mode:0x%hx\n",
 		 name, flags, omode);
 
-	dfid = v9fs_fid_lookup(dentry->d_parent);
+	dfid = v9fs_parent_fid(dentry);
 	if (IS_ERR(dfid)) {
 		err = PTR_ERR(dfid);
 		p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
@@ -389,7 +389,6 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir,
 	umode_t mode;
 	struct inode *inode;
 	struct p9_qid qid;
-	struct dentry *dir_dentry;
 	struct posix_acl *dacl = NULL, *pacl = NULL;
 
 	p9_debug(P9_DEBUG_VFS, "name %pd\n", dentry);
@@ -400,8 +399,7 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir,
 	if (dir->i_mode & S_ISGID)
 		omode |= S_ISGID;
 
-	dir_dentry = dentry->d_parent;
-	dfid = v9fs_fid_lookup(dir_dentry);
+	dfid = v9fs_parent_fid(dentry);
 	if (IS_ERR(dfid)) {
 		err = PTR_ERR(dfid);
 		p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
@@ -691,7 +689,7 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,
 	p9_debug(P9_DEBUG_VFS, "%lu,%s,%s\n", dir->i_ino, name, symname);
 
 	v9ses = v9fs_inode2v9ses(dir);
-	dfid = v9fs_fid_lookup(dentry->d_parent);
+	dfid = v9fs_parent_fid(dentry);
 	if (IS_ERR(dfid)) {
 		err = PTR_ERR(dfid);
 		p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
@@ -762,7 +760,6 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
 	      struct dentry *dentry)
 {
 	int err;
-	struct dentry *dir_dentry;
 	struct p9_fid *dfid, *oldfid;
 	struct v9fs_session_info *v9ses;
 
@@ -770,8 +767,7 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
 		 dir->i_ino, old_dentry, dentry);
 
 	v9ses = v9fs_inode2v9ses(dir);
-	dir_dentry = dentry->d_parent;
-	dfid = v9fs_fid_lookup(dir_dentry);
+	dfid = v9fs_parent_fid(dentry);
 	if (IS_ERR(dfid))
 		return PTR_ERR(dfid);
 
@@ -822,7 +818,6 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode,
 	struct p9_fid *fid = NULL, *dfid = NULL;
 	struct inode *inode;
 	struct p9_qid qid;
-	struct dentry *dir_dentry;
 	struct posix_acl *dacl = NULL, *pacl = NULL;
 
 	p9_debug(P9_DEBUG_VFS, " %lu,%pd mode: %hx MAJOR: %u MINOR: %u\n",
@@ -830,8 +825,7 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode,
 		 MAJOR(rdev), MINOR(rdev));
 
 	v9ses = v9fs_inode2v9ses(dir);
-	dir_dentry = dentry->d_parent;
-	dfid = v9fs_fid_lookup(dir_dentry);
+	dfid = v9fs_parent_fid(dentry);
 	if (IS_ERR(dfid)) {
 		err = PTR_ERR(dfid);
 		p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c
index 18c62bae9591..a6bd349bab23 100644
--- a/fs/9p/xattr.c
+++ b/fs/9p/xattr.c
@@ -147,8 +147,9 @@ static int v9fs_xattr_handler_get(const struct xattr_handler *handler,
 }
 
 static int v9fs_xattr_handler_set(const struct xattr_handler *handler,
-				  struct dentry *dentry, const char *name,
-				  const void *value, size_t size, int flags)
+				  struct dentry *dentry, struct inode *inode,
+				  const char *name, const void *value,
+				  size_t size, int flags)
 {
 	const char *full_name = xattr_full_name(handler, name);
diff --git a/fs/Kconfig b/fs/Kconfig
index 6725f59c18e6..4524916fa200 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -10,6 +10,9 @@ config DCACHE_WORD_ACCESS
 
 if BLOCK
 
+config FS_IOMAP
+	bool
+
 source "fs/ext2/Kconfig"
 source "fs/ext4/Kconfig"
 source "fs/jbd2/Kconfig"
@@ -52,6 +55,7 @@ config FS_DAX_PMD
 	depends on FS_DAX
 	depends on ZONE_DEVICE
 	depends on TRANSPARENT_HUGEPAGE
+	depends on BROKEN
 
 endif # BLOCK
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index 2d0cbbd14cfc..72c03354c14b 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -1,6 +1,7 @@
 config BINFMT_ELF
 	bool "Kernel support for ELF binaries"
 	depends on MMU && (BROKEN || !FRV)
+	select ELFCORE
 	default y
 	---help---
 	  ELF (Executable and Linkable Format) is a format for libraries and
@@ -26,6 +27,7 @@ config BINFMT_ELF
 config COMPAT_BINFMT_ELF
 	bool
 	depends on COMPAT && BINFMT_ELF
+	select ELFCORE
 
 config ARCH_BINFMT_ELF_STATE
 	bool
@@ -34,6 +36,7 @@ config BINFMT_ELF_FDPIC
 	bool "Kernel support for FDPIC ELF binaries"
 	default y
 	depends on (FRV || BLACKFIN || (SUPERH32 && !MMU) || C6X)
+	select ELFCORE
 	help
 	  ELF FDPIC binaries are based on ELF, but allow the individual load
 	  segments of a binary to be located in memory independently of each
@@ -43,6 +46,11 @@ config BINFMT_ELF_FDPIC
 
 	  It is also possible to run FDPIC ELF binaries on MMU linux also.
 
+config ELFCORE
+	bool
+	help
+	  This option enables kernel/elfcore.o.
+
 config CORE_DUMP_DEFAULT_ELF_HEADERS
 	bool "Write ELF core dumps with partial segments"
 	default y
diff --git a/fs/Makefile b/fs/Makefile
index 85b6e13b62d3..ed2b63257ba9 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -49,6 +49,7 @@ obj-$(CONFIG_COREDUMP)		+= coredump.o
 obj-$(CONFIG_SYSCTL)		+= drop_caches.o
 obj-$(CONFIG_FHANDLE)		+= fhandle.o
+obj-$(CONFIG_FS_IOMAP)		+= iomap.o
 
 obj-y				+= quota/
diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c
index fd4cf2c48e48..bec25f7017c0 100644
--- a/fs/adfs/dir.c
+++ b/fs/adfs/dir.c
@@ -207,7 +207,7 @@ adfs_hash(const struct dentry *parent, struct qstr *qstr)
 	 */
 	qstr->len = i = name_len;
 	name = qstr->name;
-	hash = init_name_hash();
+	hash = init_name_hash(parent);
 	while (i--) {
 		char c;
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index 00d3002a6780..eb32029bc776 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -61,7 +61,7 @@ affs_get_toupper(struct super_block *sb)
  * Note: the dentry argument is the parent dentry.
  */
 static inline int
-__affs_hash_dentry(struct qstr *qstr, toupper_t toupper, bool notruncate)
+__affs_hash_dentry(const struct dentry *dentry, struct qstr *qstr, toupper_t toupper, bool notruncate)
 {
 	const u8 *name = qstr->name;
 	unsigned long hash;
@@ -72,7 +72,7 @@ __affs_hash_dentry(struct qstr *qstr, toupper_t toupper, bool notruncate)
 	if (retval)
 		return retval;
 
-	hash = init_name_hash();
+	hash = init_name_hash(dentry);
 	len = min(qstr->len, AFFSNAMEMAX);
 	for (; len > 0; name++, len--)
 		hash = partial_name_hash(toupper(*name), hash);
@@ -84,7 +84,7 @@
 static int
 affs_hash_dentry(const struct dentry *dentry, struct qstr *qstr)
 {
-	return __affs_hash_dentry(qstr, affs_toupper,
+	return __affs_hash_dentry(dentry, qstr, affs_toupper,
 				  affs_nofilenametruncate(dentry));
 
 }
@@ -92,7 +92,7 @@ affs_hash_dentry(const struct dentry *dentry, struct qstr *qstr)
 static int
 affs_intl_hash_dentry(const struct dentry *dentry, struct qstr *qstr)
 {
-	return __affs_hash_dentry(qstr, affs_intl_toupper,
+	return __affs_hash_dentry(dentry, qstr, affs_intl_toupper,
 				  affs_nofilenametruncate(dentry));
 
 }
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 2a6713b6b9f4..d6384863192c 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -528,7 +528,7 @@ affs_remount(struct super_block *sb, int *flags, char *data)
 	char			*prefix = NULL;
 
 	new_opts = kstrdup(data, GFP_KERNEL);
-	if (!new_opts)
+	if (data && !new_opts)
 		return -ENOMEM;
 
 	pr_debug("%s(flags=0x%x,opts=\"%s\")\n", __func__, *flags, data);
@@ -546,7 +546,8 @@ affs_remount(struct super_block *sb, int *flags, char *data)
 	}
 
 	flush_delayed_work(&sbi->sb_work);
-	replace_mount_options(sb, new_opts);
+	if (new_opts)
+		replace_mount_options(sb, new_opts);
 
 	sbi->s_flags = mount_flags;
 	sbi->s_mode  = mode;
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index 63cd9f939f19..4832de84d52c 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -85,18 +85,14 @@ int afs_open_socket(void)
 
 	skb_queue_head_init(&afs_incoming_calls);
 
+	ret = -ENOMEM;
 	afs_async_calls = create_singlethread_workqueue("kafsd");
-	if (!afs_async_calls) {
-		_leave(" = -ENOMEM [wq]");
-		return -ENOMEM;
-	}
+	if (!afs_async_calls)
+		goto error_0;
 
 	ret = sock_create_kern(&init_net, AF_RXRPC, SOCK_DGRAM, PF_INET, &socket);
-	if (ret < 0) {
-		destroy_workqueue(afs_async_calls);
-		_leave(" = %d [socket]", ret);
-		return ret;
-	}
+	if (ret < 0)
+		goto error_1;
 
 	socket->sk->sk_allocation = GFP_NOFS;
 
@@ -111,18 +107,26 @@ int afs_open_socket(void)
 	       sizeof(srx.transport.sin.sin_addr));
 
 	ret = kernel_bind(socket, (struct sockaddr *) &srx, sizeof(srx));
-	if (ret < 0) {
-		sock_release(socket);
-		destroy_workqueue(afs_async_calls);
-		_leave(" = %d [bind]", ret);
-		return ret;
-	}
+	if (ret < 0)
+		goto error_2;
+
+	ret = kernel_listen(socket, INT_MAX);
+	if (ret < 0)
+		goto error_2;
 
 	rxrpc_kernel_intercept_rx_messages(socket, afs_rx_interceptor);
 
 	afs_socket = socket;
 	_leave(" = 0");
 	return 0;
+
+error_2:
+	sock_release(socket);
+error_1:
+	destroy_workqueue(afs_async_calls);
+error_0:
+	_leave(" = %d", ret);
+	return ret;
 }
 
 /*
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 65de439bdc4f..14d506efd1aa 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -643,10 +643,6 @@ ssize_t afs_file_write(struct kiocb *iocb, struct iov_iter *from)
 		return 0;
 
 	result = generic_file_write_iter(iocb, from);
-	if (IS_ERR_VALUE(result)) {
-		_leave(" = %zd", result);
-		return result;
-	}
 
 	_leave(" = %zd", result);
 	return result;
diff --git a/fs/aio.c b/fs/aio.c
@@ -496,7 +496,12 @@ static int aio_setup_ring(struct kioctx *ctx)
 	ctx->mmap_size = nr_pages * PAGE_SIZE;
 	pr_debug("attempting mmap of %lu bytes\n", ctx->mmap_size);
 
-	down_write(&mm->mmap_sem);
+	if (down_write_killable(&mm->mmap_sem)) {
+		ctx->mmap_size = 0;
+		aio_free_ring(ctx);
+		return -EINTR;
+	}
+
 	ctx->mmap_base = do_mmap_pgoff(ctx->aio_ring_file, 0, ctx->mmap_size,
 				       PROT_READ | PROT_WRITE,
 				       MAP_SHARED, 0, &unused);
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index f0d268b97d19..a439548de785 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -70,9 +70,13 @@ struct autofs_info {
 };
 
 #define AUTOFS_INF_EXPIRING	(1<<0) /* dentry in the process of expiring */
-#define AUTOFS_INF_NO_RCU	(1<<1) /* the dentry is being considered
+#define AUTOFS_INF_WANT_EXPIRE	(1<<1) /* the dentry is being considered
 					* for expiry, so RCU_walk is
-					* not permitted
+					* not permitted. If it progresses to
+					* actual expiry attempt, the flag is
+					* not cleared when EXPIRING is set -
+					* in that case it gets cleared only
+					* when it comes to clearing EXPIRING.
 					*/
 #define AUTOFS_INF_PENDING	(1<<2) /* dentry pending mount */
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index 9510d8d2e9cd..b493909e7492 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -316,19 +316,17 @@ struct dentry *autofs4_expire_direct(struct super_block *sb,
 	if (ino->flags & AUTOFS_INF_PENDING)
 		goto out;
 	if (!autofs4_direct_busy(mnt, root, timeout, do_now)) {
-		ino->flags |= AUTOFS_INF_NO_RCU;
+		ino->flags |= AUTOFS_INF_WANT_EXPIRE;
 		spin_unlock(&sbi->fs_lock);
 		synchronize_rcu();
 		spin_lock(&sbi->fs_lock);
 		if (!autofs4_direct_busy(mnt, root, timeout, do_now)) {
 			ino->flags |= AUTOFS_INF_EXPIRING;
-			smp_mb();
-			ino->flags &= ~AUTOFS_INF_NO_RCU;
 			init_completion(&ino->expire_complete);
 			spin_unlock(&sbi->fs_lock);
 			return root;
 		}
-		ino->flags &= ~AUTOFS_INF_NO_RCU;
+		ino->flags &= ~AUTOFS_INF_WANT_EXPIRE;
 	}
 out:
 	spin_unlock(&sbi->fs_lock);
@@ -446,7 +444,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
 	while ((dentry = get_next_positive_subdir(dentry, root))) {
 		spin_lock(&sbi->fs_lock);
 		ino = autofs4_dentry_ino(dentry);
-		if (ino->flags & AUTOFS_INF_NO_RCU)
+		if (ino->flags & AUTOFS_INF_WANT_EXPIRE)
 			expired = NULL;
 		else
 			expired = should_expire(dentry, mnt, timeout, how);
@@ -455,7 +453,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
 			continue;
 		}
 		ino = autofs4_dentry_ino(expired);
-		ino->flags |= AUTOFS_INF_NO_RCU;
+		ino->flags |= AUTOFS_INF_WANT_EXPIRE;
 		spin_unlock(&sbi->fs_lock);
 		synchronize_rcu();
 		spin_lock(&sbi->fs_lock);
@@ -465,7 +463,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
 			goto found;
 		}
 
-		ino->flags &= ~AUTOFS_INF_NO_RCU;
+		ino->flags &= ~AUTOFS_INF_WANT_EXPIRE;
 		if (expired != dentry)
 			dput(expired);
 		spin_unlock(&sbi->fs_lock);
@@ -475,17 +473,8 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
 found:
 	pr_debug("returning %p %pd\n", expired, expired);
 	ino->flags |= AUTOFS_INF_EXPIRING;
-	smp_mb();
-	ino->flags &= ~AUTOFS_INF_NO_RCU;
 	init_completion(&ino->expire_complete);
 	spin_unlock(&sbi->fs_lock);
-	spin_lock(&sbi->lookup_lock);
-	spin_lock(&expired->d_parent->d_lock);
-	spin_lock_nested(&expired->d_lock, DENTRY_D_LOCK_NESTED);
-	list_move(&expired->d_parent->d_subdirs, &expired->d_child);
-	spin_unlock(&expired->d_lock);
-	spin_unlock(&expired->d_parent->d_lock);
-	spin_unlock(&sbi->lookup_lock);
 	return expired;
 }
 
@@ -496,7 +485,7 @@ int autofs4_expire_wait(struct dentry *dentry, int rcu_walk)
 	int status;
 
 	/* Block on any pending expire */
-	if (!(ino->flags & (AUTOFS_INF_EXPIRING | AUTOFS_INF_NO_RCU)))
+	if (!(ino->flags & AUTOFS_INF_WANT_EXPIRE))
 		return 0;
 	if (rcu_walk)
 		return -ECHILD;
@@ -554,7 +543,7 @@ int autofs4_expire_run(struct super_block *sb,
 	ino = autofs4_dentry_ino(dentry);
 	/* avoid rapid-fire expire attempts if expiry fails */
 	ino->last_used = now;
-	ino->flags &= ~AUTOFS_INF_EXPIRING;
+	ino->flags &= ~(AUTOFS_INF_EXPIRING|AUTOFS_INF_WANT_EXPIRE);
 	complete_all(&ino->expire_complete);
 	spin_unlock(&sbi->fs_lock);
 
@@ -583,7 +572,7 @@ int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
 		spin_lock(&sbi->fs_lock);
 		/* avoid rapid-fire expire attempts if expiry fails */
 		ino->last_used = now;
-		ino->flags &= ~AUTOFS_INF_EXPIRING;
+		ino->flags &= ~(AUTOFS_INF_EXPIRING|AUTOFS_INF_WANT_EXPIRE);
 		complete_all(&ino->expire_complete);
 		spin_unlock(&sbi->fs_lock);
 		dput(dentry);
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 78bd80298528..3767f6641af1 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -458,7 +458,7 @@ static int autofs4_d_manage(struct dentry *dentry, bool rcu_walk)
 		 */
 		struct inode *inode;
 
-		if (ino->flags & (AUTOFS_INF_EXPIRING | AUTOFS_INF_NO_RCU))
+		if (ino->flags & AUTOFS_INF_WANT_EXPIRE)
 			return 0;
 		if (d_mountpoint(dentry))
 			return 0;
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index 0146d911f468..708214457d16 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -66,11 +66,12 @@ static int autofs4_write(struct autofs_sb_info *sbi,
 	set_fs(KERNEL_DS);
 
 	mutex_lock(&sbi->pipe_mutex);
-	wr = __vfs_write(file, data, bytes, &file->f_pos);
-	while (bytes && wr) {
+	while (bytes) {
+		wr = __vfs_write(file, data, bytes, &file->f_pos);
+		if (wr <= 0)
+			break;
 		data += wr;
 		bytes -= wr;
-		wr = __vfs_write(file, data, bytes, &file->f_pos);
 	}
 	mutex_unlock(&sbi->pipe_mutex);
 
@@ -397,7 +398,7 @@ int autofs4_wait(struct autofs_sb_info *sbi,
 		}
 	}
 	qstr.name = name;
-	qstr.hash = full_name_hash(name, qstr.len);
+	qstr.hash = full_name_hash(dentry, name, qstr.len);
 
 	if (mutex_lock_interruptible(&sbi->wq_mutex)) {
 		kfree(qstr.name);
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index 72e35b721608..3ba385eaa26e 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -100,8 +100,8 @@ static int bad_inode_setattr(struct dentry *direntry, struct iattr *attrs)
 	return -EIO;
 }
 
-static int bad_inode_setxattr(struct dentry *dentry, const char *name,
-		const void *value, size_t size, int flags)
+static int bad_inode_setxattr(struct dentry *dentry, struct inode *inode,
+		const char *name, const void *value, size_t size, int flags)
 {
 	return -EIO;
 }
diff --git a/fs/befs/datastream.c b/fs/befs/datastream.c
index dde0b79f3948..af1bc19b7c85 100644
--- a/fs/befs/datastream.c
+++ b/fs/befs/datastream.c
@@ -48,7 +48,7 @@
 struct buffer_head *
 befs_read_datastream(struct super_block *sb, const befs_data_stream *ds,
 		     befs_off_t pos, uint * off)
 {
-	struct buffer_head *bh = NULL;
+	struct buffer_head *bh;
 	befs_block_run run;
 	befs_blocknr_t block;	/* block coresponding to pos */
 
@@ -127,7 +127,7 @@ befs_read_lsymlink(struct super_block *sb, const befs_data_stream *ds,
 {
 	befs_off_t bytes_read = 0;	/* bytes readed */
 	u16 plen;
-	struct buffer_head *bh = NULL;
+	struct buffer_head *bh;
 	befs_debug(sb, "---> %s length: %llu", __func__, len);
 
 	while (bytes_read < len) {
@@ -429,7 +429,7 @@ befs_find_brun_dblindirect(struct super_block *sb,
 	struct buffer_head *dbl_indir_block;
 	struct buffer_head *indir_block;
 	befs_block_run indir_run;
-	befs_disk_inode_addr *iaddr_array = NULL;
+	befs_disk_inode_addr *iaddr_array;
 	struct befs_sb_info *befs_sb = BEFS_SB(sb);
 
 	befs_blocknr_t indir_start_blk =
@@ -488,7 +488,6 @@ befs_find_brun_dblindirect(struct super_block *sb,
 	iaddr_array = (befs_disk_inode_addr *) dbl_indir_block->b_data;
 	indir_run = fsrun_to_cpu(sb, iaddr_array[dbl_block_indx]);
 	brelse(dbl_indir_block);
-	iaddr_array = NULL;
 
 	/* Read indirect block */
 	which_block = indir_indx / befs_iaddrs_per_block(sb);
@@ -513,7 +512,6 @@ befs_find_brun_dblindirect(struct super_block *sb,
 	iaddr_array = (befs_disk_inode_addr *) indir_block->b_data;
 	*run = fsrun_to_cpu(sb, iaddr_array[block_indx]);
 	brelse(indir_block);
-	iaddr_array = NULL;
 
 	blockno_at_run_start = indir_start_blk;
 	blockno_at_run_start += diblklen * dblindir_indx;
diff --git a/fs/befs/io.c b/fs/befs/io.c
index 7a5b4ec21c56..523c8af2d770 100644
--- a/fs/befs/io.c
+++ b/fs/befs/io.c
@@ -26,7 +26,7 @@
 struct buffer_head *
 befs_bread_iaddr(struct super_block *sb, befs_inode_addr iaddr)
 {
-	struct buffer_head *bh = NULL;
+	struct buffer_head *bh;
 	befs_blocknr_t block = 0;
 	struct befs_sb_info *befs_sb = BEFS_SB(sb);
 
@@ -63,7 +63,7 @@ befs_bread_iaddr(struct super_block *sb, befs_inode_addr iaddr)
 struct buffer_head *
 befs_bread(struct super_block *sb, befs_blocknr_t block)
 {
-	struct buffer_head *bh = NULL;
+	struct buffer_head *bh;
 
 	befs_debug(sb, "---> Enter %s %lu", __func__, (unsigned long)block);
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 71112aa07d84..7da05b159ade 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -155,7 +155,7 @@ befs_get_block(struct inode *inode, sector_t block,
 static struct dentry *
 befs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
 {
-	struct inode *inode = NULL;
+	struct inode *inode;
 	struct super_block *sb = dir->i_sb;
 	const befs_data_stream *ds = &BEFS_I(dir)->i_data.ds;
 	befs_off_t offset;
@@ -294,10 +294,10 @@ static void init_once(void *foo)
 static struct inode *befs_iget(struct super_block *sb, unsigned long ino)
 {
-	struct buffer_head *bh = NULL;
-	befs_inode *raw_inode = NULL;
+	struct buffer_head *bh;
+	befs_inode *raw_inode;
 	struct befs_sb_info *befs_sb = BEFS_SB(sb);
-	struct befs_inode_info *befs_ino = NULL;
+	struct befs_inode_info *befs_ino;
 	struct inode *inode;
 	long ret = -EIO;
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index 4c556680fa74..ae1b5404fced 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -127,12 +127,8 @@ static int set_brk(unsigned long start, unsigned long end)
 {
 	start = PAGE_ALIGN(start);
 	end = PAGE_ALIGN(end);
-	if (end > start) {
-		unsigned long addr;
-		addr = vm_brk(start, end - start);
-		if (BAD_ADDR(addr))
-			return addr;
-	}
+	if (end > start)
+		return vm_brk(start, end - start);
 	return 0;
 }
 
@@ -275,7 +271,7 @@ static int load_aout_binary(struct linux_binprm * bprm)
 		map_size = ex.a_text+ex.a_data;
 #endif
 		error = vm_brk(text_addr & PAGE_MASK, map_size);
-		if (error != (text_addr & PAGE_MASK))
+		if (error)
 			return error;
 
 		error = read_code(bprm->file, text_addr, pos,
@@ -297,7 +293,10 @@ static int load_aout_binary(struct linux_binprm * bprm)
 		}
 
 		if (!bprm->file->f_op->mmap||((fd_offset & ~PAGE_MASK) != 0)) {
-			vm_brk(N_TXTADDR(ex), ex.a_text+ex.a_data);
+			error = vm_brk(N_TXTADDR(ex), ex.a_text+ex.a_data);
+			if (error)
+				return error;
+
 			read_code(bprm->file, N_TXTADDR(ex), fd_offset,
 				  ex.a_text + ex.a_data);
 			goto beyond_if;
@@ -378,8 +377,10 @@ static int load_aout_library(struct file *file)
 			"N_TXTOFF is not page aligned. Please convert library: %pD\n",
 			file);
 	}
-	vm_brk(start_addr, ex.a_text + ex.a_data + ex.a_bss);
-
+	retval = vm_brk(start_addr, ex.a_text + ex.a_data + ex.a_bss);
+	if (retval)
+		goto out;
+
 	read_code(file, start_addr, N_TXTOFF(ex), ex.a_text + ex.a_data);
 	retval = 0;
@@ -397,9 +398,8 @@ static int load_aout_library(struct file *file)
 	len = PAGE_ALIGN(ex.a_text + ex.a_data);
 	bss = ex.a_text + ex.a_data + ex.a_bss;
 	if (bss > len) {
-		error = vm_brk(start_addr + len, bss - len);
-		retval = error;
-		if (error != start_addr + len)
+		retval = vm_brk(start_addr + len, bss - len);
+		if (retval)
 			goto out;
 	}
 	retval = 0;
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 56224ffa94d2..a7a28110dc80 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -96,10 +96,9 @@ static int set_brk(unsigned long start, unsigned long end)
 	start = ELF_PAGEALIGN(start);
 	end = ELF_PAGEALIGN(end);
 	if (end > start) {
-		unsigned long addr;
-		addr = vm_brk(start, end - start);
-		if (BAD_ADDR(addr))
-			return addr;
+		int error = vm_brk(start, end - start);
+		if (error)
+			return error;
 	}
 	current->mm->start_brk = current->mm->brk = end;
 	return 0;
@@ -629,7 +628,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
 
 		/* Map the last of the bss segment */
 		error = vm_brk(elf_bss, last_bss - elf_bss);
-		if (BAD_ADDR(error))
+		if (error)
 			goto out;
 	}
 
@@ -1176,8 +1175,11 @@ static int load_elf_library(struct file *file)
 	len = ELF_PAGESTART(eppnt->p_filesz + eppnt->p_vaddr +
 			    ELF_MIN_ALIGN - 1);
 	bss = eppnt->p_memsz + eppnt->p_vaddr;
-	if (bss > len)
-		vm_brk(len, bss - len);
+	if (bss > len) {
+		error = vm_brk(len, bss - len);
+		if (error)
+			goto out_free_ph;
+	}
 	error = 0;
 
 out_free_ph:
@@ -2273,7 +2275,7 @@ static int elf_core_dump(struct coredump_params *cprm)
 		goto end_coredump;
 
 	/* Align to page */
-	if (!dump_skip(cprm, dataoff - cprm->file->f_pos))
+	if (!dump_skip(cprm, dataoff - cprm->pos))
 		goto end_coredump;
 
 	for (i = 0, vma = first_vma(current, gate_vma); vma != NULL;
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 71ade0e556b7..203589311bf8 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -1787,7 +1787,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
 		goto end_coredump;
 	}
 
-	if (!dump_skip(cprm, dataoff - cprm->file->f_pos))
+	if (!dump_skip(cprm, dataoff - cprm->pos))
 		goto end_coredump;
 
 	if (!elf_fdpic_dump_segments(cprm))
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index f723cd3a455c..caf9e39bb82b 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -337,7 +337,7 @@ calc_reloc(unsigned long r, struct lib_info *p, int curid, int internalp)
 					"(%d != %d)", (unsigned) r, curid, id);
 			goto failed;
 		} else if ( ! p->lib_list[id].loaded &&
-			    IS_ERR_VALUE(load_flat_shared_library(id, p))) {
+			    load_flat_shared_library(id, p) < 0) {
 			printk("BINFMT_FLAT: failed to load library %d", id);
 			goto failed;
 		}
@@ -837,7 +837,7 @@ static int load_flat_shared_library(int id, struct lib_info *libs)
 
 	res = prepare_binprm(&bprm);
 
-	if (!IS_ERR_VALUE(res))
+	if (!res)
 		res = load_flat_file(&bprm, libs, id, NULL);
 
 	abort_creds(bprm.cred);
@@ -883,7 +883,7 @@ static int load_flat_binary(struct linux_binprm * bprm)
 	stack_len += FLAT_STACK_ALIGN - 1;  /* reserve for upcoming alignment */
 
 	res = load_flat_file(bprm, &libinfo, 0, &stack_len);
-	if (IS_ERR_VALUE(res))
+	if (res < 0)
 		return res;
 
 	/* Update data segment pointers for all libraries */
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index 3a3ced779fc7..5417516f6e59 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -637,13 +637,12 @@ static ssize_t bm_entry_write(struct file *file, const char __user *buffer,
 		break;
 	case 3:
 		/* Delete this handler. */
-		root = dget(file->f_path.dentry->d_sb->s_root);
+		root = file_inode(file)->i_sb->s_root;
 		inode_lock(d_inode(root));
 
 		kill_node(e);
 
 		inode_unlock(d_inode(root));
-		dput(root);
 		break;
 	default:
 		return res;
@@ -665,8 +664,8 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer,
 {
 	Node *e;
 	struct inode *inode;
-	struct dentry *root, *dentry;
-	struct super_block *sb = file->f_path.dentry->d_sb;
+	struct super_block *sb = file_inode(file)->i_sb;
+	struct dentry *root = sb->s_root, *dentry;
 	int err = 0;
 
 	e = create_entry(buffer, count);
@@ -674,7 +673,6 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer,
 	if (IS_ERR(e))
 		return PTR_ERR(e);
 
-	root = dget(sb->s_root);
 	inode_lock(d_inode(root));
 	dentry = lookup_one_len(e->name, root, strlen(e->name));
 	err = PTR_ERR(dentry);
@@ -712,7 +710,6 @@ out2:
 	dput(dentry);
 out:
 	inode_unlock(d_inode(root));
-	dput(root);
 
 	if (err) {
 		kfree(e);
@@ -753,14 +750,13 @@ static ssize_t bm_status_write(struct file *file, const char __user *buffer,
 		break;
	case 3:
 		/* Delete all handlers. */
-		root = dget(file->f_path.dentry->d_sb->s_root);
+		root = file_inode(file)->i_sb->s_root;
 		inode_lock(d_inode(root));
 
 		while (!list_empty(&entries))
 			kill_node(list_entry(entries.next, Node, list));
 
 		inode_unlock(d_inode(root));
-		dput(root);
 		break;
 	default:
 		return res;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index a063d4d8ac39..5cbd5391667e 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -29,6 +29,7 @@
 #include <linux/log2.h>
 #include <linux/cleancache.h>
 #include <linux/dax.h>
+#include <linux/badblocks.h>
 #include <asm/uaccess.h>
 #include "internal.h"
 
@@ -50,6 +51,18 @@ struct block_device *I_BDEV(struct inode *inode)
 }
 EXPORT_SYMBOL(I_BDEV);
 
+void __vfs_msg(struct super_block *sb, const char *prefix, const char *fmt, ...)
+{
+	struct va_format vaf;
+	va_list args;
+
+	va_start(args, fmt);
+	vaf.fmt = fmt;
+	vaf.va = &args;
+	printk_ratelimited("%sVFS (%s): %pV\n", prefix, sb->s_id, &vaf);
+	va_end(args);
+}
+
 static void bdev_write_inode(struct block_device *bdev)
 {
 	struct inode *inode = bdev->bd_inode;
@@ -480,7 +493,7 @@ long bdev_direct_access(struct block_device *bdev, struct blk_dax_ctl *dax)
 
 	if (size < 0)
 		return size;
-	if (!ops->direct_access)
+	if (!blk_queue_dax(bdev_get_queue(bdev)) || !ops->direct_access)
 		return -EOPNOTSUPP;
 	if ((sector + DIV_ROUND_UP(size, 512)) >
 					part_nr_sects_read(bdev->bd_part))
@@ -488,7 +501,7 @@ long bdev_direct_access(struct block_device *bdev, struct blk_dax_ctl *dax)
 	sector += get_start_sect(bdev);
 	if (sector % (PAGE_SIZE / 512))
 		return -EINVAL;
-	avail = ops->direct_access(bdev, sector, &dax->addr, &dax->pfn);
+	avail = ops->direct_access(bdev, sector, &dax->addr, &dax->pfn, size);
 	if (!avail)
 		return -ERANGE;
 	if (avail > 0 && avail & ~PAGE_MASK)
@@ -497,6 +510,75 @@ long bdev_direct_access(struct block_device *bdev, struct blk_dax_ctl *dax)
 }
 EXPORT_SYMBOL_GPL(bdev_direct_access);
 
+/**
+ * bdev_dax_supported() - Check if the device supports dax for filesystem
+ * @sb: The superblock of the device
+ * @blocksize: The block size of the device
+ *
+ * This is a library function for filesystems to check if the block device
+ * can be mounted with dax option.
+ *
+ * Return: negative errno if unsupported, 0 if supported.
+ */
+int bdev_dax_supported(struct super_block *sb, int blocksize)
+{
+	struct blk_dax_ctl dax = {
+		.sector = 0,
+		.size = PAGE_SIZE,
+	};
+	int err;
+
+	if (blocksize != PAGE_SIZE) {
+		vfs_msg(sb, KERN_ERR, "error: unsupported blocksize for dax");
+		return -EINVAL;
+	}
+
+	err = bdev_direct_access(sb->s_bdev, &dax);
+	if (err < 0) {
+		switch (err) {
+		case -EOPNOTSUPP:
+			vfs_msg(sb, KERN_ERR,
+				"error: device does not support dax");
+			break;
+		case -EINVAL:
+			vfs_msg(sb, KERN_ERR,
+				"error: unaligned partition for dax");
+			break;
+		default:
+			vfs_msg(sb, KERN_ERR,
+				"error: dax access failed (%d)", err);
+		}
+		return err;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(bdev_dax_supported);
+
+/**
+ * bdev_dax_capable() - Return if the raw device is capable for dax
+ * @bdev: The device for raw block device access
+ */
+bool bdev_dax_capable(struct block_device *bdev)
+{
+	struct blk_dax_ctl dax = {
+		.size = PAGE_SIZE,
+	};
+
+	if (!IS_ENABLED(CONFIG_FS_DAX))
+		return false;
+
+	dax.sector = 0;
+	if (bdev_direct_access(bdev, &dax) < 0)
+		return false;
+
+	dax.sector = bdev->bd_part->nr_sects - (PAGE_SIZE / 512);
+	if (bdev_direct_access(bdev, &dax) < 0)
+		return false;
+
+	return true;
+}
+
 /*
  * pseudo-fs
  */
@@ -532,7 +614,6 @@ static void init_once(void *foo)
 
 	memset(bdev, 0, sizeof(*bdev));
 	mutex_init(&bdev->bd_mutex);
-	INIT_LIST_HEAD(&bdev->bd_inodes);
 	INIT_LIST_HEAD(&bdev->bd_list);
#ifdef CONFIG_SYSFS
 	INIT_LIST_HEAD(&bdev->bd_holder_disks);
#endif
@@ -542,24 +623,13 @@ static void init_once(void *foo)
 	mutex_init(&bdev->bd_fsfreeze_mutex);
 }
 
-static inline void __bd_forget(struct inode *inode)
-{
-	list_del_init(&inode->i_devices);
-	inode->i_bdev = NULL;
-	inode->i_mapping = &inode->i_data;
-}
-
 static void bdev_evict_inode(struct inode *inode)
 {
 	struct block_device *bdev = &BDEV_I(inode)->bdev;
-	struct list_head *p;
 	truncate_inode_pages_final(&inode->i_data);
 	invalidate_inode_buffers(inode); /* is it needed here? */
 	clear_inode(inode);
 	spin_lock(&bdev_lock);
-	while ( (p = bdev->bd_inodes.next) != &bdev->bd_inodes ) {
-		__bd_forget(list_entry(p, struct inode, i_devices));
-	}
 	list_del_init(&bdev->bd_list);
 	spin_unlock(&bdev_lock);
 }
@@ -723,7 +793,6 @@ static struct block_device *bd_acquire(struct inode *inode)
 			bdgrab(bdev);
 			inode->i_bdev = bdev;
 			inode->i_mapping = bdev->bd_inode->i_mapping;
-			list_add(&inode->i_devices, &bdev->bd_inodes);
 		}
 		spin_unlock(&bdev_lock);
 	}
@@ -739,7 +808,8 @@ void bd_forget(struct inode *inode)
 	spin_lock(&bdev_lock);
 	if (!sb_is_blkdev_sb(inode->i_sb))
 		bdev = inode->i_bdev;
-	__bd_forget(inode);
+	inode->i_bdev = NULL;
+	inode->i_mapping = &inode->i_data;
 	spin_unlock(&bdev_lock);
 
 	if (bdev)
@@ -1205,7 +1275,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 		bdev->bd_disk = disk;
 		bdev->bd_queue = disk->queue;
 		bdev->bd_contains = bdev;
-		if (IS_ENABLED(CONFIG_BLK_DEV_DAX) && disk->fops->direct_access)
+		if (IS_ENABLED(CONFIG_BLK_DEV_DAX) &&
+		    blk_queue_dax(disk->queue))
 			bdev->bd_inode->i_flags = S_DAX;
 		else
 			bdev->bd_inode->i_flags = 0;
@@ -1238,7 +1309,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 
 			if (!ret) {
 				bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
-				if (!blkdev_dax_capable(bdev))
+				if (!bdev_dax_capable(bdev))
 					bdev->bd_inode->i_flags &= ~S_DAX;
 			}
 
@@ -1275,7 +1346,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 				goto out_clear;
 			}
 			bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9);
-			if (!blkdev_dax_capable(bdev))
+			if (!bdev_dax_capable(bdev))
 				bdev->bd_inode->i_flags &= ~S_DAX;
 		}
 	} else {
@@ -1720,79 +1791,13 @@ static const struct address_space_operations def_blk_aops = {
 	.is_dirty_writeback = buffer_check_dirty_writeback,
 };
 
-#ifdef CONFIG_FS_DAX
-/*
- * In the raw block case we do not need to contend with truncation nor
- * unwritten file extents. Without those concerns there is no need for
- * additional locking beyond the mmap_sem context that these routines
- * are already executing under.
- *
- * Note, there is no protection if the block device is dynamically
- * resized (partition grow/shrink) during a fault. A stable block device
- * size is already not enforced in the blkdev_direct_IO path.
- *
- * For DAX, it is the responsibility of the block device driver to
- * ensure the whole-disk device size is stable while requests are in
- * flight.
- *
- * Finally, unlike the filemap_page_mkwrite() case there is no
- * filesystem superblock to sync against freezing. We still include a
- * pfn_mkwrite callback for dax drivers to receive write fault
- * notifications.
- */
-static int blkdev_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
-{
-	return __dax_fault(vma, vmf, blkdev_get_block, NULL);
-}
-
-static int blkdev_dax_pfn_mkwrite(struct vm_area_struct *vma,
-		struct vm_fault *vmf)
-{
-	return dax_pfn_mkwrite(vma, vmf);
-}
-
-static int blkdev_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
-		pmd_t *pmd, unsigned int flags)
-{
-	return __dax_pmd_fault(vma, addr, pmd, flags, blkdev_get_block, NULL);
-}
-
-static const struct vm_operations_struct blkdev_dax_vm_ops = {
-	.fault		= blkdev_dax_fault,
-	.pmd_fault	= blkdev_dax_pmd_fault,
-	.pfn_mkwrite	= blkdev_dax_pfn_mkwrite,
-};
-
-static const struct vm_operations_struct blkdev_default_vm_ops = {
-	.fault		= filemap_fault,
-	.map_pages	= filemap_map_pages,
-};
-
-static int blkdev_mmap(struct file *file, struct vm_area_struct *vma)
-{
-	struct inode *bd_inode = bdev_file_inode(file);
-
-	file_accessed(file);
-	if (IS_DAX(bd_inode)) {
-		vma->vm_ops = &blkdev_dax_vm_ops;
-		vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
-	} else {
-		vma->vm_ops = &blkdev_default_vm_ops;
-	}
-
-	return 0;
-}
-#else
-#define blkdev_mmap generic_file_mmap
-#endif
-
 const struct file_operations def_blk_fops = {
 	.open		= blkdev_open,
 	.release	= blkdev_close,
 	.llseek		= block_llseek,
 	.read_iter	= blkdev_read_iter,
 	.write_iter	= blkdev_write_iter,
-	.mmap		= blkdev_mmap,
+	.mmap		= generic_file_mmap,
 	.fsync		= blkdev_fsync,
 	.unlocked_ioctl	= block_ioctl,
#ifdef CONFIG_COMPAT
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 80e8472d618b..8bb3509099e8 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1939,7 +1939,7 @@ static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off,
 * from ipath->fspath->val[i].
 * when it returns, there are ipath->fspath->elem_cnt number of paths available
 * in ipath->fspath->val[]. when the allocated space wasn't sufficient, the
- * number of missed paths in recored in ipath->fspath->elem_missed, otherwise,
+ * number of missed paths is recorded in ipath->fspath->elem_missed, otherwise,
 * it's zero. ipath->fspath->bytes_missing holds the number of bytes that would
 * have been needed to return all paths.
 */
@@ -1991,7 +1991,7 @@ struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
 
 	ifp = kmalloc(sizeof(*ifp), GFP_NOFS);
 	if (!ifp) {
-		kfree(fspath);
+		vfree(fspath);
 		return ERR_PTR(-ENOMEM);
 	}
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 61205e3bbefa..4919aedb5fc1 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -196,6 +196,16 @@ struct btrfs_inode {
 	struct list_head delayed_iput;
 	long delayed_iput_count;
 
+	/*
+	 * To avoid races between lockless (i_mutex not held) direct IO writes
+	 * and concurrent fsync requests. Direct IO writes must acquire read
+	 * access on this semaphore for creating an extent map and its
+	 * corresponding ordered extent. The fast fsync path must acquire write
+	 * access on this semaphore before it collects ordered extents and
+	 * extent maps.
+	 */
+	struct rw_semaphore dio_sem;
+
 	struct inode vfs_inode;
 };
 
@@ -303,7 +313,7 @@ struct btrfs_dio_private {
 	struct bio *dio_bio;
 
 	/*
-	 * The original bio may be splited to several sub-bios, this is
+	 * The original bio may be split to several sub-bios, this is
 	 * done during endio of sub-bios
 	 */
 	int (*subio_endio)(struct inode *, struct btrfs_io_bio *, int);
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 516e19d1d202..5d5cae05818d 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -1673,6 +1673,7 @@ static int btrfsic_read_block(struct btrfsic_state *state,
 		}
 		bio->bi_bdev = block_ctx->dev->bdev;
 		bio->bi_iter.bi_sector = dev_bytenr >> 9;
+		bio_set_op_attrs(bio, REQ_OP_READ, 0);
 
 		for (j = i; j < num_pages; j++) {
 			ret = bio_add_page(bio, block_ctx->pagev[j],
@@ -1685,7 +1686,7 @@ static int btrfsic_read_block(struct btrfsic_state *state,
 			       "btrfsic: error, failed to add a single page!\n");
 			return -1;
 		}
-		if (submit_bio_wait(READ, bio)) {
+		if (submit_bio_wait(bio)) {
 			printk(KERN_INFO
 			       "btrfsic: read error at logical %llu dev %s!\n",
 			       block_ctx->start, block_ctx->dev->name);
@@ -1939,7 +1940,7 @@ again:
 			/*
 			 * Clear all references of this block. Do not free
 			 * the block itself even if is not referenced anymore
-			 * because it still carries valueable information
+			 * because it still carries valuable information
 			 * like whether it was ever written and IO completed.
 			 */
 			list_for_each_entry_safe(l, tmp, &block->ref_to_list,
@@ -2206,7 +2207,7 @@ static void btrfsic_bio_end_io(struct bio *bp)
 		       block->dev_bytenr, block->mirror_num);
 		next_block = block->next_in_same_bio;
 		block->iodone_w_error = iodone_w_error;
-		if (block->submit_bio_bh_rw & REQ_FLUSH) {
+		if (block->submit_bio_bh_rw & REQ_PREFLUSH) {
 			dev_state->last_flush_gen++;
 			if ((dev_state->state->print_mask &
 			     BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
@@ -2242,7 +2243,7 @@ static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate)
 		       block->dev_bytenr, block->mirror_num);
 
 	block->iodone_w_error = iodone_w_error;
-	if (block->submit_bio_bh_rw & REQ_FLUSH) {
+	if (block->submit_bio_bh_rw & REQ_PREFLUSH) {
 		dev_state->last_flush_gen++;
 		if ((dev_state->state->print_mask &
 		     BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
@@ -2645,7 +2646,7 @@ static void btrfsic_dump_tree_sub(const struct btrfsic_state *state,
 	 * This algorithm is recursive because the amount of used stack space
 	 * is very small and the max recursion depth is limited.
 	 */
-	indent_add = sprintf(buf, "%c-%llu(%s/%llu/%d)",
+	indent_add = sprintf(buf, "%c-%llu(%s/%llu/%u)",
 			     btrfsic_get_block_type(state, block),
 			     block->logical_bytenr, block->dev_state->name,
 			     block->dev_bytenr, block->mirror_num);
@@ -2855,12 +2856,12 @@ static struct btrfsic_dev_state *btrfsic_dev_state_lookup(
 	return ds;
 }
 
-int btrfsic_submit_bh(int rw, struct buffer_head *bh)
+int btrfsic_submit_bh(int op, int op_flags, struct buffer_head *bh)
 {
 	struct btrfsic_dev_state *dev_state;
 
 	if (!btrfsic_is_initialized)
-		return submit_bh(rw, bh);
+		return submit_bh(op, op_flags, bh);
 
 	mutex_lock(&btrfsic_mutex);
 	/* since btrfsic_submit_bh() might also be called before
@@ -2869,26 +2870,26 @@ int btrfsic_submit_bh(int rw, struct buffer_head *bh)
 
 	/* Only called to write the superblock (incl. FLUSH/FUA) */
 	if (NULL != dev_state &&
-	    (rw & WRITE) && bh->b_size > 0) {
+	    (op == REQ_OP_WRITE) && bh->b_size > 0) {
 		u64 dev_bytenr;
 
 		dev_bytenr = 4096 * bh->b_blocknr;
 		if (dev_state->state->print_mask &
 		    BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
 			printk(KERN_INFO
-			       "submit_bh(rw=0x%x, blocknr=%llu (bytenr %llu),"
-			       " size=%zu, data=%p, bdev=%p)\n",
-			       rw, (unsigned long long)bh->b_blocknr,
+			       "submit_bh(op=0x%x,0x%x, blocknr=%llu "
+			       "(bytenr %llu), size=%zu, data=%p, bdev=%p)\n",
+			       op, op_flags, (unsigned long long)bh->b_blocknr,
 			       dev_bytenr, bh->b_size, bh->b_data, bh->b_bdev);
 		btrfsic_process_written_block(dev_state, dev_bytenr,
 					      &bh->b_data, 1, NULL,
-					      NULL, bh, rw);
-	} else if (NULL != dev_state && (rw & REQ_FLUSH)) {
+					      NULL, bh, op_flags);
+	} else if (NULL != dev_state && (op_flags & REQ_PREFLUSH)) {
 		if (dev_state->state->print_mask &
 		    BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
 			printk(KERN_INFO
-			       "submit_bh(rw=0x%x FLUSH, bdev=%p)\n",
-			       rw, bh->b_bdev);
+			       "submit_bh(op=0x%x,0x%x FLUSH, bdev=%p)\n",
+			       op, op_flags, bh->b_bdev);
 		if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) {
 			if ((dev_state->state->print_mask &
 			     (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
@@ -2906,7 +2907,7 @@ int btrfsic_submit_bh(int rw, struct buffer_head *bh)
 			block->never_written = 0;
 			block->iodone_w_error = 0;
 			block->flush_gen = dev_state->last_flush_gen + 1;
-			block->submit_bio_bh_rw = rw;
+			block->submit_bio_bh_rw = op_flags;
 			block->orig_bio_bh_private = bh->b_private;
 			block->orig_bio_bh_end_io.bh = bh->b_end_io;
 			block->next_in_same_bio = NULL;
@@ -2915,10 +2916,10 @@ int btrfsic_submit_bh(int rw, struct buffer_head *bh)
 		}
 	}
 	mutex_unlock(&btrfsic_mutex);
-	return submit_bh(rw, bh);
+	return submit_bh(op, op_flags, bh);
 }
 
-static void __btrfsic_submit_bio(int rw, struct bio *bio)
+static void __btrfsic_submit_bio(struct bio *bio)
 {
 	struct btrfsic_dev_state *dev_state;
 
@@ -2930,7 +2931,7 @@ static void __btrfsic_submit_bio(int rw, struct bio *bio)
 	 * btrfsic_mount(), this might return NULL */
 	dev_state = btrfsic_dev_state_lookup(bio->bi_bdev);
 	if (NULL != dev_state &&
-	    (rw & WRITE) && NULL != bio->bi_io_vec) {
+	    (bio_op(bio) == REQ_OP_WRITE) && NULL != bio->bi_io_vec) {
 		unsigned int i;
 		u64 dev_bytenr;
 		u64 cur_bytenr;
@@ -2942,9 +2943,9 @@ static void __btrfsic_submit_bio(int rw, struct bio *bio)
 		if (dev_state->state->print_mask &
 		    BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
 			printk(KERN_INFO
-			       "submit_bio(rw=0x%x, bi_vcnt=%u,"
+			       "submit_bio(rw=%d,0x%x, bi_vcnt=%u,"
 			       " bi_sector=%llu (bytenr %llu), bi_bdev=%p)\n",
-			       rw, bio->bi_vcnt,
+			       bio_op(bio), bio->bi_rw, bio->bi_vcnt,
 			       (unsigned long long)bio->bi_iter.bi_sector,
 			       dev_bytenr, bio->bi_bdev);
 
@@ -2975,18 +2976,18 @@ static void __btrfsic_submit_bio(int rw, struct bio *bio)
 		btrfsic_process_written_block(dev_state, dev_bytenr,
 					      mapped_datav, bio->bi_vcnt,
 					      bio, &bio_is_patched,
-					      NULL, rw);
+					      NULL, bio->bi_rw);
 		while (i > 0) {
 			i--;
 			kunmap(bio->bi_io_vec[i].bv_page);
 		}
 		kfree(mapped_datav);
-	} else if (NULL != dev_state && (rw & REQ_FLUSH)) {
+	} else if (NULL != dev_state && (bio->bi_rw & REQ_PREFLUSH)) {
 		if (dev_state->state->print_mask &
 		    BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
 			printk(KERN_INFO
-			       "submit_bio(rw=0x%x FLUSH, bdev=%p)\n",
-			       rw, bio->bi_bdev);
+			       "submit_bio(rw=%d,0x%x FLUSH, bdev=%p)\n",
+			       bio_op(bio), bio->bi_rw, bio->bi_bdev);
 		if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) {
 			if ((dev_state->state->print_mask &
 			     (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
@@ -3004,7 +3005,7 @@ static void __btrfsic_submit_bio(int rw, struct bio *bio)
 			block->never_written = 0;
 			block->iodone_w_error = 0;
 			block->flush_gen = dev_state->last_flush_gen + 1;
-			block->submit_bio_bh_rw = rw;
+			block->submit_bio_bh_rw = bio->bi_rw;
 			block->orig_bio_bh_private = bio->bi_private;
 			block->orig_bio_bh_end_io.bio = bio->bi_end_io;
 			block->next_in_same_bio = NULL;
@@ -3016,16 +3017,16 @@ leave:
 	mutex_unlock(&btrfsic_mutex);
 }
 
-void btrfsic_submit_bio(int rw, struct bio *bio)
+void btrfsic_submit_bio(struct bio *bio)
 {
-	__btrfsic_submit_bio(rw, bio);
-	submit_bio(rw, bio);
+	__btrfsic_submit_bio(bio);
+	submit_bio(bio);
 }
 
-int btrfsic_submit_bio_wait(int rw, struct bio *bio)
+int btrfsic_submit_bio_wait(struct bio *bio)
 {
-	__btrfsic_submit_bio(rw, bio);
-	return submit_bio_wait(rw, bio);
+	__btrfsic_submit_bio(bio);
+	return submit_bio_wait(bio);
 }
 
 int btrfsic_mount(struct btrfs_root *root,
diff --git a/fs/btrfs/check-integrity.h b/fs/btrfs/check-integrity.h
index 13b8566c97ab..f78dff1c7e86 100644
--- a/fs/btrfs/check-integrity.h
+++ b/fs/btrfs/check-integrity.h
@@ -20,9 +20,9 @@
 #define __BTRFS_CHECK_INTEGRITY__
 
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
-int btrfsic_submit_bh(int rw, struct buffer_head *bh);
-void btrfsic_submit_bio(int rw, struct bio *bio);
-int btrfsic_submit_bio_wait(int rw, struct bio *bio);
+int btrfsic_submit_bh(int op, int op_flags, struct buffer_head *bh);
+void btrfsic_submit_bio(struct bio *bio);
+int btrfsic_submit_bio_wait(struct bio *bio);
#else
 #define btrfsic_submit_bh submit_bh
 #define btrfsic_submit_bio submit_bio
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index ff61a41ac90b..cefedabf0a92 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -363,6 +363,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
 		kfree(cb);
 		return -ENOMEM;
 	}
+	bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
 	bio->bi_private = cb;
 	bio->bi_end_io = end_compressed_bio_write;
 	atomic_inc(&cb->pending_bios);
@@ -373,7 +374,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
 		page = compressed_pages[pg_index];
 		page->mapping = inode->i_mapping;
 		if (bio->bi_iter.bi_size)
-			ret = io_tree->ops->merge_bio_hook(WRITE, page, 0,
+			ret = io_tree->ops->merge_bio_hook(page, 0,
 							   PAGE_SIZE,
 							   bio, 0);
 		else
@@ -401,13 +402,14 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
 			BUG_ON(ret); /* -ENOMEM */
 		}
 
-		ret = btrfs_map_bio(root, WRITE, bio, 0, 1);
+		ret = btrfs_map_bio(root, bio, 0, 1);
 		BUG_ON(ret); /* -ENOMEM */
 
 		bio_put(bio);
 
 		bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS);
 		BUG_ON(!bio);
+		bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
 		bio->bi_private = cb;
 		bio->bi_end_io = end_compressed_bio_write;
 		bio_add_page(bio, page, PAGE_SIZE, 0);
@@ -431,7 +433,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
 		BUG_ON(ret); /* -ENOMEM */
 	}
 
-	ret = btrfs_map_bio(root, WRITE, bio, 0, 1);
+	ret = btrfs_map_bio(root, bio, 0, 1);
 	BUG_ON(ret); /* -ENOMEM */
 
 	bio_put(bio);
@@ -646,6 +648,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 	comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS);
 	if (!comp_bio)
 		goto fail2;
+	bio_set_op_attrs (comp_bio, REQ_OP_READ, 0);
 	comp_bio->bi_private = cb;
 	comp_bio->bi_end_io = end_compressed_bio_read;
 	atomic_inc(&cb->pending_bios);
@@ -656,7 +659,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 		page->index = em_start >> PAGE_SHIFT;
 
 		if (comp_bio->bi_iter.bi_size)
-			ret = tree->ops->merge_bio_hook(READ, page, 0,
+			ret = tree->ops->merge_bio_hook(page, 0,
 							PAGE_SIZE,
 							comp_bio, 0);
 		else
@@ -687,8 +690,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 			sums += DIV_ROUND_UP(comp_bio->bi_iter.bi_size,
 					     root->sectorsize);
 
-			ret = btrfs_map_bio(root, READ, comp_bio,
-					    mirror_num, 0);
+			ret = btrfs_map_bio(root, comp_bio, mirror_num, 0);
 			if (ret) {
 				bio->bi_error = ret;
 				bio_endio(comp_bio);
@@ -699,6 +701,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 			comp_bio = compressed_bio_alloc(bdev, cur_disk_byte,
 							GFP_NOFS);
 			BUG_ON(!comp_bio);
+			bio_set_op_attrs(comp_bio, REQ_OP_READ, 0);
 			comp_bio->bi_private = cb;
 			comp_bio->bi_end_io = end_compressed_bio_read;
 
@@ -717,7 +720,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 		BUG_ON(ret); /* -ENOMEM */
 	}
 
-	ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0);
+	ret = btrfs_map_bio(root, comp_bio, mirror_num, 0);
 	if (ret) {
 		bio->bi_error = ret;
 		bio_endio(comp_bio);
@@ -743,8 +746,11 @@ out:
 static struct {
 	struct list_head idle_ws;
 	spinlock_t ws_lock;
-	int num_ws;
-	atomic_t alloc_ws;
+	/* Number of free workspaces */
+	int free_ws;
+	/* Total number of allocated workspaces */
+	atomic_t total_ws;
+	/* Waiters for a free workspace */
 	wait_queue_head_t ws_wait;
 } btrfs_comp_ws[BTRFS_COMPRESS_TYPES];
 
@@ -758,16 +764,34 @@ void __init btrfs_init_compress(void)
 	int i;
 
 	for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) {
+		struct list_head *workspace;
+
 		INIT_LIST_HEAD(&btrfs_comp_ws[i].idle_ws);
 		spin_lock_init(&btrfs_comp_ws[i].ws_lock);
-		atomic_set(&btrfs_comp_ws[i].alloc_ws, 0);
+		atomic_set(&btrfs_comp_ws[i].total_ws, 0);
 		init_waitqueue_head(&btrfs_comp_ws[i].ws_wait);
+
+		/*
+		 * Preallocate one workspace for each compression type so
+		 * we can guarantee forward progress in the worst case
+		 */
+		workspace = btrfs_compress_op[i]->alloc_workspace();
+		if (IS_ERR(workspace)) {
+			printk(KERN_WARNING
+	"BTRFS: cannot preallocate compression workspace, will try later");
+		} else {
+			atomic_set(&btrfs_comp_ws[i].total_ws, 1);
+			btrfs_comp_ws[i].free_ws = 1;
+			list_add(workspace, &btrfs_comp_ws[i].idle_ws);
+		}
 	}
 }
 
 /*
- * this finds an available workspace or allocates a new one
- * ERR_PTR is returned if things go bad.
+ * This finds an available workspace or allocates a new one.
+ * If it's not possible to allocate a new one, waits until there's one.
+ * Preallocation makes a forward progress guarantees and we do not return
+ * errors.
 */
 static struct list_head *find_workspace(int type)
 {
@@ -777,36 +801,58 @@ static struct list_head *find_workspace(int type)
 	struct list_head *idle_ws	= &btrfs_comp_ws[idx].idle_ws;
 	spinlock_t *ws_lock		= &btrfs_comp_ws[idx].ws_lock;
-	atomic_t *alloc_ws		= &btrfs_comp_ws[idx].alloc_ws;
+	atomic_t *total_ws		= &btrfs_comp_ws[idx].total_ws;
 	wait_queue_head_t *ws_wait	= &btrfs_comp_ws[idx].ws_wait;
-	int *num_ws			= &btrfs_comp_ws[idx].num_ws;
+	int *free_ws			= &btrfs_comp_ws[idx].free_ws;
again:
 	spin_lock(ws_lock);
 	if (!list_empty(idle_ws)) {
 		workspace = idle_ws->next;
 		list_del(workspace);
-		(*num_ws)--;
+		(*free_ws)--;
 		spin_unlock(ws_lock);
 		return workspace;
 
 	}
-	if (atomic_read(alloc_ws) > cpus) {
+	if (atomic_read(total_ws) > cpus) {
 		DEFINE_WAIT(wait);
 
 		spin_unlock(ws_lock);
 		prepare_to_wait(ws_wait, &wait, TASK_UNINTERRUPTIBLE);
-		if (atomic_read(alloc_ws) > cpus && !*num_ws)
+		if (atomic_read(total_ws) > cpus && !*free_ws)
 			schedule();
 		finish_wait(ws_wait, &wait);
 		goto again;
 	}
-	atomic_inc(alloc_ws);
+	atomic_inc(total_ws);
 	spin_unlock(ws_lock);
 
 	workspace = btrfs_compress_op[idx]->alloc_workspace();
 	if (IS_ERR(workspace)) {
-		atomic_dec(alloc_ws);
+		atomic_dec(total_ws);
 		wake_up(ws_wait);
+
+		/*
+		 * Do not return the error but go back to waiting. There's a
+		 * workspace preallocated for each type and the compression
+		 * time is bounded so we get to a workspace eventually. This
+		 * makes our caller's life easier.
+		 *
+		 * To prevent silent and low-probability deadlocks (when the
+		 * initial preallocation fails), check if there are any
+		 * workspaces at all.
+		 */
+		if (atomic_read(total_ws) == 0) {
+			static DEFINE_RATELIMIT_STATE(_rs,
+					/* once per minute */ 60 * HZ,
+					/* no burst */ 1);
+
+			if (__ratelimit(&_rs)) {
+				printk(KERN_WARNING
+			       "no compression workspaces, low memory, retrying");
+			}
+		}
+		goto again;
 	}
 	return workspace;
 }
@@ -820,21 +866,21 @@ static void free_workspace(int type, struct list_head *workspace)
 	int idx = type - 1;
 	struct list_head *idle_ws	= &btrfs_comp_ws[idx].idle_ws;
 	spinlock_t *ws_lock		= &btrfs_comp_ws[idx].ws_lock;
-	atomic_t *alloc_ws		= &btrfs_comp_ws[idx].alloc_ws;
+	atomic_t *total_ws		= &btrfs_comp_ws[idx].total_ws;
 	wait_queue_head_t *ws_wait	= &btrfs_comp_ws[idx].ws_wait;
-	int *num_ws			= &btrfs_comp_ws[idx].num_ws;
+	int *free_ws			= &btrfs_comp_ws[idx].free_ws;
 
 	spin_lock(ws_lock);
-	if (*num_ws < num_online_cpus()) {
+	if (*free_ws < num_online_cpus()) {
 		list_add(workspace, idle_ws);
-		(*num_ws)++;
+		(*free_ws)++;
 		spin_unlock(ws_lock);
 		goto wake;
 	}
 	spin_unlock(ws_lock);
 
 	btrfs_compress_op[idx]->free_workspace(workspace);
-	atomic_dec(alloc_ws);
+	atomic_dec(total_ws);
wake:
 	/*
 	 * Make sure counter is updated before we wake up waiters.
@@ -857,7 +903,7 @@ static void free_workspaces(void)
 			workspace = btrfs_comp_ws[i].idle_ws.next;
 			list_del(workspace);
 			btrfs_compress_op[i]->free_workspace(workspace);
-			atomic_dec(&btrfs_comp_ws[i].alloc_ws);
+			atomic_dec(&btrfs_comp_ws[i].total_ws);
 		}
 	}
 }
@@ -894,8 +940,6 @@ int btrfs_compress_pages(int type, struct address_space *mapping,
 	int ret;
 
 	workspace = find_workspace(type);
-	if (IS_ERR(workspace))
-		return PTR_ERR(workspace);
 
 	ret = btrfs_compress_op[type-1]->compress_pages(workspace, mapping,
 						      start, len, pages,
@@ -930,8 +974,6 @@ static int btrfs_decompress_biovec(int type, struct page **pages_in,
 	int ret;
 
 	workspace = find_workspace(type);
-	if (IS_ERR(workspace))
-		return PTR_ERR(workspace);
 
 	ret = btrfs_compress_op[type-1]->decompress_biovec(workspace, pages_in,
 							 disk_start,
@@ -952,8 +994,6 @@ int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
 	int ret;
 
 	workspace = find_workspace(type);
-	if (IS_ERR(workspace))
-		return PTR_ERR(workspace);
 
 	ret = btrfs_compress_op[type-1]->decompress(workspace, data_in,
 						  dest_page, start_byte,
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index ec7928a27aaa..a85cf7d23309 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -156,7 +156,7 @@ struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
 
 		/*
 		 * RCU really hurts here, we could free up the root node because
-		 * it was cow'ed but we may not get the new root node yet so do
+		 * it was COWed but we may not get the new root node yet so do
 		 * the inc_not_zero dance and if it doesn't work then
 		 * synchronize_rcu and try again.
 		 */
@@ -955,7 +955,7 @@ int btrfs_block_can_be_shared(struct btrfs_root *root,
 			      struct extent_buffer *buf)
 {
 	/*
-	 * Tree blocks not in refernece counted trees and tree roots
+	 * Tree blocks not in reference counted trees and tree roots
 	 * are never shared. If a block was allocated after the last
 	 * snapshot and the block was not allocated by tree relocation,
 	 * we know the block is not shared.
@@ -1011,7 +1011,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
 			return ret;
 		if (refs == 0) {
 			ret = -EROFS;
-			btrfs_std_error(root->fs_info, ret, NULL);
+			btrfs_handle_fs_error(root->fs_info, ret, NULL);
 			return ret;
 		}
 	} else {
@@ -1270,7 +1270,7 @@ __tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info,
 
 /*
 * tm is a pointer to the first operation to rewind within eb. then, all
- * previous operations will be rewinded (until we reach something older than
+ * previous operations will be rewound (until we reach something older than
 * time_seq).
 */
static void
@@ -1345,7 +1345,7 @@ __tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
 }
 
 /*
- * Called with eb read locked. If the buffer cannot be rewinded, the same buffer
+ * Called with eb read locked. If the buffer cannot be rewound, the same buffer
 * is returned. If rewind operations happen, a fresh buffer is returned. The
 * returned buffer is always read-locked. If the returned buffer is not the
 * input buffer, the lock on the input buffer is released and the input buffer
@@ -1373,7 +1373,8 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
 
 	if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
 		BUG_ON(tm->slot != 0);
-		eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start);
+		eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start,
+						eb->len);
 		if (!eb_rewin) {
 			btrfs_tree_read_unlock_blocking(eb);
 			free_extent_buffer(eb);
@@ -1454,7 +1455,8 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
 	} else if (old_root) {
 		btrfs_tree_read_unlock(eb_root);
 		free_extent_buffer(eb_root);
-		eb = alloc_dummy_extent_buffer(root->fs_info, logical);
+		eb = alloc_dummy_extent_buffer(root->fs_info, logical,
+					root->nodesize);
 	} else {
 		btrfs_set_lock_blocking_rw(eb_root, BTRFS_READ_LOCK);
 		eb = btrfs_clone_extent_buffer(eb_root);
@@ -1516,7 +1518,7 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans,
 	 * 3) the root is not forced COW.
 	 *
 	 * What is forced COW:
-	 *    when we create snapshot during commiting the transaction,
+	 *    when we create snapshot during committing the transaction,
 	 *    after we've finished coping src root, we must COW the shared
 	 *    block to ensure the metadata consistency.
 	 */
@@ -1531,7 +1533,7 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans,
 
 /*
 * cows a single block, see __btrfs_cow_block for the real work.
- * This version of it has extra checks so that a block isn't cow'd more than
+ * This version of it has extra checks so that a block isn't COWed more than
 * once per transaction, as long as it hasn't been written yet
 */
noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
@@ -1552,6 +1554,7 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
 		       trans->transid, root->fs_info->generation);
 
 	if (!should_cow_block(trans, root, buf)) {
+		trans->dirty = true;
 		*cow_ret = buf;
 		return 0;
 	}
@@ -1783,10 +1786,12 @@ static noinline int generic_bin_search(struct extent_buffer *eb,
 			if (!err) {
 				tmp = (struct btrfs_disk_key *)(kaddr + offset -
 							map_start);
-			} else {
+			} else if (err == 1) {
 				read_extent_buffer(eb, &unaligned,
 						   offset, sizeof(unaligned));
 				tmp = &unaligned;
+			} else {
+				return err;
 			}
 
 		} else {
@@ -1928,7 +1933,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 		child = read_node_slot(root, mid, 0);
 		if (!child) {
 			ret = -EROFS;
-			btrfs_std_error(root->fs_info, ret, NULL);
+			btrfs_handle_fs_error(root->fs_info, ret, NULL);
 			goto enospc;
 		}
 
@@ -2031,7 +2036,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 		 */
 		if (!left) {
 			ret = -EROFS;
-			btrfs_std_error(root->fs_info, ret, NULL);
+			btrfs_handle_fs_error(root->fs_info, ret, NULL);
 			goto enospc;
 		}
 		wret = balance_node_right(trans, root, mid, left);
@@ -2510,6 +2515,8 @@ read_block_for_search(struct btrfs_trans_handle *trans,
 		if (!btrfs_buffer_uptodate(tmp, 0, 0))
 			ret = -EIO;
 		free_extent_buffer(tmp);
+	} else {
+		ret = PTR_ERR(tmp);
 	}
 	return ret;
 }
@@ -2773,8 +2780,10 @@ again:
 			 * then we don't want to set the path blocking,
 			 * so we test it here
 			 */
-			if (!should_cow_block(trans, root, b))
+			if (!should_cow_block(trans, root, b)) {
+				trans->dirty = true;
 				goto cow_done;
+			}
 
 			/*
 			 * must have write locks on this node and the
@@ -2823,6 +2832,8 @@ cow_done:
 		}
 
 		ret = key_search(b, key, level, &prev_cmp, &slot);
+		if (ret < 0)
+			goto done;
 
 		if (level != 0) {
 			int dec = 0;
@@ -2986,7 +2997,7 @@ again:
 	btrfs_unlock_up_safe(p, level + 1);
 
 	/*
	 * Since we can unwind eb's we want to do a real search every
+ * Since we can unwind ebs we want to do a real search every * time. */ prev_cmp = -1; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 84a6a5b3384a..b2620d1f883f 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -33,6 +33,7 @@ #include <asm/kmap_types.h> #include <linux/pagemap.h> #include <linux/btrfs.h> +#include <linux/btrfs_tree.h> #include <linux/workqueue.h> #include <linux/security.h> #include <linux/sizes.h> @@ -64,98 +65,6 @@ struct btrfs_ordered_sum; #define BTRFS_COMPAT_EXTENT_TREE_V0 -/* holds pointers to all of the tree roots */ -#define BTRFS_ROOT_TREE_OBJECTID 1ULL - -/* stores information about which extents are in use, and reference counts */ -#define BTRFS_EXTENT_TREE_OBJECTID 2ULL - -/* - * chunk tree stores translations from logical -> physical block numbering - * the super block points to the chunk tree - */ -#define BTRFS_CHUNK_TREE_OBJECTID 3ULL - -/* - * stores information about which areas of a given device are in use. - * one per device. The tree of tree roots points to the device tree - */ -#define BTRFS_DEV_TREE_OBJECTID 4ULL - -/* one per subvolume, storing files and directories */ -#define BTRFS_FS_TREE_OBJECTID 5ULL - -/* directory objectid inside the root tree */ -#define BTRFS_ROOT_TREE_DIR_OBJECTID 6ULL - -/* holds checksums of all the data extents */ -#define BTRFS_CSUM_TREE_OBJECTID 7ULL - -/* holds quota configuration and tracking */ -#define BTRFS_QUOTA_TREE_OBJECTID 8ULL - -/* for storing items that use the BTRFS_UUID_KEY* types */ -#define BTRFS_UUID_TREE_OBJECTID 9ULL - -/* tracks free space in block groups. */ -#define BTRFS_FREE_SPACE_TREE_OBJECTID 10ULL - -/* device stats in the device tree */ -#define BTRFS_DEV_STATS_OBJECTID 0ULL - -/* for storing balance parameters in the root tree */ -#define BTRFS_BALANCE_OBJECTID -4ULL - -/* orhpan objectid for tracking unlinked/truncated files */ -#define BTRFS_ORPHAN_OBJECTID -5ULL - -/* does write ahead logging to speed up fsyncs */ -#define BTRFS_TREE_LOG_OBJECTID -6ULL -#define BTRFS_TREE_LOG_FIXUP_OBJECTID -7ULL - -/* for space balancing */ -#define BTRFS_TREE_RELOC_OBJECTID -8ULL -#define BTRFS_DATA_RELOC_TREE_OBJECTID -9ULL - -/* - * extent checksums all have this objectid - * this allows them to share the logging tree - * for fsyncs - */ -#define BTRFS_EXTENT_CSUM_OBJECTID -10ULL - -/* For storing free space cache */ -#define BTRFS_FREE_SPACE_OBJECTID -11ULL - -/* - * The inode number assigned to the special inode for storing - * free ino cache - */ -#define BTRFS_FREE_INO_OBJECTID -12ULL - -/* dummy objectid represents multiple objectids */ -#define BTRFS_MULTIPLE_OBJECTIDS -255ULL - -/* - * All files have objectids in this range. - */ -#define BTRFS_FIRST_FREE_OBJECTID 256ULL -#define BTRFS_LAST_FREE_OBJECTID -256ULL -#define BTRFS_FIRST_CHUNK_TREE_OBJECTID 256ULL - - -/* - * the device items go into the chunk tree. The key is in the form - * [ 1 BTRFS_DEV_ITEM_KEY device_id ] - */ -#define BTRFS_DEV_ITEMS_OBJECTID 1ULL - -#define BTRFS_BTREE_INODE_OBJECTID 1 - -#define BTRFS_EMPTY_SUBVOL_DIR_OBJECTID 2 - -#define BTRFS_DEV_REPLACE_DEVID 0ULL - /* * the max metadata block size. This limit is somewhat artificial, * but the memmove costs go through the roof for larger blocks. 
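The large block of on-disk object IDs deleted above is not dropped; this series moves it into the new uapi header included at the top of the hunk, <linux/btrfs_tree.h>, so kernel and userspace share one set of definitions. A trivial consumer, assuming the header has been installed (e.g. via make headers_install):

/* Userspace use of the relocated on-disk constants; the values match
 * the defines removed from ctree.h above. */
#include <linux/btrfs_tree.h>
#include <stdio.h>

int main(void)
{
	printf("root tree objectid:  %llu\n",
	       (unsigned long long)BTRFS_ROOT_TREE_OBJECTID);
	printf("first free objectid: %llu\n",
	       (unsigned long long)BTRFS_FIRST_FREE_OBJECTID);
	return 0;
}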
@@ -175,31 +84,14 @@ struct btrfs_ordered_sum; */ #define BTRFS_LINK_MAX 65535U -/* 32 bytes in various csum fields */ -#define BTRFS_CSUM_SIZE 32 - -/* csum types */ -#define BTRFS_CSUM_TYPE_CRC32 0 - static const int btrfs_csum_sizes[] = { 4 }; /* four bytes for CRC32 */ #define BTRFS_EMPTY_DIR_SIZE 0 -/* spefic to btrfs_map_block(), therefore not in include/linux/blk_types.h */ +/* specific to btrfs_map_block(), therefore not in include/linux/blk_types.h */ #define REQ_GET_READ_MIRRORS (1 << 30) -#define BTRFS_FT_UNKNOWN 0 -#define BTRFS_FT_REG_FILE 1 -#define BTRFS_FT_DIR 2 -#define BTRFS_FT_CHRDEV 3 -#define BTRFS_FT_BLKDEV 4 -#define BTRFS_FT_FIFO 5 -#define BTRFS_FT_SOCK 6 -#define BTRFS_FT_SYMLINK 7 -#define BTRFS_FT_XATTR 8 -#define BTRFS_FT_MAX 9 - /* ioprio of readahead is set to idle */ #define BTRFS_IOPRIO_READA (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)) @@ -207,138 +99,10 @@ static const int btrfs_csum_sizes[] = { 4 }; #define BTRFS_MAX_EXTENT_SIZE SZ_128M -/* - * The key defines the order in the tree, and so it also defines (optimal) - * block layout. - * - * objectid corresponds to the inode number. - * - * type tells us things about the object, and is a kind of stream selector. - * so for a given inode, keys with type of 1 might refer to the inode data, - * type of 2 may point to file data in the btree and type == 3 may point to - * extents. - * - * offset is the starting byte offset for this key in the stream. - * - * btrfs_disk_key is in disk byte order. struct btrfs_key is always - * in cpu native order. Otherwise they are identical and their sizes - * should be the same (ie both packed) - */ -struct btrfs_disk_key { - __le64 objectid; - u8 type; - __le64 offset; -} __attribute__ ((__packed__)); - -struct btrfs_key { - u64 objectid; - u8 type; - u64 offset; -} __attribute__ ((__packed__)); - struct btrfs_mapping_tree { struct extent_map_tree map_tree; }; -struct btrfs_dev_item { - /* the internal btrfs device id */ - __le64 devid; - - /* size of the device */ - __le64 total_bytes; - - /* bytes used */ - __le64 bytes_used; - - /* optimal io alignment for this device */ - __le32 io_align; - - /* optimal io width for this device */ - __le32 io_width; - - /* minimal io size for this device */ - __le32 sector_size; - - /* type and info about this device */ - __le64 type; - - /* expected generation for this device */ - __le64 generation; - - /* - * starting byte of this partition on the device, - * to allow for stripe alignment in the future - */ - __le64 start_offset; - - /* grouping information for allocation decisions */ - __le32 dev_group; - - /* seek speed 0-100 where 100 is fastest */ - u8 seek_speed; - - /* bandwidth 0-100 where 100 is fastest */ - u8 bandwidth; - - /* btrfs generated uuid for this device */ - u8 uuid[BTRFS_UUID_SIZE]; - - /* uuid of FS who owns this device */ - u8 fsid[BTRFS_UUID_SIZE]; -} __attribute__ ((__packed__)); - -struct btrfs_stripe { - __le64 devid; - __le64 offset; - u8 dev_uuid[BTRFS_UUID_SIZE]; -} __attribute__ ((__packed__)); - -struct btrfs_chunk { - /* size of this chunk in bytes */ - __le64 length; - - /* objectid of the root referencing this chunk */ - __le64 owner; - - __le64 stripe_len; - __le64 type; - - /* optimal io alignment for this chunk */ - __le32 io_align; - - /* optimal io width for this chunk */ - __le32 io_width; - - /* minimal io size for this chunk */ - __le32 sector_size; - - /* 2^16 stripes is quite a lot, a second limit is the size of a single - * item in the btree - */ - __le16 num_stripes; - - /* sub stripes only 
matter for raid10 */ - __le16 sub_stripes; - struct btrfs_stripe stripe; - /* additional stripes go here */ -} __attribute__ ((__packed__)); - -#define BTRFS_FREE_SPACE_EXTENT 1 -#define BTRFS_FREE_SPACE_BITMAP 2 - -struct btrfs_free_space_entry { - __le64 offset; - __le64 bytes; - u8 type; -} __attribute__ ((__packed__)); - -struct btrfs_free_space_header { - struct btrfs_disk_key location; - __le64 generation; - __le64 num_entries; - __le64 num_bitmaps; -} __attribute__ ((__packed__)); - static inline unsigned long btrfs_chunk_item_size(int num_stripes) { BUG_ON(num_stripes == 0); @@ -346,9 +110,6 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes) sizeof(struct btrfs_stripe) * (num_stripes - 1); } -#define BTRFS_HEADER_FLAG_WRITTEN (1ULL << 0) -#define BTRFS_HEADER_FLAG_RELOC (1ULL << 1) - /* * File system states */ @@ -357,13 +118,6 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes) #define BTRFS_FS_STATE_TRANS_ABORTED 2 #define BTRFS_FS_STATE_DEV_REPLACING 3 -/* Super block flags */ -/* Errors detected */ -#define BTRFS_SUPER_FLAG_ERROR (1ULL << 2) - -#define BTRFS_SUPER_FLAG_SEEDING (1ULL << 32) -#define BTRFS_SUPER_FLAG_METADUMP (1ULL << 33) - #define BTRFS_BACKREF_REV_MAX 256 #define BTRFS_BACKREF_REV_SHIFT 56 #define BTRFS_BACKREF_REV_MASK (((u64)BTRFS_BACKREF_REV_MAX - 1) << \ @@ -410,7 +164,6 @@ struct btrfs_header { * room to translate 14 chunks with 3 stripes each. */ #define BTRFS_SYSTEM_CHUNK_ARRAY_SIZE 2048 -#define BTRFS_LABEL_SIZE 256 /* * just in case we somehow lose the roots and are not able to mount, @@ -507,31 +260,6 @@ struct btrfs_super_block { * Compat flags that we support. If any incompat flags are set other than the * ones specified below then we will fail to mount */ -#define BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE (1ULL << 0) - -#define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF (1ULL << 0) -#define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL (1ULL << 1) -#define BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS (1ULL << 2) -#define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO (1ULL << 3) -/* - * some patches floated around with a second compression method - * lets save that incompat here for when they do get in - * Note we don't actually support it, we're just reserving the - * number - */ -#define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZOv2 (1ULL << 4) - -/* - * older kernels tried to do bigger metadata blocks, but the - * code was pretty buggy. Lets not let them try anymore. 
- */ -#define BTRFS_FEATURE_INCOMPAT_BIG_METADATA (1ULL << 5) - -#define BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF (1ULL << 6) -#define BTRFS_FEATURE_INCOMPAT_RAID56 (1ULL << 7) -#define BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA (1ULL << 8) -#define BTRFS_FEATURE_INCOMPAT_NO_HOLES (1ULL << 9) - #define BTRFS_FEATURE_COMPAT_SUPP 0ULL #define BTRFS_FEATURE_COMPAT_SAFE_SET 0ULL #define BTRFS_FEATURE_COMPAT_SAFE_CLEAR 0ULL @@ -624,357 +352,8 @@ struct btrfs_path { unsigned int need_commit_sem:1; unsigned int skip_release_on_error:1; }; - -/* - * items in the extent btree are used to record the objectid of the - * owner of the block and the number of references - */ - -struct btrfs_extent_item { - __le64 refs; - __le64 generation; - __le64 flags; -} __attribute__ ((__packed__)); - -struct btrfs_extent_item_v0 { - __le32 refs; -} __attribute__ ((__packed__)); - #define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r) >> 4) - \ sizeof(struct btrfs_item)) - -#define BTRFS_EXTENT_FLAG_DATA (1ULL << 0) -#define BTRFS_EXTENT_FLAG_TREE_BLOCK (1ULL << 1) - -/* following flags only apply to tree blocks */ - -/* use full backrefs for extent pointers in the block */ -#define BTRFS_BLOCK_FLAG_FULL_BACKREF (1ULL << 8) - -/* - * this flag is only used internally by scrub and may be changed at any time - * it is only declared here to avoid collisions - */ -#define BTRFS_EXTENT_FLAG_SUPER (1ULL << 48) - -struct btrfs_tree_block_info { - struct btrfs_disk_key key; - u8 level; -} __attribute__ ((__packed__)); - -struct btrfs_extent_data_ref { - __le64 root; - __le64 objectid; - __le64 offset; - __le32 count; -} __attribute__ ((__packed__)); - -struct btrfs_shared_data_ref { - __le32 count; -} __attribute__ ((__packed__)); - -struct btrfs_extent_inline_ref { - u8 type; - __le64 offset; -} __attribute__ ((__packed__)); - -/* old style backrefs item */ -struct btrfs_extent_ref_v0 { - __le64 root; - __le64 generation; - __le64 objectid; - __le32 count; -} __attribute__ ((__packed__)); - - -/* dev extents record free space on individual devices. The owner - * field points back to the chunk allocation mapping tree that allocated - * the extent. 
The chunk tree uuid field is a way to double check the owner - */ -struct btrfs_dev_extent { - __le64 chunk_tree; - __le64 chunk_objectid; - __le64 chunk_offset; - __le64 length; - u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; -} __attribute__ ((__packed__)); - -struct btrfs_inode_ref { - __le64 index; - __le16 name_len; - /* name goes here */ -} __attribute__ ((__packed__)); - -struct btrfs_inode_extref { - __le64 parent_objectid; - __le64 index; - __le16 name_len; - __u8 name[0]; - /* name goes here */ -} __attribute__ ((__packed__)); - -struct btrfs_timespec { - __le64 sec; - __le32 nsec; -} __attribute__ ((__packed__)); - -struct btrfs_inode_item { - /* nfs style generation number */ - __le64 generation; - /* transid that last touched this inode */ - __le64 transid; - __le64 size; - __le64 nbytes; - __le64 block_group; - __le32 nlink; - __le32 uid; - __le32 gid; - __le32 mode; - __le64 rdev; - __le64 flags; - - /* modification sequence number for NFS */ - __le64 sequence; - - /* - * a little future expansion, for more than this we can - * just grow the inode item and version it - */ - __le64 reserved[4]; - struct btrfs_timespec atime; - struct btrfs_timespec ctime; - struct btrfs_timespec mtime; - struct btrfs_timespec otime; -} __attribute__ ((__packed__)); - -struct btrfs_dir_log_item { - __le64 end; -} __attribute__ ((__packed__)); - -struct btrfs_dir_item { - struct btrfs_disk_key location; - __le64 transid; - __le16 data_len; - __le16 name_len; - u8 type; -} __attribute__ ((__packed__)); - -#define BTRFS_ROOT_SUBVOL_RDONLY (1ULL << 0) - -/* - * Internal in-memory flag that a subvolume has been marked for deletion but - * still visible as a directory - */ -#define BTRFS_ROOT_SUBVOL_DEAD (1ULL << 48) - -struct btrfs_root_item { - struct btrfs_inode_item inode; - __le64 generation; - __le64 root_dirid; - __le64 bytenr; - __le64 byte_limit; - __le64 bytes_used; - __le64 last_snapshot; - __le64 flags; - __le32 refs; - struct btrfs_disk_key drop_progress; - u8 drop_level; - u8 level; - - /* - * The following fields appear after subvol_uuids+subvol_times - * were introduced. - */ - - /* - * This generation number is used to test if the new fields are valid - * and up to date while reading the root item. Every time the root item - * is written out, the "generation" field is copied into this field. If - * anyone ever mounted the fs with an older kernel, we will have - * mismatching generation values here and thus must invalidate the - * new fields. See btrfs_update_root and btrfs_find_last_root for - * details. - * the offset of generation_v2 is also used as the start for the memset - * when invalidating the fields. - */ - __le64 generation_v2; - u8 uuid[BTRFS_UUID_SIZE]; - u8 parent_uuid[BTRFS_UUID_SIZE]; - u8 received_uuid[BTRFS_UUID_SIZE]; - __le64 ctransid; /* updated when an inode changes */ - __le64 otransid; /* trans when created */ - __le64 stransid; /* trans when sent. non-zero for received subvol */ - __le64 rtransid; /* trans when received. 
non-zero for received subvol */ - struct btrfs_timespec ctime; - struct btrfs_timespec otime; - struct btrfs_timespec stime; - struct btrfs_timespec rtime; - __le64 reserved[8]; /* for future */ -} __attribute__ ((__packed__)); - -/* - * this is used for both forward and backward root refs - */ -struct btrfs_root_ref { - __le64 dirid; - __le64 sequence; - __le16 name_len; -} __attribute__ ((__packed__)); - -struct btrfs_disk_balance_args { - /* - * profiles to operate on, single is denoted by - * BTRFS_AVAIL_ALLOC_BIT_SINGLE - */ - __le64 profiles; - - /* - * usage filter - * BTRFS_BALANCE_ARGS_USAGE with a single value means '0..N' - * BTRFS_BALANCE_ARGS_USAGE_RANGE - range syntax, min..max - */ - union { - __le64 usage; - struct { - __le32 usage_min; - __le32 usage_max; - }; - }; - - /* devid filter */ - __le64 devid; - - /* devid subset filter [pstart..pend) */ - __le64 pstart; - __le64 pend; - - /* btrfs virtual address space subset filter [vstart..vend) */ - __le64 vstart; - __le64 vend; - - /* - * profile to convert to, single is denoted by - * BTRFS_AVAIL_ALLOC_BIT_SINGLE - */ - __le64 target; - - /* BTRFS_BALANCE_ARGS_* */ - __le64 flags; - - /* - * BTRFS_BALANCE_ARGS_LIMIT with value 'limit' - * BTRFS_BALANCE_ARGS_LIMIT_RANGE - the extend version can use minimum - * and maximum - */ - union { - __le64 limit; - struct { - __le32 limit_min; - __le32 limit_max; - }; - }; - - /* - * Process chunks that cross stripes_min..stripes_max devices, - * BTRFS_BALANCE_ARGS_STRIPES_RANGE - */ - __le32 stripes_min; - __le32 stripes_max; - - __le64 unused[6]; -} __attribute__ ((__packed__)); - -/* - * store balance parameters to disk so that balance can be properly - * resumed after crash or unmount - */ -struct btrfs_balance_item { - /* BTRFS_BALANCE_* */ - __le64 flags; - - struct btrfs_disk_balance_args data; - struct btrfs_disk_balance_args meta; - struct btrfs_disk_balance_args sys; - - __le64 unused[4]; -} __attribute__ ((__packed__)); - -#define BTRFS_FILE_EXTENT_INLINE 0 -#define BTRFS_FILE_EXTENT_REG 1 -#define BTRFS_FILE_EXTENT_PREALLOC 2 - -struct btrfs_file_extent_item { - /* - * transaction id that created this extent - */ - __le64 generation; - /* - * max number of bytes to hold this extent in ram - * when we split a compressed extent we can't know how big - * each of the resulting pieces will be. So, this is - * an upper limit on the size of the extent in ram instead of - * an exact limit. - */ - __le64 ram_bytes; - - /* - * 32 bits for the various ways we might encode the data, - * including compression and encryption. If any of these - * are set to something a given disk format doesn't understand - * it is treated like an incompat flag for reading and writing, - * but not for stat. - */ - u8 compression; - u8 encryption; - __le16 other_encoding; /* spare for later use */ - - /* are we inline data or a real extent? */ - u8 type; - - /* - * disk space consumed by the extent, checksum blocks are included - * in these numbers - * - * At this offset in the structure, the inline extent data start. - */ - __le64 disk_bytenr; - __le64 disk_num_bytes; - /* - * the logical offset in file blocks (no csums) - * this extent record is for. This allows a file extent to point - * into the middle of an existing extent on disk, sharing it - * between two snapshots (useful if some bytes in the middle of the - * extent have changed - */ - __le64 offset; - /* - * the logical number of file blocks (no csums included). This - * always reflects the size uncompressed and without encoding. 
- */ - __le64 num_bytes; - -} __attribute__ ((__packed__)); - -struct btrfs_csum_item { - u8 csum; -} __attribute__ ((__packed__)); - -struct btrfs_dev_stats_item { - /* - * grow this item struct at the end for future enhancements and keep - * the existing values unchanged - */ - __le64 values[BTRFS_DEV_STAT_VALUES_MAX]; -} __attribute__ ((__packed__)); - -#define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS 0 -#define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID 1 -#define BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED 0 -#define BTRFS_DEV_REPLACE_ITEM_STATE_STARTED 1 -#define BTRFS_DEV_REPLACE_ITEM_STATE_SUSPENDED 2 -#define BTRFS_DEV_REPLACE_ITEM_STATE_FINISHED 3 -#define BTRFS_DEV_REPLACE_ITEM_STATE_CANCELED 4 - struct btrfs_dev_replace { u64 replace_state; /* see #define above */ u64 time_started; /* seconds since 1-Jan-1970 */ @@ -1005,175 +384,6 @@ struct btrfs_dev_replace { struct btrfs_scrub_progress scrub_progress; }; -struct btrfs_dev_replace_item { - /* - * grow this item struct at the end for future enhancements and keep - * the existing values unchanged - */ - __le64 src_devid; - __le64 cursor_left; - __le64 cursor_right; - __le64 cont_reading_from_srcdev_mode; - - __le64 replace_state; - __le64 time_started; - __le64 time_stopped; - __le64 num_write_errors; - __le64 num_uncorrectable_read_errors; -} __attribute__ ((__packed__)); - -/* different types of block groups (and chunks) */ -#define BTRFS_BLOCK_GROUP_DATA (1ULL << 0) -#define BTRFS_BLOCK_GROUP_SYSTEM (1ULL << 1) -#define BTRFS_BLOCK_GROUP_METADATA (1ULL << 2) -#define BTRFS_BLOCK_GROUP_RAID0 (1ULL << 3) -#define BTRFS_BLOCK_GROUP_RAID1 (1ULL << 4) -#define BTRFS_BLOCK_GROUP_DUP (1ULL << 5) -#define BTRFS_BLOCK_GROUP_RAID10 (1ULL << 6) -#define BTRFS_BLOCK_GROUP_RAID5 (1ULL << 7) -#define BTRFS_BLOCK_GROUP_RAID6 (1ULL << 8) -#define BTRFS_BLOCK_GROUP_RESERVED (BTRFS_AVAIL_ALLOC_BIT_SINGLE | \ - BTRFS_SPACE_INFO_GLOBAL_RSV) - -enum btrfs_raid_types { - BTRFS_RAID_RAID10, - BTRFS_RAID_RAID1, - BTRFS_RAID_DUP, - BTRFS_RAID_RAID0, - BTRFS_RAID_SINGLE, - BTRFS_RAID_RAID5, - BTRFS_RAID_RAID6, - BTRFS_NR_RAID_TYPES -}; - -#define BTRFS_BLOCK_GROUP_TYPE_MASK (BTRFS_BLOCK_GROUP_DATA | \ - BTRFS_BLOCK_GROUP_SYSTEM | \ - BTRFS_BLOCK_GROUP_METADATA) - -#define BTRFS_BLOCK_GROUP_PROFILE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \ - BTRFS_BLOCK_GROUP_RAID1 | \ - BTRFS_BLOCK_GROUP_RAID5 | \ - BTRFS_BLOCK_GROUP_RAID6 | \ - BTRFS_BLOCK_GROUP_DUP | \ - BTRFS_BLOCK_GROUP_RAID10) -#define BTRFS_BLOCK_GROUP_RAID56_MASK (BTRFS_BLOCK_GROUP_RAID5 | \ - BTRFS_BLOCK_GROUP_RAID6) - -/* - * We need a bit for restriper to be able to tell when chunks of type - * SINGLE are available. This "extended" profile format is used in - * fs_info->avail_*_alloc_bits (in-memory) and balance item fields - * (on-disk). The corresponding on-disk bit in chunk.type is reserved - * to avoid remappings between two formats in future. - */ -#define BTRFS_AVAIL_ALLOC_BIT_SINGLE (1ULL << 48) - -/* - * A fake block group type that is used to communicate global block reserve - * size to userspace via the SPACE_INFO ioctl. 
- */ -#define BTRFS_SPACE_INFO_GLOBAL_RSV (1ULL << 49) - -#define BTRFS_EXTENDED_PROFILE_MASK (BTRFS_BLOCK_GROUP_PROFILE_MASK | \ - BTRFS_AVAIL_ALLOC_BIT_SINGLE) - -static inline u64 chunk_to_extended(u64 flags) -{ - if ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0) - flags |= BTRFS_AVAIL_ALLOC_BIT_SINGLE; - - return flags; -} -static inline u64 extended_to_chunk(u64 flags) -{ - return flags & ~BTRFS_AVAIL_ALLOC_BIT_SINGLE; -} - -struct btrfs_block_group_item { - __le64 used; - __le64 chunk_objectid; - __le64 flags; -} __attribute__ ((__packed__)); - -struct btrfs_free_space_info { - __le32 extent_count; - __le32 flags; -} __attribute__ ((__packed__)); - -#define BTRFS_FREE_SPACE_USING_BITMAPS (1ULL << 0) - -#define BTRFS_QGROUP_LEVEL_SHIFT 48 -static inline u64 btrfs_qgroup_level(u64 qgroupid) -{ - return qgroupid >> BTRFS_QGROUP_LEVEL_SHIFT; -} - -/* - * is subvolume quota turned on? - */ -#define BTRFS_QGROUP_STATUS_FLAG_ON (1ULL << 0) -/* - * RESCAN is set during the initialization phase - */ -#define BTRFS_QGROUP_STATUS_FLAG_RESCAN (1ULL << 1) -/* - * Some qgroup entries are known to be out of date, - * either because the configuration has changed in a way that - * makes a rescan necessary, or because the fs has been mounted - * with a non-qgroup-aware version. - * Turning qouta off and on again makes it inconsistent, too. - */ -#define BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT (1ULL << 2) - -#define BTRFS_QGROUP_STATUS_VERSION 1 - -struct btrfs_qgroup_status_item { - __le64 version; - /* - * the generation is updated during every commit. As older - * versions of btrfs are not aware of qgroups, it will be - * possible to detect inconsistencies by checking the - * generation on mount time - */ - __le64 generation; - - /* flag definitions see above */ - __le64 flags; - - /* - * only used during scanning to record the progress - * of the scan. It contains a logical address - */ - __le64 rescan; -} __attribute__ ((__packed__)); - -struct btrfs_qgroup_info_item { - __le64 generation; - __le64 rfer; - __le64 rfer_cmpr; - __le64 excl; - __le64 excl_cmpr; -} __attribute__ ((__packed__)); - -/* flags definition for qgroup limits */ -#define BTRFS_QGROUP_LIMIT_MAX_RFER (1ULL << 0) -#define BTRFS_QGROUP_LIMIT_MAX_EXCL (1ULL << 1) -#define BTRFS_QGROUP_LIMIT_RSV_RFER (1ULL << 2) -#define BTRFS_QGROUP_LIMIT_RSV_EXCL (1ULL << 3) -#define BTRFS_QGROUP_LIMIT_RFER_CMPR (1ULL << 4) -#define BTRFS_QGROUP_LIMIT_EXCL_CMPR (1ULL << 5) - -struct btrfs_qgroup_limit_item { - /* - * only updated when any of the other values change - */ - __le64 flags; - __le64 max_rfer; - __le64 max_excl; - __le64 rsv_rfer; - __le64 rsv_excl; -} __attribute__ ((__packed__)); - /* For raid type sysfs entries */ struct raid_kobject { int raid_type; @@ -1221,7 +431,7 @@ struct btrfs_space_info { * bytes_pinned does not reflect the bytes that will be pinned once the * delayed refs are flushed, so this counter is inc'ed every time we * call btrfs_free_extent so it is a realtime count of what will be - * freed once the transaction is committed. It will be zero'ed every + * freed once the transaction is committed. It will be zeroed every * time the transaction commits. */ struct percpu_counter total_bytes_pinned; @@ -1408,6 +618,27 @@ struct btrfs_block_group_cache { struct btrfs_io_ctl io_ctl; + /* + * Incremented when doing extent allocations and holding a read lock + * on the space_info's groups_sem semaphore. 
+ * Decremented when an ordered extent that represents an IO against this + * block group's range is created (after it's added to its inode's + * root's list of ordered extents) or immediately after the allocation + * if it's a metadata extent or fallocate extent (for these cases we + * don't create ordered extents). + */ + atomic_t reservations; + + /* + * Incremented while holding the spinlock *lock* by a task checking if + * it can perform a nocow write (incremented if the value for the *ro* + * field is 0). Decremented by such tasks once they create an ordered + * extent or before that if some error happens before reaching that step. + * This is to prevent races between block group relocation and nocow + * writes through direct IO. + */ + atomic_t nocow_writers; + /* Lock for free space tree operations. */ struct mutex free_space_lock; @@ -2026,228 +1257,6 @@ struct btrfs_root { atomic_t qgroup_meta_rsv; }; -struct btrfs_ioctl_defrag_range_args { - /* start of the defrag operation */ - __u64 start; - - /* number of bytes to defrag, use (u64)-1 to say all */ - __u64 len; - - /* - * flags for the operation, which can include turning - * on compression for this one defrag - */ - __u64 flags; - - /* - * any extent bigger than this will be considered - * already defragged. Use 0 to take the kernel default - * Use 1 to say every single extent must be rewritten - */ - __u32 extent_thresh; - - /* - * which compression method to use if turning on compression - * for this defrag operation. If unspecified, zlib will - * be used - */ - __u32 compress_type; - - /* spare for later */ - __u32 unused[4]; -}; - - -/* - * inode items have the data typically returned from stat and store other - * info about object characteristics. There is one for every file and dir in - * the FS - */ -#define BTRFS_INODE_ITEM_KEY 1 -#define BTRFS_INODE_REF_KEY 12 -#define BTRFS_INODE_EXTREF_KEY 13 -#define BTRFS_XATTR_ITEM_KEY 24 -#define BTRFS_ORPHAN_ITEM_KEY 48 -/* reserve 2-15 close to the inode for later flexibility */ - -/* - * dir items are the name -> inode pointers in a directory. There is one - * for every name in a directory. - */ -#define BTRFS_DIR_LOG_ITEM_KEY 60 -#define BTRFS_DIR_LOG_INDEX_KEY 72 -#define BTRFS_DIR_ITEM_KEY 84 -#define BTRFS_DIR_INDEX_KEY 96 -/* - * extent data is for file data - */ -#define BTRFS_EXTENT_DATA_KEY 108 - -/* - * extent csums are stored in a separate tree and hold csums for - * an entire extent on disk. - */ -#define BTRFS_EXTENT_CSUM_KEY 128 - -/* - * root items point to tree roots. They are typically in the root - * tree used by the super block to find all the other trees - */ -#define BTRFS_ROOT_ITEM_KEY 132 - -/* - * root backrefs tie subvols and snapshots to the directory entries that - * reference them - */ -#define BTRFS_ROOT_BACKREF_KEY 144 - -/* - * root refs make a fast index for listing all of the snapshots and - * subvolumes referenced by a given root. They point directly to the - * directory item in the root that references the subvol - */ -#define BTRFS_ROOT_REF_KEY 156 - -/* - * extent items are in the extent map tree. These record which blocks - * are used, and how many references there are to each block - */ -#define BTRFS_EXTENT_ITEM_KEY 168 - -/* - * The same as the BTRFS_EXTENT_ITEM_KEY, except it's metadata we already know - * the length, so we save the level in key->offset instead of the length. 
- */ -#define BTRFS_METADATA_ITEM_KEY 169 - -#define BTRFS_TREE_BLOCK_REF_KEY 176 - -#define BTRFS_EXTENT_DATA_REF_KEY 178 - -#define BTRFS_EXTENT_REF_V0_KEY 180 - -#define BTRFS_SHARED_BLOCK_REF_KEY 182 - -#define BTRFS_SHARED_DATA_REF_KEY 184 - -/* - * block groups give us hints into the extent allocation trees. Which - * blocks are free etc etc - */ -#define BTRFS_BLOCK_GROUP_ITEM_KEY 192 - -/* - * Every block group is represented in the free space tree by a free space info - * item, which stores some accounting information. It is keyed on - * (block_group_start, FREE_SPACE_INFO, block_group_length). - */ -#define BTRFS_FREE_SPACE_INFO_KEY 198 - -/* - * A free space extent tracks an extent of space that is free in a block group. - * It is keyed on (start, FREE_SPACE_EXTENT, length). - */ -#define BTRFS_FREE_SPACE_EXTENT_KEY 199 - -/* - * When a block group becomes very fragmented, we convert it to use bitmaps - * instead of extents. A free space bitmap is keyed on - * (start, FREE_SPACE_BITMAP, length); the corresponding item is a bitmap with - * (length / sectorsize) bits. - */ -#define BTRFS_FREE_SPACE_BITMAP_KEY 200 - -#define BTRFS_DEV_EXTENT_KEY 204 -#define BTRFS_DEV_ITEM_KEY 216 -#define BTRFS_CHUNK_ITEM_KEY 228 - -/* - * Records the overall state of the qgroups. - * There's only one instance of this key present, - * (0, BTRFS_QGROUP_STATUS_KEY, 0) - */ -#define BTRFS_QGROUP_STATUS_KEY 240 -/* - * Records the currently used space of the qgroup. - * One key per qgroup, (0, BTRFS_QGROUP_INFO_KEY, qgroupid). - */ -#define BTRFS_QGROUP_INFO_KEY 242 -/* - * Contains the user configured limits for the qgroup. - * One key per qgroup, (0, BTRFS_QGROUP_LIMIT_KEY, qgroupid). - */ -#define BTRFS_QGROUP_LIMIT_KEY 244 -/* - * Records the child-parent relationship of qgroups. For - * each relation, 2 keys are present: - * (childid, BTRFS_QGROUP_RELATION_KEY, parentid) - * (parentid, BTRFS_QGROUP_RELATION_KEY, childid) - */ -#define BTRFS_QGROUP_RELATION_KEY 246 - -/* - * Obsolete name, see BTRFS_TEMPORARY_ITEM_KEY. - */ -#define BTRFS_BALANCE_ITEM_KEY 248 - -/* - * The key type for tree items that are stored persistently, but do not need to - * exist for extended period of time. The items can exist in any tree. - * - * [subtype, BTRFS_TEMPORARY_ITEM_KEY, data] - * - * Existing items: - * - * - balance status item - * (BTRFS_BALANCE_OBJECTID, BTRFS_TEMPORARY_ITEM_KEY, 0) - */ -#define BTRFS_TEMPORARY_ITEM_KEY 248 - -/* - * Obsolete name, see BTRFS_PERSISTENT_ITEM_KEY - */ -#define BTRFS_DEV_STATS_KEY 249 - -/* - * The key type for tree items that are stored persistently and usually exist - * for a long period, eg. filesystem lifetime. The item kinds can be status - * information, stats or preference values. The item can exist in any tree. - * - * [subtype, BTRFS_PERSISTENT_ITEM_KEY, data] - * - * Existing items: - * - * - device statistics, store IO stats in the device tree, one key for all - * stats - * (BTRFS_DEV_STATS_OBJECTID, BTRFS_DEV_STATS_KEY, 0) - */ -#define BTRFS_PERSISTENT_ITEM_KEY 249 - -/* - * Persistantly stores the device replace state in the device tree. - * The key is built like this: (0, BTRFS_DEV_REPLACE_KEY, 0). - */ -#define BTRFS_DEV_REPLACE_KEY 250 - -/* - * Stores items that allow to quickly map UUIDs to something else. - * These items are part of the filesystem UUID tree. - * The key is built like this: - * (UUID_upper_64_bits, BTRFS_UUID_KEY*, UUID_lower_64_bits). - */ -#if BTRFS_UUID_SIZE != 16 -#error "UUID items require BTRFS_UUID_SIZE == 16!" 
-#endif -#define BTRFS_UUID_KEY_SUBVOL 251 /* for UUIDs assigned to subvols */ -#define BTRFS_UUID_KEY_RECEIVED_SUBVOL 252 /* for UUIDs assigned to - * received subvols */ - -/* - * string items are for debugging. They just store a short string of - * data in the FS - */ -#define BTRFS_STRING_ITEM_KEY 253 - /* * Flags for mount options. * @@ -2392,7 +1401,7 @@ static inline void btrfs_init_map_token (struct btrfs_map_token *token) token->kaddr = NULL; } -/* some macros to generate set/get funcs for the struct fields. This +/* some macros to generate set/get functions for the struct fields. This * assumes there is a lefoo_to_cpu for every type, so lets make a simple * one for u8: */ @@ -3499,11 +2508,17 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_root *root); +void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info, + const u64 start); +void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg); +bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr); +void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr); +void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg); void btrfs_put_block_group(struct btrfs_block_group_cache *cache); int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_root *root, unsigned long count); int btrfs_async_run_delayed_refs(struct btrfs_root *root, - unsigned long count, int wait); + unsigned long count, u64 transid, int wait); int btrfs_lookup_data_extent(struct btrfs_root *root, u64 start, u64 len); int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, @@ -4076,7 +3091,7 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, struct btrfs_root *new_root, struct btrfs_root *parent_root, u64 new_dirid); -int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset, +int btrfs_merge_bio_hook(struct page *page, unsigned long offset, size_t size, struct bio *bio, unsigned long bio_flags); int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); @@ -4122,6 +3137,7 @@ void btrfs_test_inode_set_ops(struct inode *inode); /* ioctl.c */ long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); +long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); int btrfs_ioctl_get_supported_features(void __user *arg); void btrfs_update_iflags(struct inode *inode); void btrfs_inherit_iflags(struct inode *inode, struct inode *dir); @@ -4326,10 +3342,9 @@ static inline void assfail(char *expr, char *file, int line) #define ASSERT(expr) ((void)0) #endif -#define btrfs_assert() __printf(5, 6) __cold -void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, +void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function, unsigned int line, int errno, const char *fmt, ...); const char *btrfs_decode_error(int errno); @@ -4339,6 +3354,46 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *function, unsigned int line, int errno); +/* + * Call btrfs_abort_transaction as early as possible when an error condition is + * detected, that way the exact line number is reported. 
+ */ +#define btrfs_abort_transaction(trans, root, errno) \ +do { \ + /* Report first abort since mount */ \ + if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \ + &((root)->fs_info->fs_state))) { \ + WARN(1, KERN_DEBUG \ + "BTRFS: Transaction aborted (error %d)\n", \ + (errno)); \ + } \ + __btrfs_abort_transaction((trans), (root), __func__, \ + __LINE__, (errno)); \ +} while (0) + +#define btrfs_handle_fs_error(fs_info, errno, fmt, args...) \ +do { \ + __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \ + (errno), fmt, ##args); \ +} while (0) + +__printf(5, 6) +__cold +void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, + unsigned int line, int errno, const char *fmt, ...); +/* + * If BTRFS_MOUNT_PANIC_ON_FATAL_ERROR is in mount_opt, __btrfs_panic + * will panic(). Otherwise we BUG() here. + */ +#define btrfs_panic(fs_info, errno, fmt, args...) \ +do { \ + __btrfs_panic(fs_info, __func__, __LINE__, errno, fmt, ##args); \ + BUG(); \ +} while (0) + + +/* compatibility and incompatibility defines */ + #define btrfs_set_fs_incompat(__fs_info, opt) \ __btrfs_set_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt) @@ -4455,44 +3510,6 @@ static inline int __btrfs_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag) return !!(btrfs_super_compat_ro_flags(disk_super) & flag); } -/* - * Call btrfs_abort_transaction as early as possible when an error condition is - * detected, that way the exact line number is reported. - */ -#define btrfs_abort_transaction(trans, root, errno) \ -do { \ - /* Report first abort since mount */ \ - if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \ - &((root)->fs_info->fs_state))) { \ - WARN(1, KERN_DEBUG \ - "BTRFS: Transaction aborted (error %d)\n", \ - (errno)); \ - } \ - __btrfs_abort_transaction((trans), (root), __func__, \ - __LINE__, (errno)); \ -} while (0) - -#define btrfs_std_error(fs_info, errno, fmt, args...) \ -do { \ - __btrfs_std_error((fs_info), __func__, __LINE__, \ - (errno), fmt, ##args); \ -} while (0) - -__printf(5, 6) -__cold -void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, - unsigned int line, int errno, const char *fmt, ...); - -/* - * If BTRFS_MOUNT_PANIC_ON_FATAL_ERROR is in mount_opt, __btrfs_panic - * will panic(). Otherwise we BUG() here. - */ -#define btrfs_panic(fs_info, errno, fmt, args...) 
\ -do { \ - __btrfs_panic(fs_info, __func__, __LINE__, errno, fmt, ##args); \ - BUG(); \ -} while (0) - /* acl.c */ #ifdef CONFIG_BTRFS_FS_POSIX_ACL struct posix_acl *btrfs_get_acl(struct inode *inode, int type); diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 6cef0062f929..d3aaabbfada0 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -134,7 +134,7 @@ again: /* cached in the btrfs inode and can be accessed */ atomic_add(2, &node->refs); - ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); + ret = radix_tree_preload(GFP_NOFS); if (ret) { kmem_cache_free(delayed_node_cache, node); return ERR_PTR(ret); @@ -1606,15 +1606,23 @@ int btrfs_inode_delayed_dir_index_count(struct inode *inode) return 0; } -void btrfs_get_delayed_items(struct inode *inode, struct list_head *ins_list, - struct list_head *del_list) +bool btrfs_readdir_get_delayed_items(struct inode *inode, + struct list_head *ins_list, + struct list_head *del_list) { struct btrfs_delayed_node *delayed_node; struct btrfs_delayed_item *item; delayed_node = btrfs_get_delayed_node(inode); if (!delayed_node) - return; + return false; + + /* + * We can only do one readdir with delayed items at a time because of + * item->readdir_list. + */ + inode_unlock_shared(inode); + inode_lock(inode); mutex_lock(&delayed_node->mutex); item = __btrfs_first_delayed_insertion_item(delayed_node); @@ -1641,10 +1649,13 @@ void btrfs_get_delayed_items(struct inode *inode, struct list_head *ins_list, * requeue or dequeue this delayed node. */ atomic_dec(&delayed_node->refs); + + return true; } -void btrfs_put_delayed_items(struct list_head *ins_list, - struct list_head *del_list) +void btrfs_readdir_put_delayed_items(struct inode *inode, + struct list_head *ins_list, + struct list_head *del_list) { struct btrfs_delayed_item *curr, *next; @@ -1659,6 +1670,12 @@ void btrfs_put_delayed_items(struct list_head *ins_list, if (atomic_dec_and_test(&curr->refs)) kfree(curr); } + + /* + * The VFS is going to do up_read(), so we need to downgrade back to a + * read lock. + */ + downgrade_write(&inode->i_rwsem); } int btrfs_should_delete_dir_index(struct list_head *del_list, diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index 0167853c84ae..2495b3d4075f 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -137,10 +137,12 @@ void btrfs_kill_all_delayed_nodes(struct btrfs_root *root); void btrfs_destroy_delayed_inodes(struct btrfs_root *root); /* Used for readdir() */ -void btrfs_get_delayed_items(struct inode *inode, struct list_head *ins_list, - struct list_head *del_list); -void btrfs_put_delayed_items(struct list_head *ins_list, - struct list_head *del_list); +bool btrfs_readdir_get_delayed_items(struct inode *inode, + struct list_head *ins_list, + struct list_head *del_list); +void btrfs_readdir_put_delayed_items(struct inode *inode, + struct list_head *ins_list, + struct list_head *del_list); int btrfs_should_delete_dir_index(struct list_head *del_list, u64 index); int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index c24b653c7343..5fca9534a271 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -188,7 +188,7 @@ struct btrfs_delayed_ref_root { /* * To make qgroup to skip given root. 
- * This is for snapshot, as btrfs_qgroup_inherit() will manully + * This is for snapshot, as btrfs_qgroup_inherit() will manually * modify counters for snapshot and its source, so we should skip * the snapshot in new_root/old_roots or it will get calculated twice */ diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 26bcb487f958..63ef9cdf0144 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -44,9 +44,6 @@ static void btrfs_dev_replace_update_device_in_mapping_tree( struct btrfs_fs_info *fs_info, struct btrfs_device *srcdev, struct btrfs_device *tgtdev); -static int btrfs_dev_replace_find_srcdev(struct btrfs_root *root, u64 srcdevid, - char *srcdev_name, - struct btrfs_device **device); static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info); static int btrfs_dev_replace_kthread(void *data); static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info); @@ -305,8 +302,8 @@ void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info) dev_replace->cursor_left_last_write_of_item; } -int btrfs_dev_replace_start(struct btrfs_root *root, - struct btrfs_ioctl_dev_replace_args *args) +int btrfs_dev_replace_start(struct btrfs_root *root, char *tgtdev_name, + u64 srcdevid, char *srcdev_name, int read_src) { struct btrfs_trans_handle *trans; struct btrfs_fs_info *fs_info = root->fs_info; @@ -315,29 +312,16 @@ int btrfs_dev_replace_start(struct btrfs_root *root, struct btrfs_device *tgt_device = NULL; struct btrfs_device *src_device = NULL; - switch (args->start.cont_reading_from_srcdev_mode) { - case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS: - case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID: - break; - default: - return -EINVAL; - } - - if ((args->start.srcdevid == 0 && args->start.srcdev_name[0] == '\0') || - args->start.tgtdev_name[0] == '\0') - return -EINVAL; - /* the disk copy procedure reuses the scrub code */ mutex_lock(&fs_info->volume_mutex); - ret = btrfs_dev_replace_find_srcdev(root, args->start.srcdevid, - args->start.srcdev_name, - &src_device); + ret = btrfs_find_device_by_devspec(root, srcdevid, + srcdev_name, &src_device); if (ret) { mutex_unlock(&fs_info->volume_mutex); return ret; } - ret = btrfs_init_dev_replace_tgtdev(root, args->start.tgtdev_name, + ret = btrfs_init_dev_replace_tgtdev(root, tgtdev_name, src_device, &tgt_device); mutex_unlock(&fs_info->volume_mutex); if (ret) @@ -364,18 +348,17 @@ int btrfs_dev_replace_start(struct btrfs_root *root, break; case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: - args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED; + ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED; goto leave; } - dev_replace->cont_reading_from_srcdev_mode = - args->start.cont_reading_from_srcdev_mode; + dev_replace->cont_reading_from_srcdev_mode = read_src; WARN_ON(!src_device); dev_replace->srcdev = src_device; WARN_ON(!tgt_device); dev_replace->tgtdev = tgt_device; - btrfs_info_in_rcu(root->fs_info, + btrfs_info_in_rcu(fs_info, "dev_replace from %s (devid %llu) to %s started", src_device->missing ? 
"<missing disk>" : rcu_str_deref(src_device->name), @@ -396,14 +379,13 @@ int btrfs_dev_replace_start(struct btrfs_root *root, dev_replace->item_needs_writeback = 1; atomic64_set(&dev_replace->num_write_errors, 0); atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0); - args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; btrfs_dev_replace_unlock(dev_replace, 1); ret = btrfs_sysfs_add_device_link(tgt_device->fs_devices, tgt_device); if (ret) - btrfs_err(root->fs_info, "kobj add dev failed %d\n", ret); + btrfs_err(fs_info, "kobj add dev failed %d\n", ret); - btrfs_wait_ordered_roots(root->fs_info, -1); + btrfs_wait_ordered_roots(root->fs_info, -1, 0, (u64)-1); /* force writing the updated state information to disk */ trans = btrfs_start_transaction(root, 0); @@ -421,11 +403,9 @@ int btrfs_dev_replace_start(struct btrfs_root *root, btrfs_device_get_total_bytes(src_device), &dev_replace->scrub_progress, 0, 1); - ret = btrfs_dev_replace_finishing(root->fs_info, ret); - /* don't warn if EINPROGRESS, someone else might be running scrub */ + ret = btrfs_dev_replace_finishing(fs_info, ret); if (ret == -EINPROGRESS) { - args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS; - ret = 0; + ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS; } else { WARN_ON(ret); } @@ -440,8 +420,37 @@ leave: return ret; } +int btrfs_dev_replace_by_ioctl(struct btrfs_root *root, + struct btrfs_ioctl_dev_replace_args *args) +{ + int ret; + + switch (args->start.cont_reading_from_srcdev_mode) { + case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS: + case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID: + break; + default: + return -EINVAL; + } + + if ((args->start.srcdevid == 0 && args->start.srcdev_name[0] == '\0') || + args->start.tgtdev_name[0] == '\0') + return -EINVAL; + + ret = btrfs_dev_replace_start(root, args->start.tgtdev_name, + args->start.srcdevid, + args->start.srcdev_name, + args->start.cont_reading_from_srcdev_mode); + args->result = ret; + /* don't warn if EINPROGRESS, someone else might be running scrub */ + if (ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS) + ret = 0; + + return ret; +} + /* - * blocked until all flighting bios are finished. + * blocked until all in-flight bios operations are finished. 
*/ static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info) { @@ -495,7 +504,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); return ret; } - btrfs_wait_ordered_roots(root->fs_info, -1); + btrfs_wait_ordered_roots(root->fs_info, -1, 0, (u64)-1); trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { @@ -560,10 +569,9 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, ASSERT(list_empty(&src_device->resized_list)); tgt_device->commit_total_bytes = src_device->commit_total_bytes; tgt_device->commit_bytes_used = src_device->bytes_used; - if (fs_info->sb->s_bdev == src_device->bdev) - fs_info->sb->s_bdev = tgt_device->bdev; - if (fs_info->fs_devices->latest_bdev == src_device->bdev) - fs_info->fs_devices->latest_bdev = tgt_device->bdev; + + btrfs_assign_next_active_device(fs_info, src_device, tgt_device); + list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list); fs_info->fs_devices->rw_devices++; @@ -626,25 +634,6 @@ static void btrfs_dev_replace_update_device_in_mapping_tree( write_unlock(&em_tree->lock); } -static int btrfs_dev_replace_find_srcdev(struct btrfs_root *root, u64 srcdevid, - char *srcdev_name, - struct btrfs_device **device) -{ - int ret; - - if (srcdevid) { - ret = 0; - *device = btrfs_find_device(root->fs_info, srcdevid, NULL, - NULL); - if (!*device) - ret = -ENOENT; - } else { - ret = btrfs_find_device_missing_or_by_path(root, srcdev_name, - device); - } - return ret; -} - void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_dev_replace_args *args) { diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h index 29e3ef5f96bd..e922b42d91df 100644 --- a/fs/btrfs/dev-replace.h +++ b/fs/btrfs/dev-replace.h @@ -25,8 +25,10 @@ int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info); int btrfs_run_dev_replace(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info); -int btrfs_dev_replace_start(struct btrfs_root *root, +int btrfs_dev_replace_by_ioctl(struct btrfs_root *root, struct btrfs_ioctl_dev_replace_args *args); +int btrfs_dev_replace_start(struct btrfs_root *root, char *tgtdev_name, + u64 srcdevid, char *srcdev_name, int read_src); void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_dev_replace_args *args); int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 4e47849d7427..9a726ded2c6d 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -124,7 +124,6 @@ struct async_submit_bio { struct list_head list; extent_submit_bio_hook_t *submit_bio_start; extent_submit_bio_hook_t *submit_bio_done; - int rw; int mirror_num; unsigned long bio_flags; /* @@ -384,7 +383,7 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, /* * Things reading via commit roots that don't have normal protection, * like send, can have a really old block in cache that may point at a - * block that has been free'd and re-allocated. So don't clear uptodate + * block that has been freed and re-allocated. So don't clear uptodate * if we find an eb that is under IO (dirty/writeback) because we could * end up reading in the stale data and then writing it back out and * making everybody very sad. 
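The dev-replace change a few hunks up follows a common refactoring shape: btrfs_dev_replace_by_ioctl() keeps the validation and result translation that only make sense at the ioctl boundary, while btrfs_dev_replace_start() takes plain parameters so in-kernel callers can invoke it without faking up an ioctl struct. A reduced standalone sketch of that split (all names below are illustrative stand-ins, not kernel API):

#include <errno.h>

#define MODE_ALWAYS		0
#define MODE_AVOID		1
#define RESULT_SCRUB_INPROGRESS	3

struct replace_args {
	unsigned long long srcdevid;
	char srcdev_name[64];
	char tgtdev_name[64];
	int read_src_mode;
	int result;
};

/* Core operation: plain parameters, no knowledge of the ioctl struct. */
static int replace_start(const char *tgtdev, unsigned long long srcdevid,
			 const char *srcdev, int read_src)
{
	/* ... locate source device, init target, run the copy ... */
	(void)tgtdev; (void)srcdevid; (void)srcdev; (void)read_src;
	return 0;
}

/* ioctl boundary: validate, unpack, translate the status code. */
static int replace_by_ioctl(struct replace_args *args)
{
	int ret;

	if (args->read_src_mode != MODE_ALWAYS &&
	    args->read_src_mode != MODE_AVOID)
		return -EINVAL;
	if ((args->srcdevid == 0 && args->srcdev_name[0] == '\0') ||
	    args->tgtdev_name[0] == '\0')
		return -EINVAL;

	ret = replace_start(args->tgtdev_name, args->srcdevid,
			    args->srcdev_name, args->read_src_mode);
	args->result = ret;
	/* "scrub already running" is reported to userspace, not an error */
	if (ret == RESULT_SCRUB_INPROGRESS)
		ret = 0;
	return ret;
}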
@@ -418,7 +417,7 @@ static int btrfs_check_super_csum(char *raw_disk_sb) /* * The super_block structure does not span the whole * BTRFS_SUPER_INFO_SIZE range, we expect that the unused space - * is filled with zeros and is included in the checkum. + * is filled with zeros and is included in the checksum. */ crc = btrfs_csum_data(raw_disk_sb + BTRFS_CSUM_SIZE, crc, BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE); @@ -600,7 +599,7 @@ static noinline int check_leaf(struct btrfs_root *root, /* * Check to make sure that we don't point outside of the leaf, - * just incase all the items are consistent to eachother, but + * just in case all the items are consistent to each other, but * all point outside of the leaf. */ if (btrfs_item_end_nr(leaf, slot) > @@ -727,7 +726,7 @@ static void end_workqueue_bio(struct bio *bio) fs_info = end_io_wq->info; end_io_wq->error = bio->bi_error; - if (bio->bi_rw & REQ_WRITE) { + if (bio_op(bio) == REQ_OP_WRITE) { if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA) { wq = fs_info->endio_meta_write_workers; func = btrfs_endio_meta_write_helper; @@ -797,7 +796,7 @@ static void run_one_async_start(struct btrfs_work *work) int ret; async = container_of(work, struct async_submit_bio, work); - ret = async->submit_bio_start(async->inode, async->rw, async->bio, + ret = async->submit_bio_start(async->inode, async->bio, async->mirror_num, async->bio_flags, async->bio_offset); if (ret) @@ -830,9 +829,8 @@ static void run_one_async_done(struct btrfs_work *work) return; } - async->submit_bio_done(async->inode, async->rw, async->bio, - async->mirror_num, async->bio_flags, - async->bio_offset); + async->submit_bio_done(async->inode, async->bio, async->mirror_num, + async->bio_flags, async->bio_offset); } static void run_one_async_free(struct btrfs_work *work) @@ -844,7 +842,7 @@ static void run_one_async_free(struct btrfs_work *work) } int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, - int rw, struct bio *bio, int mirror_num, + struct bio *bio, int mirror_num, unsigned long bio_flags, u64 bio_offset, extent_submit_bio_hook_t *submit_bio_start, @@ -857,7 +855,6 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, return -ENOMEM; async->inode = inode; - async->rw = rw; async->bio = bio; async->mirror_num = mirror_num; async->submit_bio_start = submit_bio_start; @@ -873,7 +870,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, atomic_inc(&fs_info->nr_async_submits); - if (rw & REQ_SYNC) + if (bio->bi_rw & REQ_SYNC) btrfs_set_work_high_priority(&async->work); btrfs_queue_work(fs_info->workers, &async->work); @@ -903,9 +900,8 @@ static int btree_csum_one_bio(struct bio *bio) return ret; } -static int __btree_submit_bio_start(struct inode *inode, int rw, - struct bio *bio, int mirror_num, - unsigned long bio_flags, +static int __btree_submit_bio_start(struct inode *inode, struct bio *bio, + int mirror_num, unsigned long bio_flags, u64 bio_offset) { /* @@ -915,7 +911,7 @@ static int __btree_submit_bio_start(struct inode *inode, int rw, return btree_csum_one_bio(bio); } -static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio, +static int __btree_submit_bio_done(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags, u64 bio_offset) { @@ -925,7 +921,7 @@ static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio, * when we're called for a write, we're already in the async * submission context. 
Just jump into btrfs_map_bio */ - ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1); + ret = btrfs_map_bio(BTRFS_I(inode)->root, bio, mirror_num, 1); if (ret) { bio->bi_error = ret; bio_endio(bio); @@ -944,14 +940,14 @@ static int check_async_write(struct inode *inode, unsigned long bio_flags) return 1; } -static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, +static int btree_submit_bio_hook(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags, u64 bio_offset) { int async = check_async_write(inode, bio_flags); int ret; - if (!(rw & REQ_WRITE)) { + if (bio_op(bio) != REQ_OP_WRITE) { /* * called for a read, do the setup so that checksum validation * can happen in the async kernel threads @@ -960,21 +956,19 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, bio, BTRFS_WQ_ENDIO_METADATA); if (ret) goto out_w_error; - ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, - mirror_num, 0); + ret = btrfs_map_bio(BTRFS_I(inode)->root, bio, mirror_num, 0); } else if (!async) { ret = btree_csum_one_bio(bio); if (ret) goto out_w_error; - ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, - mirror_num, 0); + ret = btrfs_map_bio(BTRFS_I(inode)->root, bio, mirror_num, 0); } else { /* * kthread helpers are used to submit writes so that * checksumming can happen in parallel across all CPUs */ ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, - inode, rw, bio, mirror_num, 0, + inode, bio, mirror_num, 0, bio_offset, __btree_submit_bio_start, __btree_submit_bio_done); @@ -1098,7 +1092,7 @@ void readahead_tree_block(struct btrfs_root *root, u64 bytenr) struct inode *btree_inode = root->fs_info->btree_inode; buf = btrfs_find_create_tree_block(root, bytenr); - if (!buf) + if (IS_ERR(buf)) return; read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, buf, 0, WAIT_NONE, btree_get_extent, 0); @@ -1114,7 +1108,7 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, int ret; buf = btrfs_find_create_tree_block(root, bytenr); - if (!buf) + if (IS_ERR(buf)) return 0; set_bit(EXTENT_BUFFER_READAHEAD, &buf->bflags); @@ -1147,7 +1141,8 @@ struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, u64 bytenr) { if (btrfs_test_is_dummy_root(root)) - return alloc_test_extent_buffer(root->fs_info, bytenr); + return alloc_test_extent_buffer(root->fs_info, bytenr, + root->nodesize); return alloc_extent_buffer(root->fs_info, bytenr); } @@ -1171,8 +1166,8 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, int ret; buf = btrfs_find_create_tree_block(root, bytenr); - if (!buf) - return ERR_PTR(-ENOMEM); + if (IS_ERR(buf)) + return buf; ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid); if (ret) { @@ -1314,14 +1309,16 @@ static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info, #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS /* Should only be used by the testing infrastructure */ -struct btrfs_root *btrfs_alloc_dummy_root(void) +struct btrfs_root *btrfs_alloc_dummy_root(u32 sectorsize, u32 nodesize) { struct btrfs_root *root; root = btrfs_alloc_root(NULL, GFP_KERNEL); if (!root) return ERR_PTR(-ENOMEM); - __setup_root(4096, 4096, 4096, root, NULL, 1); + /* We don't use the stripesize in selftest, set it as sectorsize */ + __setup_root(nodesize, sectorsize, sectorsize, root, NULL, + BTRFS_ROOT_TREE_OBJECTID); set_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state); root->alloc_bytenr = 0; @@ -1640,7 +1637,7 @@ int btrfs_insert_fs_root(struct btrfs_fs_info 
*fs_info, { int ret; - ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); + ret = radix_tree_preload(GFP_NOFS); if (ret) return ret; @@ -1803,6 +1800,13 @@ static int cleaner_kthread(void *arg) if (btrfs_need_cleaner_sleep(root)) goto sleep; + /* + * Do not do anything if we might cause open_ctree() to block + * before we have finished mounting the filesystem. + */ + if (!root->fs_info->open) + goto sleep; + if (!mutex_trylock(&root->fs_info->cleaner_mutex)) goto sleep; @@ -2417,7 +2421,7 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, /* returns with log_tree_root freed on success */ ret = btrfs_recover_log_trees(log_tree_root); if (ret) { - btrfs_std_error(tree_root->fs_info, ret, + btrfs_handle_fs_error(tree_root->fs_info, ret, "Failed to recover log tree"); free_extent_buffer(log_tree_root->node); kfree(log_tree_root); @@ -2713,7 +2717,7 @@ int open_ctree(struct super_block *sb, * Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k). */ if (btrfs_check_super_csum(bh->b_data)) { - printk(KERN_ERR "BTRFS: superblock checksum mismatch\n"); + btrfs_err(fs_info, "superblock checksum mismatch"); err = -EINVAL; brelse(bh); goto fail_alloc; @@ -2733,7 +2737,7 @@ int open_ctree(struct super_block *sb, ret = btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY); if (ret) { - printk(KERN_ERR "BTRFS: superblock contains fatal errors\n"); + btrfs_err(fs_info, "superblock contains fatal errors"); err = -EINVAL; goto fail_alloc; } @@ -2768,9 +2772,9 @@ int open_ctree(struct super_block *sb, features = btrfs_super_incompat_flags(disk_super) & ~BTRFS_FEATURE_INCOMPAT_SUPP; if (features) { - printk(KERN_ERR "BTRFS: couldn't mount because of " - "unsupported optional features (%Lx).\n", - features); + btrfs_err(fs_info, + "cannot mount because of unsupported optional features (%llx)", + features); err = -EINVAL; goto fail_alloc; } @@ -2781,7 +2785,7 @@ int open_ctree(struct super_block *sb, features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO; if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA) - printk(KERN_INFO "BTRFS: has skinny extents\n"); + btrfs_info(fs_info, "has skinny extents"); /* * flag our filesystem as having big metadata blocks if @@ -2789,13 +2793,14 @@ int open_ctree(struct super_block *sb, */ if (btrfs_super_nodesize(disk_super) > PAGE_SIZE) { if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA)) - printk(KERN_INFO "BTRFS: flagging fs with big metadata feature\n"); + btrfs_info(fs_info, + "flagging fs with big metadata feature"); features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA; } nodesize = btrfs_super_nodesize(disk_super); sectorsize = btrfs_super_sectorsize(disk_super); - stripesize = btrfs_super_stripesize(disk_super); + stripesize = sectorsize; fs_info->dirty_metadata_batch = nodesize * (1 + ilog2(nr_cpu_ids)); fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids)); @@ -2805,9 +2810,9 @@ int open_ctree(struct super_block *sb, */ if ((features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) && (sectorsize != nodesize)) { - printk(KERN_ERR "BTRFS: unequal leaf/node/sector sizes " - "are not allowed for mixed block groups on %s\n", - sb->s_id); + btrfs_err(fs_info, +"unequal nodesize/sectorsize (%u != %u) are not allowed for mixed block groups", + nodesize, sectorsize); goto fail_alloc; } @@ -2820,8 +2825,8 @@ int open_ctree(struct super_block *sb, features = btrfs_super_compat_ro_flags(disk_super) & ~BTRFS_FEATURE_COMPAT_RO_SUPP; if (!(sb->s_flags & MS_RDONLY) && features) { - printk(KERN_ERR "BTRFS: couldn't mount RDWR because of " - "unsupported 
option features (%Lx).\n", + btrfs_err(fs_info, + "cannot mount read-write because of unsupported optional features (%llx)", features); err = -EINVAL; goto fail_alloc; @@ -2850,8 +2855,7 @@ int open_ctree(struct super_block *sb, ret = btrfs_read_sys_array(tree_root); mutex_unlock(&fs_info->chunk_mutex); if (ret) { - printk(KERN_ERR "BTRFS: failed to read the system " - "array on %s\n", sb->s_id); + btrfs_err(fs_info, "failed to read the system array: %d", ret); goto fail_sb_buffer; } @@ -2865,8 +2869,7 @@ int open_ctree(struct super_block *sb, generation); if (IS_ERR(chunk_root->node) || !extent_buffer_uptodate(chunk_root->node)) { - printk(KERN_ERR "BTRFS: failed to read chunk root on %s\n", - sb->s_id); + btrfs_err(fs_info, "failed to read chunk root"); if (!IS_ERR(chunk_root->node)) free_extent_buffer(chunk_root->node); chunk_root->node = NULL; @@ -2880,8 +2883,7 @@ int open_ctree(struct super_block *sb, ret = btrfs_read_chunk_tree(chunk_root); if (ret) { - printk(KERN_ERR "BTRFS: failed to read chunk tree on %s\n", - sb->s_id); + btrfs_err(fs_info, "failed to read chunk tree: %d", ret); goto fail_tree_roots; } @@ -2892,8 +2894,7 @@ int open_ctree(struct super_block *sb, btrfs_close_extra_devices(fs_devices, 0); if (!fs_devices->latest_bdev) { - printk(KERN_ERR "BTRFS: failed to read devices on %s\n", - sb->s_id); + btrfs_err(fs_info, "failed to read devices"); goto fail_tree_roots; } @@ -2905,8 +2906,7 @@ retry_root_backup: generation); if (IS_ERR(tree_root->node) || !extent_buffer_uptodate(tree_root->node)) { - printk(KERN_WARNING "BTRFS: failed to read tree root on %s\n", - sb->s_id); + btrfs_warn(fs_info, "failed to read tree root"); if (!IS_ERR(tree_root->node)) free_extent_buffer(tree_root->node); tree_root->node = NULL; @@ -2938,20 +2938,19 @@ retry_root_backup: ret = btrfs_recover_balance(fs_info); if (ret) { - printk(KERN_ERR "BTRFS: failed to recover balance\n"); + btrfs_err(fs_info, "failed to recover balance: %d", ret); goto fail_block_groups; } ret = btrfs_init_dev_stats(fs_info); if (ret) { - printk(KERN_ERR "BTRFS: failed to init dev_stats: %d\n", - ret); + btrfs_err(fs_info, "failed to init dev_stats: %d", ret); goto fail_block_groups; } ret = btrfs_init_dev_replace(fs_info); if (ret) { - pr_err("BTRFS: failed to init dev_replace: %d\n", ret); + btrfs_err(fs_info, "failed to init dev_replace: %d", ret); goto fail_block_groups; } @@ -2959,31 +2958,33 @@ retry_root_backup: ret = btrfs_sysfs_add_fsid(fs_devices, NULL); if (ret) { - pr_err("BTRFS: failed to init sysfs fsid interface: %d\n", ret); + btrfs_err(fs_info, "failed to init sysfs fsid interface: %d", + ret); goto fail_block_groups; } ret = btrfs_sysfs_add_device(fs_devices); if (ret) { - pr_err("BTRFS: failed to init sysfs device interface: %d\n", ret); + btrfs_err(fs_info, "failed to init sysfs device interface: %d", + ret); goto fail_fsdev_sysfs; } ret = btrfs_sysfs_add_mounted(fs_info); if (ret) { - pr_err("BTRFS: failed to init sysfs interface: %d\n", ret); + btrfs_err(fs_info, "failed to init sysfs interface: %d", ret); goto fail_fsdev_sysfs; } ret = btrfs_init_space_info(fs_info); if (ret) { - printk(KERN_ERR "BTRFS: Failed to initial space info: %d\n", ret); + btrfs_err(fs_info, "failed to initialize space info: %d", ret); goto fail_sysfs; } ret = btrfs_read_block_groups(fs_info->extent_root); if (ret) { - printk(KERN_ERR "BTRFS: Failed to read block groups: %d\n", ret); + btrfs_err(fs_info, "failed to read block groups: %d", ret); goto fail_sysfs; } fs_info->num_tolerated_disk_barrier_failures = @@ -2991,7 
+2992,8 @@ retry_root_backup: if (fs_info->fs_devices->missing_devices > fs_info->num_tolerated_disk_barrier_failures && !(sb->s_flags & MS_RDONLY)) { - pr_warn("BTRFS: missing devices(%llu) exceeds the limit(%d), writeable mount is not allowed\n", + btrfs_warn(fs_info, +"missing devices (%llu) exceeds the limit (%d), writeable mount is not allowed", fs_info->fs_devices->missing_devices, fs_info->num_tolerated_disk_barrier_failures); goto fail_sysfs; @@ -3011,13 +3013,12 @@ retry_root_backup: if (!btrfs_test_opt(tree_root, SSD) && !btrfs_test_opt(tree_root, NOSSD) && !fs_info->fs_devices->rotating) { - printk(KERN_INFO "BTRFS: detected SSD devices, enabling SSD " - "mode\n"); + btrfs_info(fs_info, "detected SSD devices, enabling SSD mode"); btrfs_set_opt(fs_info->mount_opt, SSD); } /* - * Mount does not set all options immediatelly, we can do it now and do + * Mount does not set all options immediately, we can do it now and do * not have to wait for transaction commit */ btrfs_apply_pending_changes(fs_info); @@ -3030,8 +3031,9 @@ retry_root_backup: 1 : 0, fs_info->check_integrity_print_mask); if (ret) - printk(KERN_WARNING "BTRFS: failed to initialize" - " integrity check module %s\n", sb->s_id); + btrfs_warn(fs_info, + "failed to initialize integrity check module: %d", + ret); } #endif ret = btrfs_read_qgroup_config(fs_info); @@ -3061,8 +3063,8 @@ retry_root_backup: ret = btrfs_recover_relocation(tree_root); mutex_unlock(&fs_info->cleaner_mutex); if (ret < 0) { - printk(KERN_WARNING - "BTRFS: failed to recover relocation\n"); + btrfs_warn(fs_info, "failed to recover relocation: %d", + ret); err = -EINVAL; goto fail_qgroup; } @@ -3083,11 +3085,11 @@ retry_root_backup: if (btrfs_test_opt(tree_root, FREE_SPACE_TREE) && !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { - pr_info("BTRFS: creating free space tree\n"); + btrfs_info(fs_info, "creating free space tree"); ret = btrfs_create_free_space_tree(fs_info); if (ret) { - pr_warn("BTRFS: failed to create free space tree %d\n", - ret); + btrfs_warn(fs_info, + "failed to create free space tree: %d", ret); close_ctree(tree_root); return ret; } @@ -3104,14 +3106,14 @@ retry_root_backup: ret = btrfs_resume_balance_async(fs_info); if (ret) { - printk(KERN_WARNING "BTRFS: failed to resume balance\n"); + btrfs_warn(fs_info, "failed to resume balance: %d", ret); close_ctree(tree_root); return ret; } ret = btrfs_resume_dev_replace_async(fs_info); if (ret) { - pr_warn("BTRFS: failed to resume dev_replace\n"); + btrfs_warn(fs_info, "failed to resume device replace: %d", ret); close_ctree(tree_root); return ret; } @@ -3120,33 +3122,33 @@ retry_root_backup: if (btrfs_test_opt(tree_root, CLEAR_CACHE) && btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { - pr_info("BTRFS: clearing free space tree\n"); + btrfs_info(fs_info, "clearing free space tree"); ret = btrfs_clear_free_space_tree(fs_info); if (ret) { - pr_warn("BTRFS: failed to clear free space tree %d\n", - ret); + btrfs_warn(fs_info, + "failed to clear free space tree: %d", ret); close_ctree(tree_root); return ret; } } if (!fs_info->uuid_root) { - pr_info("BTRFS: creating UUID tree\n"); + btrfs_info(fs_info, "creating UUID tree"); ret = btrfs_create_uuid_tree(fs_info); if (ret) { - pr_warn("BTRFS: failed to create the UUID tree %d\n", - ret); + btrfs_warn(fs_info, + "failed to create the UUID tree: %d", ret); close_ctree(tree_root); return ret; } } else if (btrfs_test_opt(tree_root, RESCAN_UUID_TREE) || fs_info->generation != btrfs_super_uuid_tree_generation(disk_super)) { - pr_info("BTRFS: checking UUID 
tree\n"); + btrfs_info(fs_info, "checking UUID tree"); ret = btrfs_check_uuid_tree(fs_info); if (ret) { - pr_warn("BTRFS: failed to check the UUID tree %d\n", - ret); + btrfs_warn(fs_info, + "failed to check the UUID tree: %d", ret); close_ctree(tree_root); return ret; } @@ -3245,7 +3247,7 @@ static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate) btrfs_warn_rl_in_rcu(device->dev_root->fs_info, "lost page write due to IO error on %s", rcu_str_deref(device->name)); - /* note, we dont' set_buffer_write_io_error because we have + /* note, we don't set_buffer_write_io_error because we have * our own ways of dealing with the IO errors */ clear_buffer_uptodate(bh); @@ -3410,9 +3412,9 @@ static int write_dev_supers(struct btrfs_device *device, * to go down lazy. */ if (i == 0) - ret = btrfsic_submit_bh(WRITE_FUA, bh); + ret = btrfsic_submit_bh(REQ_OP_WRITE, WRITE_FUA, bh); else - ret = btrfsic_submit_bh(WRITE_SYNC, bh); + ret = btrfsic_submit_bh(REQ_OP_WRITE, WRITE_SYNC, bh); if (ret) errors++; } @@ -3476,12 +3478,13 @@ static int write_dev_flush(struct btrfs_device *device, int wait) bio->bi_end_io = btrfs_end_empty_barrier; bio->bi_bdev = device->bdev; + bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH); init_completion(&device->flush_wait); bio->bi_private = &device->flush_wait; device->flush_bio = bio; bio_get(bio); - btrfsic_submit_bio(WRITE_FLUSH, bio); + btrfsic_submit_bio(bio); return 0; } @@ -3646,7 +3649,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors) if (ret) { mutex_unlock( &root->fs_info->fs_devices->device_list_mutex); - btrfs_std_error(root->fs_info, ret, + btrfs_handle_fs_error(root->fs_info, ret, "errors while submitting device barriers."); return ret; } @@ -3686,7 +3689,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors) mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); /* FUA is masked off if unsupported and can't be the reason */ - btrfs_std_error(root->fs_info, -EIO, + btrfs_handle_fs_error(root->fs_info, -EIO, "%d errors while writing supers", total_errors); return -EIO; } @@ -3704,7 +3707,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors) } mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); if (total_errors > max_errors) { - btrfs_std_error(root->fs_info, -EIO, + btrfs_handle_fs_error(root->fs_info, -EIO, "%d errors while writing supers", total_errors); return -EIO; } @@ -4120,6 +4123,16 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, * Hint to catch really bogus numbers, bitflips or so, more exact checks are * done later */ + if (btrfs_super_bytes_used(sb) < 6 * btrfs_super_nodesize(sb)) { + btrfs_err(fs_info, "bytes_used is too small %llu", + btrfs_super_bytes_used(sb)); + ret = -EINVAL; + } + if (!is_power_of_2(btrfs_super_stripesize(sb))) { + btrfs_err(fs_info, "invalid stripesize %u", + btrfs_super_stripesize(sb)); + ret = -EINVAL; + } if (btrfs_super_num_devices(sb) > (1UL << 31)) printk(KERN_WARNING "BTRFS: suspicious number of devices: %llu\n", btrfs_super_num_devices(sb)); @@ -4357,7 +4370,7 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root, if (ret) break; - clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS); + clear_extent_bits(dirty_pages, start, end, mark); while (start <= end) { eb = btrfs_find_tree_block(root->fs_info, start); start += root->nodesize; @@ -4392,7 +4405,7 @@ again: if (ret) break; - clear_extent_dirty(unpin, start, end, GFP_NOFS); + clear_extent_dirty(unpin, start, end); 
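/*
 * Note the call above: the gfp_t argument disappears from the
 * extent-bit helpers across this series, which now use GFP_NOFS
 * internally. For reference, the reworked wrapper from the
 * extent_io.h hunk later in this diff reads:
 *
 *	static inline int clear_extent_dirty(struct extent_io_tree *tree,
 *					     u64 start, u64 end)
 *	{
 *		return clear_extent_bit(tree, start, end,
 *					EXTENT_DIRTY | EXTENT_DELALLOC |
 *					EXTENT_DO_ACCOUNTING, 0, 0, NULL,
 *					GFP_NOFS);
 *	}
 */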
btrfs_error_unpin_extent_range(root, start, end); cond_resched(); } diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 8e79d0070bcf..dbf3e1aab69e 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -90,7 +90,7 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, void btrfs_free_fs_root(struct btrfs_root *root); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS -struct btrfs_root *btrfs_alloc_dummy_root(void); +struct btrfs_root *btrfs_alloc_dummy_root(u32 sectorsize, u32 nodesize); #endif /* @@ -122,7 +122,7 @@ void btrfs_csum_final(u32 crc, char *result); int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, enum btrfs_wq_endio_type metadata); int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, - int rw, struct bio *bio, int mirror_num, + struct bio *bio, int mirror_num, unsigned long bio_flags, u64 bio_offset, extent_submit_bio_hook_t *submit_bio_start, extent_submit_bio_hook_t *submit_bio_done); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 84e060eb0de8..b480fd555774 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -231,9 +231,9 @@ static int add_excluded_extent(struct btrfs_root *root, { u64 end = start + num_bytes - 1; set_extent_bits(&root->fs_info->freed_extents[0], - start, end, EXTENT_UPTODATE, GFP_NOFS); + start, end, EXTENT_UPTODATE); set_extent_bits(&root->fs_info->freed_extents[1], - start, end, EXTENT_UPTODATE, GFP_NOFS); + start, end, EXTENT_UPTODATE); return 0; } @@ -246,9 +246,9 @@ static void free_excluded_extents(struct btrfs_root *root, end = start + cache->key.offset - 1; clear_extent_bits(&root->fs_info->freed_extents[0], - start, end, EXTENT_UPTODATE, GFP_NOFS); + start, end, EXTENT_UPTODATE); clear_extent_bits(&root->fs_info->freed_extents[1], - start, end, EXTENT_UPTODATE, GFP_NOFS); + start, end, EXTENT_UPTODATE); } static int exclude_super_stripes(struct btrfs_root *root, @@ -980,7 +980,7 @@ out_free: * event that tree block loses its owner tree's reference and do the * back refs conversion. * - * When a tree block is COW'd through a tree, there are four cases: + * When a tree block is COWed through a tree, there are four cases: * * The reference count of the block is one and the tree is the block's * owner tree. Nothing to do in this case. @@ -2042,8 +2042,13 @@ int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, struct btrfs_bio *bbio = NULL; + /* + * Avoid races with device replace and make sure our bbio has devices + * associated to its stripes that don't go away while we are discarding. + */ + btrfs_bio_counter_inc_blocked(root->fs_info); /* Tell the block device(s) that the sectors can be discarded */ - ret = btrfs_map_block(root->fs_info, REQ_DISCARD, + ret = btrfs_map_block(root->fs_info, REQ_OP_DISCARD, bytenr, &num_bytes, &bbio, 0); /* Error condition is -ENOMEM */ if (!ret) { @@ -2074,6 +2079,7 @@ int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, } btrfs_put_bbio(bbio); } + btrfs_bio_counter_dec(root->fs_info); if (actual_bytes) *actual_bytes = discarded_bytes; @@ -2595,7 +2601,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, } /* - * Need to drop our head ref lock and re-aqcuire the + * Need to drop our head ref lock and re-acquire the * delayed ref lock and then re-check to make sure * nobody got added. 
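* (the re-check exists precisely so that a ref attached while the
* lock was dropped is not missed)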
*/ @@ -2747,7 +2753,7 @@ static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads) /* * We don't ever fill up leaves all the way so multiply by 2 just to be - * closer to what we're really going to want to ouse. + * closer to what we're really going to want to use. */ return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root)); } @@ -2829,6 +2835,7 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, struct async_delayed_refs { struct btrfs_root *root; + u64 transid; int count; int error; int sync; @@ -2844,6 +2851,10 @@ static void delayed_ref_async_start(struct btrfs_work *work) async = container_of(work, struct async_delayed_refs, work); + /* if the commit is already started, we don't need to wait here */ + if (btrfs_transaction_blocked(async->root->fs_info)) + goto done; + trans = btrfs_join_transaction(async->root); if (IS_ERR(trans)) { async->error = PTR_ERR(trans); @@ -2851,14 +2862,19 @@ static void delayed_ref_async_start(struct btrfs_work *work) } /* - * trans->sync means that when we call end_transaciton, we won't + * trans->sync means that when we call end_transaction, we won't * wait on delayed refs */ trans->sync = true; + + /* Don't bother flushing if we got into a different transaction */ + if (trans->transid > async->transid) + goto end; + ret = btrfs_run_delayed_refs(trans, async->root, async->count); if (ret) async->error = ret; - +end: ret = btrfs_end_transaction(trans, async->root); if (ret && !async->error) async->error = ret; @@ -2870,7 +2886,7 @@ done: } int btrfs_async_run_delayed_refs(struct btrfs_root *root, - unsigned long count, int wait) + unsigned long count, u64 transid, int wait) { struct async_delayed_refs *async; int ret; @@ -2882,6 +2898,7 @@ int btrfs_async_run_delayed_refs(struct btrfs_root *root, async->root = root->fs_info->tree_root; async->count = count; async->error = 0; + async->transid = transid; if (wait) async->sync = 1; else @@ -3824,6 +3841,59 @@ int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr) return readonly; } +bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr) +{ + struct btrfs_block_group_cache *bg; + bool ret = true; + + bg = btrfs_lookup_block_group(fs_info, bytenr); + if (!bg) + return false; + + spin_lock(&bg->lock); + if (bg->ro) + ret = false; + else + atomic_inc(&bg->nocow_writers); + spin_unlock(&bg->lock); + + /* no put on block group, done by btrfs_dec_nocow_writers */ + if (!ret) + btrfs_put_block_group(bg); + + return ret; + +} + +void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr) +{ + struct btrfs_block_group_cache *bg; + + bg = btrfs_lookup_block_group(fs_info, bytenr); + ASSERT(bg); + if (atomic_dec_and_test(&bg->nocow_writers)) + wake_up_atomic_t(&bg->nocow_writers); + /* + * Once for our lookup and once for the lookup done by a previous call + * to btrfs_inc_nocow_writers() + */ + btrfs_put_block_group(bg); + btrfs_put_block_group(bg); +} + +static int btrfs_wait_nocow_writers_atomic_t(atomic_t *a) +{ + schedule(); + return 0; +} + +void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg) +{ + wait_on_atomic_t(&bg->nocow_writers, + btrfs_wait_nocow_writers_atomic_t, + TASK_UNINTERRUPTIBLE); +} + static const char *alloc_name(u64 flags) { switch (flags) { @@ -4141,7 +4211,7 @@ commit_trans: if (need_commit > 0) { btrfs_start_delalloc_roots(fs_info, 0, -1); - btrfs_wait_ordered_roots(fs_info, -1); + btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1); } trans = btrfs_join_transaction(root); @@ -4243,7 +4313,7 @@ void 
btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, * Called if we need to clear a data reservation for this inode * Normally in a error case. * - * This one will handle the per-indoe data rsv map for accurate reserved + * This one will handle the per-inode data rsv map for accurate reserved * space framework. */ void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len) @@ -4583,7 +4653,8 @@ static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root, */ btrfs_start_delalloc_roots(root->fs_info, 0, nr_items); if (!current->journal_info) - btrfs_wait_ordered_roots(root->fs_info, nr_items); + btrfs_wait_ordered_roots(root->fs_info, nr_items, + 0, (u64)-1); } } @@ -4620,7 +4691,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, /* Calc the number of the pages we need flush for space reservation */ items = calc_reclaim_items_nr(root, to_reclaim); - to_reclaim = items * EXTENT_SIZE_PER_ITEM; + to_reclaim = (u64)items * EXTENT_SIZE_PER_ITEM; trans = (struct btrfs_trans_handle *)current->journal_info; block_rsv = &root->fs_info->delalloc_block_rsv; @@ -4632,7 +4703,8 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, if (trans) return; if (wait_ordered) - btrfs_wait_ordered_roots(root->fs_info, items); + btrfs_wait_ordered_roots(root->fs_info, items, + 0, (u64)-1); return; } @@ -4671,7 +4743,8 @@ skip_async: loops++; if (wait_ordered && !trans) { - btrfs_wait_ordered_roots(root->fs_info, items); + btrfs_wait_ordered_roots(root->fs_info, items, + 0, (u64)-1); } else { time_left = schedule_timeout_killable(1); if (time_left) @@ -4911,7 +4984,7 @@ void btrfs_init_async_reclaim_work(struct work_struct *work) * @orig_bytes - the number of bytes we want * @flush - whether or not we can flush to make our reservation * - * This will reserve orgi_bytes number of bytes from the space info associated + * This will reserve orig_bytes number of bytes from the space info associated * with the block_rsv. If there is not enough space it will make an attempt to * flush out space to make room. It will do this by flushing delalloc if * possible or committing the transaction. If flush is 0 then no attempts to @@ -5516,7 +5589,7 @@ void btrfs_orphan_release_metadata(struct inode *inode) * common file/directory operations, they change two fs/file trees * and root tree, the number of items that the qgroup reserves is * different with the free space reservation. So we can not use - * the space reseravtion mechanism in start_transaction(). + * the space reservation mechanism in start_transaction(). */ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, struct btrfs_block_rsv *rsv, @@ -5565,7 +5638,7 @@ void btrfs_subvolume_release_metadata(struct btrfs_root *root, /** * drop_outstanding_extent - drop an outstanding extent * @inode: the inode we're dropping the extent for - * @num_bytes: the number of bytes we're relaseing. + * @num_bytes: the number of bytes we're releasing. * * This is called when we are freeing up an outstanding extent, either called * after an error or after an extent is written. This will return the number of @@ -5591,7 +5664,7 @@ static unsigned drop_outstanding_extent(struct inode *inode, u64 num_bytes) drop_inode_space = 1; /* - * If we have more or the same amount of outsanding extents than we have + * If we have more or the same amount of outstanding extents than we have * reserved then we need to leave the reserved extents count alone. 
*/ if (BTRFS_I(inode)->outstanding_extents >= @@ -5605,8 +5678,8 @@ static unsigned drop_outstanding_extent(struct inode *inode, u64 num_bytes) } /** - * calc_csum_metadata_size - return the amount of metada space that must be - * reserved/free'd for the given bytes. + * calc_csum_metadata_size - return the amount of metadata space that must be + * reserved/freed for the given bytes. * @inode: the inode we're manipulating * @num_bytes: the number of bytes in question * @reserve: 1 if we are reserving space, 0 if we are freeing space @@ -5758,7 +5831,7 @@ out_fail: /* * This is tricky, but first we need to figure out how much we - * free'd from any free-ers that occurred during this + * freed from any free-ers that occurred during this * reservation, so we reset ->csum_bytes to the csum_bytes * before we dropped our lock, and then call the free for the * number of bytes that were freed while we were trying our @@ -5780,7 +5853,7 @@ out_fail: /* * Now reset ->csum_bytes to what it should be. If bytes is - * more than to_free then we would have free'd more space had we + * more than to_free then we would have freed more space had we * not had an artificially high ->csum_bytes, so we need to free * the remainder. If bytes is the same or less then we don't * need to do anything, the other free-ers did the correct @@ -6172,6 +6245,57 @@ int btrfs_exclude_logged_extents(struct btrfs_root *log, return 0; } +static void +btrfs_inc_block_group_reservations(struct btrfs_block_group_cache *bg) +{ + atomic_inc(&bg->reservations); +} + +void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info, + const u64 start) +{ + struct btrfs_block_group_cache *bg; + + bg = btrfs_lookup_block_group(fs_info, start); + ASSERT(bg); + if (atomic_dec_and_test(&bg->reservations)) + wake_up_atomic_t(&bg->reservations); + btrfs_put_block_group(bg); +} + +static int btrfs_wait_bg_reservations_atomic_t(atomic_t *a) +{ + schedule(); + return 0; +} + +void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg) +{ + struct btrfs_space_info *space_info = bg->space_info; + + ASSERT(bg->ro); + + if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA)) + return; + + /* + * Our block group is read only but before we set it to read only, + * some task might have had allocated an extent from it already, but it + * has not yet created a respective ordered extent (and added it to a + * root's list of ordered extents). + * Therefore wait for any task currently allocating extents, since the + * block group's reservations counter is incremented while a read lock + * on the groups' semaphore is held and decremented after releasing + * the read access on that semaphore and creating the ordered extent. 
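+ * Acquiring and immediately releasing the write side below therefore
+ * acts as a barrier: the write lock cannot be granted until every such
+ * reader has dropped the semaphore, so once the pair completes any
+ * reservation made before it is accounted in ->reservations and
+ * waiting for that counter to reach zero is sufficient.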
+ */ + down_write(&space_info->groups_sem); + up_write(&space_info->groups_sem); + + wait_on_atomic_t(&bg->reservations, + btrfs_wait_bg_reservations_atomic_t, + TASK_UNINTERRUPTIBLE); +} + /** * btrfs_update_reserved_bytes - update the block_group and space info counters * @cache: The cache we are manipulating @@ -6408,7 +6532,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, ret = btrfs_discard_extent(root, start, end + 1 - start, NULL); - clear_extent_dirty(unpin, start, end, GFP_NOFS); + clear_extent_dirty(unpin, start, end); unpin_extent_range(root, start, end, true); mutex_unlock(&fs_info->unused_bg_unpin_mutex); cond_resched(); @@ -7025,36 +7149,35 @@ btrfs_lock_cluster(struct btrfs_block_group_cache *block_group, int delalloc) { struct btrfs_block_group_cache *used_bg = NULL; - bool locked = false; -again: + spin_lock(&cluster->refill_lock); - if (locked) { - if (used_bg == cluster->block_group) + while (1) { + used_bg = cluster->block_group; + if (!used_bg) + return NULL; + + if (used_bg == block_group) return used_bg; - up_read(&used_bg->data_rwsem); - btrfs_put_block_group(used_bg); - } + btrfs_get_block_group(used_bg); - used_bg = cluster->block_group; - if (!used_bg) - return NULL; + if (!delalloc) + return used_bg; - if (used_bg == block_group) - return used_bg; + if (down_read_trylock(&used_bg->data_rwsem)) + return used_bg; - btrfs_get_block_group(used_bg); + spin_unlock(&cluster->refill_lock); - if (!delalloc) - return used_bg; + down_read(&used_bg->data_rwsem); - if (down_read_trylock(&used_bg->data_rwsem)) - return used_bg; + spin_lock(&cluster->refill_lock); + if (used_bg == cluster->block_group) + return used_bg; - spin_unlock(&cluster->refill_lock); - down_read(&used_bg->data_rwsem); - locked = true; - goto again; + up_read(&used_bg->data_rwsem); + btrfs_put_block_group(used_bg); + } } static inline void @@ -7431,6 +7554,7 @@ checks: btrfs_add_free_space(block_group, offset, num_bytes); goto loop; } + btrfs_inc_block_group_reservations(block_group); /* we are all good, lets return */ ins->objectid = search_start; @@ -7471,7 +7595,7 @@ loop: if (loop == LOOP_CACHING_NOWAIT) { /* * We want to skip the LOOP_CACHING_WAIT step if we - * don't have any unached bgs and we've alrelady done a + * don't have any uncached bgs and we've already done a * full search through. */ if (orig_have_caching_bg || !full_search) @@ -7612,8 +7736,10 @@ again: WARN_ON(num_bytes < root->sectorsize); ret = find_free_extent(root, num_bytes, empty_size, hint_byte, ins, flags, delalloc); - - if (ret == -ENOSPC) { + if (!ret && !is_data) { + btrfs_dec_block_group_reservations(root->fs_info, + ins->objectid); + } else if (ret == -ENOSPC) { if (!final_tried && ins->offset) { num_bytes = min(num_bytes >> 1, ins->offset); num_bytes = round_down(num_bytes, root->sectorsize); @@ -7873,7 +7999,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, /* * Mixed block groups will exclude before processing the log so we only - * need to do the exlude dance if this fs isn't mixed. + * need to do the exclude dance if this fs isn't mixed. 
*/ if (!btrfs_fs_incompat(root->fs_info, MIXED_GROUPS)) { ret = __exclude_logged_extent(root, ins->objectid, ins->offset); @@ -7901,8 +8027,9 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf; buf = btrfs_find_create_tree_block(root, bytenr); - if (!buf) - return ERR_PTR(-ENOMEM); + if (IS_ERR(buf)) + return buf; + btrfs_set_header_generation(buf, trans->transid); btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level); btrfs_tree_lock(buf); @@ -7923,13 +8050,13 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, buf->start + buf->len - 1, GFP_NOFS); else set_extent_new(&root->dirty_log_pages, buf->start, - buf->start + buf->len - 1, GFP_NOFS); + buf->start + buf->len - 1); } else { buf->log_index = -1; set_extent_dirty(&trans->transaction->dirty_pages, buf->start, buf->start + buf->len - 1, GFP_NOFS); } - trans->blocks_used++; + trans->dirty = true; /* this returns a buffer locked for blocking */ return buf; } @@ -8544,8 +8671,9 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, next = btrfs_find_tree_block(root->fs_info, bytenr); if (!next) { next = btrfs_find_create_tree_block(root, bytenr); - if (!next) - return -ENOMEM; + if (IS_ERR(next)) + return PTR_ERR(next); + btrfs_set_buffer_lockdep_class(root->root_key.objectid, next, level - 1); reada = 1; @@ -9058,7 +9186,7 @@ out: if (!for_reloc && root_dropped == false) btrfs_add_dead_root(root); if (err && err != -EAGAIN) - btrfs_std_error(root->fs_info, err, NULL); + btrfs_handle_fs_error(root->fs_info, err, NULL); return err; } @@ -9317,7 +9445,7 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) u64 free_bytes = 0; int factor; - /* It's df, we don't care if it's racey */ + /* It's df, we don't care if it's racy */ if (list_empty(&sinfo->ro_bgs)) return 0; @@ -10526,14 +10654,14 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) */ mutex_lock(&fs_info->unused_bg_unpin_mutex); ret = clear_extent_bits(&fs_info->freed_extents[0], start, end, - EXTENT_DIRTY, GFP_NOFS); + EXTENT_DIRTY); if (ret) { mutex_unlock(&fs_info->unused_bg_unpin_mutex); btrfs_dec_block_group_ro(root, block_group); goto end_trans; } ret = clear_extent_bits(&fs_info->freed_extents[1], start, end, - EXTENT_DIRTY, GFP_NOFS); + EXTENT_DIRTY); if (ret) { mutex_unlock(&fs_info->unused_bg_unpin_mutex); btrfs_dec_block_group_ro(root, block_group); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index d247fc0eea19..cee4cb99b8ce 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -726,14 +726,6 @@ next: start = last_end + 1; if (start <= end && state && !need_resched()) goto hit_next; - goto search_again; - -out: - spin_unlock(&tree->lock); - if (prealloc) - free_extent_state(prealloc); - - return 0; search_again: if (start > end) @@ -742,6 +734,14 @@ search_again: if (gfpflags_allow_blocking(mask)) cond_resched(); goto again; + +out: + spin_unlock(&tree->lock); + if (prealloc) + free_extent_state(prealloc); + + return 0; + } static void wait_on_state(struct extent_io_tree *tree, @@ -873,8 +873,14 @@ __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, bits |= EXTENT_FIRST_DELALLOC; again: if (!prealloc && gfpflags_allow_blocking(mask)) { + /* + * Don't care for allocation failure here because we might end + * up not needing the pre-allocated extent state at all, which + * is the case if we only have in the tree extent states that + * cover our input range and don't cover too any 
other range. + * If we end up needing a new extent state we allocate it later. + */ prealloc = alloc_extent_state(mask); - BUG_ON(!prealloc); } spin_lock(&tree->lock); @@ -1037,7 +1043,13 @@ hit_next: goto out; } - goto search_again; +search_again: + if (start > end) + goto out; + spin_unlock(&tree->lock); + if (gfpflags_allow_blocking(mask)) + cond_resched(); + goto again; out: spin_unlock(&tree->lock); @@ -1046,13 +1058,6 @@ out: return err; -search_again: - if (start > end) - goto out; - spin_unlock(&tree->lock); - if (gfpflags_allow_blocking(mask)) - cond_resched(); - goto again; } int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, @@ -1073,17 +1078,18 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, * @bits: the bits to set in this range * @clear_bits: the bits to clear in this range * @cached_state: state that we're going to cache - * @mask: the allocation mask * * This will go through and set bits for the given range. If any states exist * already in this range they are set with the given bit and cleared of the * clear_bits. This is only meant to be used by things that are mergeable, ie * converting from say DELALLOC to DIRTY. This is not meant to be used with * boundary bits like LOCK. + * + * All allocations are done with GFP_NOFS. */ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, unsigned clear_bits, - struct extent_state **cached_state, gfp_t mask) + struct extent_state **cached_state) { struct extent_state *state; struct extent_state *prealloc = NULL; @@ -1098,7 +1104,7 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, btrfs_debug_check_extent_io_range(tree, start, end); again: - if (!prealloc && gfpflags_allow_blocking(mask)) { + if (!prealloc) { /* * Best effort, don't worry if extent state allocation fails * here for the first iteration. We might have a cached state @@ -1106,7 +1112,7 @@ again: * extent state allocations are needed. We'll only know this * after locking the tree. 
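* That is why the allocation below is only allowed to fail silently on
* the first pass; once first_iteration is false the tree has been
* inspected, a prealloc may genuinely be needed, and -ENOMEM is
* returned instead.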
*/ - prealloc = alloc_extent_state(mask); + prealloc = alloc_extent_state(GFP_NOFS); if (!prealloc && !first_iteration) return -ENOMEM; } @@ -1263,7 +1269,13 @@ hit_next: goto out; } - goto search_again; +search_again: + if (start > end) + goto out; + spin_unlock(&tree->lock); + cond_resched(); + first_iteration = false; + goto again; out: spin_unlock(&tree->lock); @@ -1271,21 +1283,11 @@ out: free_extent_state(prealloc); return err; - -search_again: - if (start > end) - goto out; - spin_unlock(&tree->lock); - if (gfpflags_allow_blocking(mask)) - cond_resched(); - first_iteration = false; - goto again; } /* wrappers around set/clear extent bit */ int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, gfp_t mask, - struct extent_changeset *changeset) + unsigned bits, struct extent_changeset *changeset) { /* * We don't support EXTENT_LOCKED yet, as current changeset will @@ -1295,7 +1297,7 @@ int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, */ BUG_ON(bits & EXTENT_LOCKED); - return __set_extent_bit(tree, start, end, bits, 0, NULL, NULL, mask, + return __set_extent_bit(tree, start, end, bits, 0, NULL, NULL, GFP_NOFS, changeset); } @@ -1308,8 +1310,7 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, } int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, gfp_t mask, - struct extent_changeset *changeset) + unsigned bits, struct extent_changeset *changeset) { /* * Don't support EXTENT_LOCKED case, same reason as @@ -1317,7 +1318,7 @@ int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, */ BUG_ON(bits & EXTENT_LOCKED); - return __clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask, + return __clear_extent_bit(tree, start, end, bits, 0, 0, NULL, GFP_NOFS, changeset); } @@ -1975,13 +1976,13 @@ int free_io_failure(struct inode *inode, struct io_failure_record *rec) set_state_failrec(failure_tree, rec->start, NULL); ret = clear_extent_bits(failure_tree, rec->start, rec->start + rec->len - 1, - EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS); + EXTENT_LOCKED | EXTENT_DIRTY); if (ret) err = ret; ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start, rec->start + rec->len - 1, - EXTENT_DAMAGED, GFP_NOFS); + EXTENT_DAMAGED); if (ret && !err) err = ret; @@ -2024,9 +2025,16 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical, bio->bi_iter.bi_size = 0; map_length = length; + /* + * Avoid races with device replace and make sure our bbio has devices + * associated to its stripes that don't go away while we are doing the + * read repair operation. + */ + btrfs_bio_counter_inc_blocked(fs_info); ret = btrfs_map_block(fs_info, WRITE, logical, &map_length, &bbio, mirror_num); if (ret) { + btrfs_bio_counter_dec(fs_info); bio_put(bio); return -EIO; } @@ -2036,14 +2044,17 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical, dev = bbio->stripes[mirror_num-1].dev; btrfs_put_bbio(bbio); if (!dev || !dev->bdev || !dev->writeable) { + btrfs_bio_counter_dec(fs_info); bio_put(bio); return -EIO; } bio->bi_bdev = dev->bdev; + bio->bi_rw = WRITE_SYNC; bio_add_page(bio, page, length, pg_offset); - if (btrfsic_submit_bio_wait(WRITE_SYNC, bio)) { + if (btrfsic_submit_bio_wait(bio)) { /* try to remap that extent elsewhere? 
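* Either way, the bio counter elevated via
* btrfs_bio_counter_inc_blocked() above has to be dropped before
* returning; each exit path pairs it with a btrfs_bio_counter_dec().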
*/ + btrfs_bio_counter_dec(fs_info); bio_put(bio); btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); return -EIO; @@ -2053,6 +2064,7 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical, "read error corrected: ino %llu off %llu (dev %s sector %llu)", btrfs_ino(inode), start, rcu_str_deref(dev->name), sector); + btrfs_bio_counter_dec(fs_info); bio_put(bio); return 0; } @@ -2232,13 +2244,12 @@ int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end, /* set the bits in the private failure tree */ ret = set_extent_bits(failure_tree, start, end, - EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS); + EXTENT_LOCKED | EXTENT_DIRTY); if (ret >= 0) ret = set_state_failrec(failure_tree, start, failrec); /* set the bits in the inode's tree */ if (ret >= 0) - ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED, - GFP_NOFS); + ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED); if (ret < 0) { kfree(failrec); return ret; @@ -2376,7 +2387,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, int read_mode; int ret; - BUG_ON(failed_bio->bi_rw & REQ_WRITE); + BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); ret = btrfs_get_io_failure_record(inode, start, end, &failrec); if (ret) @@ -2402,12 +2413,12 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, free_io_failure(inode, failrec); return -EIO; } + bio_set_op_attrs(bio, REQ_OP_READ, read_mode); pr_debug("Repair Read Error: submitting new read[%#x] to this_mirror=%d, in_validation=%d\n", read_mode, failrec->this_mirror, failrec->in_validation); - ret = tree->ops->submit_bio_hook(inode, read_mode, bio, - failrec->this_mirror, + ret = tree->ops->submit_bio_hook(inode, bio, failrec->this_mirror, failrec->bio_flags, 0); if (ret) { free_io_failure(inode, failrec); @@ -2713,8 +2724,8 @@ struct bio *btrfs_io_bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs) } -static int __must_check submit_one_bio(int rw, struct bio *bio, - int mirror_num, unsigned long bio_flags) +static int __must_check submit_one_bio(struct bio *bio, int mirror_num, + unsigned long bio_flags) { int ret = 0; struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; @@ -2725,33 +2736,32 @@ static int __must_check submit_one_bio(int rw, struct bio *bio, start = page_offset(page) + bvec->bv_offset; bio->bi_private = NULL; - bio_get(bio); if (tree->ops && tree->ops->submit_bio_hook) - ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio, + ret = tree->ops->submit_bio_hook(page->mapping->host, bio, mirror_num, bio_flags, start); else - btrfsic_submit_bio(rw, bio); + btrfsic_submit_bio(bio); bio_put(bio); return ret; } -static int merge_bio(int rw, struct extent_io_tree *tree, struct page *page, +static int merge_bio(struct extent_io_tree *tree, struct page *page, unsigned long offset, size_t size, struct bio *bio, unsigned long bio_flags) { int ret = 0; if (tree->ops && tree->ops->merge_bio_hook) - ret = tree->ops->merge_bio_hook(rw, page, offset, size, bio, + ret = tree->ops->merge_bio_hook(page, offset, size, bio, bio_flags); BUG_ON(ret < 0); return ret; } -static int submit_extent_page(int rw, struct extent_io_tree *tree, +static int submit_extent_page(int op, int op_flags, struct extent_io_tree *tree, struct writeback_control *wbc, struct page *page, sector_t sector, size_t size, unsigned long offset, @@ -2779,10 +2789,9 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree, if (prev_bio_flags != bio_flags || !contig || force_bio_submit || - merge_bio(rw, tree, page, offset, 
page_size, bio, bio_flags) || + merge_bio(tree, page, offset, page_size, bio, bio_flags) || bio_add_page(bio, page, page_size, offset) < page_size) { - ret = submit_one_bio(rw, bio, mirror_num, - prev_bio_flags); + ret = submit_one_bio(bio, mirror_num, prev_bio_flags); if (ret < 0) { *bio_ret = NULL; return ret; @@ -2803,6 +2812,7 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree, bio_add_page(bio, page, page_size, offset); bio->bi_end_io = end_io_func; bio->bi_private = tree; + bio_set_op_attrs(bio, op, op_flags); if (wbc) { wbc_init_bio(wbc, bio); wbc_account_io(wbc, page, page_size); @@ -2811,7 +2821,7 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree, if (bio_ret) *bio_ret = bio; else - ret = submit_one_bio(rw, bio, mirror_num, bio_flags); + ret = submit_one_bio(bio, mirror_num, bio_flags); return ret; } @@ -2875,7 +2885,7 @@ static int __do_readpage(struct extent_io_tree *tree, get_extent_t *get_extent, struct extent_map **em_cached, struct bio **bio, int mirror_num, - unsigned long *bio_flags, int rw, + unsigned long *bio_flags, int read_flags, u64 *prev_em_start) { struct inode *inode = page->mapping->host; @@ -3058,8 +3068,8 @@ static int __do_readpage(struct extent_io_tree *tree, } pnr -= page->index; - ret = submit_extent_page(rw, tree, NULL, page, - sector, disk_io_size, pg_offset, + ret = submit_extent_page(REQ_OP_READ, read_flags, tree, NULL, + page, sector, disk_io_size, pg_offset, bdev, bio, pnr, end_bio_extent_readpage, mirror_num, *bio_flags, @@ -3090,7 +3100,7 @@ static inline void __do_contiguous_readpages(struct extent_io_tree *tree, get_extent_t *get_extent, struct extent_map **em_cached, struct bio **bio, int mirror_num, - unsigned long *bio_flags, int rw, + unsigned long *bio_flags, u64 *prev_em_start) { struct inode *inode; @@ -3111,7 +3121,7 @@ static inline void __do_contiguous_readpages(struct extent_io_tree *tree, for (index = 0; index < nr_pages; index++) { __do_readpage(tree, pages[index], get_extent, em_cached, bio, - mirror_num, bio_flags, rw, prev_em_start); + mirror_num, bio_flags, 0, prev_em_start); put_page(pages[index]); } } @@ -3121,7 +3131,7 @@ static void __extent_readpages(struct extent_io_tree *tree, int nr_pages, get_extent_t *get_extent, struct extent_map **em_cached, struct bio **bio, int mirror_num, - unsigned long *bio_flags, int rw, + unsigned long *bio_flags, u64 *prev_em_start) { u64 start = 0; @@ -3143,7 +3153,7 @@ static void __extent_readpages(struct extent_io_tree *tree, index - first_index, start, end, get_extent, em_cached, bio, mirror_num, bio_flags, - rw, prev_em_start); + prev_em_start); start = page_start; end = start + PAGE_SIZE - 1; first_index = index; @@ -3154,7 +3164,7 @@ static void __extent_readpages(struct extent_io_tree *tree, __do_contiguous_readpages(tree, &pages[first_index], index - first_index, start, end, get_extent, em_cached, bio, - mirror_num, bio_flags, rw, + mirror_num, bio_flags, prev_em_start); } @@ -3162,7 +3172,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree, struct page *page, get_extent_t *get_extent, struct bio **bio, int mirror_num, - unsigned long *bio_flags, int rw) + unsigned long *bio_flags, int read_flags) { struct inode *inode = page->mapping->host; struct btrfs_ordered_extent *ordered; @@ -3182,7 +3192,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree, } ret = __do_readpage(tree, page, get_extent, NULL, bio, mirror_num, - bio_flags, rw, NULL); + bio_flags, read_flags, NULL); return ret; } @@ -3194,20 +3204,16 @@ int 
extent_read_full_page(struct extent_io_tree *tree, struct page *page, int ret; ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num, - &bio_flags, READ); + &bio_flags, 0); if (bio) - ret = submit_one_bio(READ, bio, mirror_num, bio_flags); + ret = submit_one_bio(bio, mirror_num, bio_flags); return ret; } -static noinline void update_nr_written(struct page *page, - struct writeback_control *wbc, - unsigned long nr_written) +static void update_nr_written(struct page *page, struct writeback_control *wbc, + unsigned long nr_written) { wbc->nr_to_write -= nr_written; - if (wbc->range_cyclic || (wbc->nr_to_write > 0 && - wbc->range_start == 0 && wbc->range_end == LLONG_MAX)) - page->mapping->writeback_index = page->index + nr_written; } /* @@ -3368,6 +3374,8 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, while (cur <= end) { u64 em_end; + unsigned long max_nr; + if (cur >= i_size) { if (tree->ops && tree->ops->writepage_end_io_hook) tree->ops->writepage_end_io_hook(page, cur, @@ -3423,32 +3431,23 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, continue; } - if (tree->ops && tree->ops->writepage_io_hook) { - ret = tree->ops->writepage_io_hook(page, cur, - cur + iosize - 1); - } else { - ret = 0; + max_nr = (i_size >> PAGE_SHIFT) + 1; + + set_range_writeback(tree, cur, cur + iosize - 1); + if (!PageWriteback(page)) { + btrfs_err(BTRFS_I(inode)->root->fs_info, + "page %lu not writeback, cur %llu end %llu", + page->index, cur, end); } - if (ret) { - SetPageError(page); - } else { - unsigned long max_nr = (i_size >> PAGE_SHIFT) + 1; - set_range_writeback(tree, cur, cur + iosize - 1); - if (!PageWriteback(page)) { - btrfs_err(BTRFS_I(inode)->root->fs_info, - "page %lu not writeback, cur %llu end %llu", - page->index, cur, end); - } + ret = submit_extent_page(REQ_OP_WRITE, write_flags, tree, wbc, + page, sector, iosize, pg_offset, + bdev, &epd->bio, max_nr, + end_bio_extent_writepage, + 0, 0, 0, false); + if (ret) + SetPageError(page); - ret = submit_extent_page(write_flags, tree, wbc, page, - sector, iosize, pg_offset, - bdev, &epd->bio, max_nr, - end_bio_extent_writepage, - 0, 0, 0, false); - if (ret) - SetPageError(page); - } cur = cur + iosize; pg_offset += iosize; nr++; @@ -3481,13 +3480,11 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, size_t pg_offset = 0; loff_t i_size = i_size_read(inode); unsigned long end_index = i_size >> PAGE_SHIFT; - int write_flags; + int write_flags = 0; unsigned long nr_written = 0; if (wbc->sync_mode == WB_SYNC_ALL) write_flags = WRITE_SYNC; - else - write_flags = WRITE; trace___extent_writepage(page, inode, wbc); @@ -3731,7 +3728,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, u64 offset = eb->start; unsigned long i, num_pages; unsigned long bio_flags = 0; - int rw = (epd->sync_io ? WRITE_SYNC : WRITE) | REQ_META; + int write_flags = (epd->sync_io ? 
WRITE_SYNC : 0) | REQ_META; int ret = 0; clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags); @@ -3745,9 +3742,10 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, clear_page_dirty_for_io(p); set_page_writeback(p); - ret = submit_extent_page(rw, tree, wbc, p, offset >> 9, - PAGE_SIZE, 0, bdev, &epd->bio, - -1, end_bio_extent_buffer_writepage, + ret = submit_extent_page(REQ_OP_WRITE, write_flags, tree, wbc, + p, offset >> 9, PAGE_SIZE, 0, bdev, + &epd->bio, -1, + end_bio_extent_buffer_writepage, 0, epd->bio_flags, bio_flags, false); epd->bio_flags = bio_flags; if (ret) { @@ -3920,12 +3918,13 @@ static int extent_write_cache_pages(struct extent_io_tree *tree, struct inode *inode = mapping->host; int ret = 0; int done = 0; - int err = 0; int nr_to_write_done = 0; struct pagevec pvec; int nr_pages; pgoff_t index; pgoff_t end; /* Inclusive */ + pgoff_t done_index; + int range_whole = 0; int scanned = 0; int tag; @@ -3948,6 +3947,8 @@ static int extent_write_cache_pages(struct extent_io_tree *tree, } else { index = wbc->range_start >> PAGE_SHIFT; end = wbc->range_end >> PAGE_SHIFT; + if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) + range_whole = 1; scanned = 1; } if (wbc->sync_mode == WB_SYNC_ALL) @@ -3957,6 +3958,7 @@ static int extent_write_cache_pages(struct extent_io_tree *tree, retry: if (wbc->sync_mode == WB_SYNC_ALL) tag_pages_for_writeback(mapping, index, end); + done_index = index; while (!done && !nr_to_write_done && (index <= end) && (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { @@ -3966,6 +3968,7 @@ retry: for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; + done_index = page->index; /* * At this point we hold neither mapping->tree_lock nor * lock on the page itself: the page may be truncated or @@ -4007,8 +4010,20 @@ retry: unlock_page(page); ret = 0; } - if (!err && ret < 0) - err = ret; + if (ret < 0) { + /* + * done_index is set past this page, + * so media errors will not choke + * background writeout for the entire + * file. This has consequences for + * range_cyclic semantics (ie. it may + * not be suitable for data integrity + * writeout). + */ + done_index = page->index + 1; + done = 1; + break; + } /* * the filesystem may choose to bump up nr_to_write. @@ -4020,7 +4035,7 @@ retry: pagevec_release(&pvec); cond_resched(); } - if (!scanned && !done && !err) { + if (!scanned && !done) { /* * We hit the last page and there is more work to be done: wrap * back to the start of the file @@ -4029,20 +4044,23 @@ retry: index = 0; goto retry; } + + if (wbc->range_cyclic || (wbc->nr_to_write > 0 && range_whole)) + mapping->writeback_index = done_index; + btrfs_add_delayed_iput(inode); - return err; + return ret; } static void flush_epd_write_bio(struct extent_page_data *epd) { if (epd->bio) { - int rw = WRITE; int ret; - if (epd->sync_io) - rw = WRITE_SYNC; + bio_set_op_attrs(epd->bio, REQ_OP_WRITE, + epd->sync_io ? 
WRITE_SYNC : 0); - ret = submit_one_bio(rw, epd->bio, 0, epd->bio_flags); + ret = submit_one_bio(epd->bio, 0, epd->bio_flags); BUG_ON(ret < 0); /* -ENOMEM */ epd->bio = NULL; } @@ -4160,7 +4178,8 @@ int extent_readpages(struct extent_io_tree *tree, prefetchw(&page->flags); list_del(&page->lru); if (add_to_page_cache_lru(page, mapping, - page->index, GFP_NOFS)) { + page->index, + readahead_gfp_mask(mapping))) { put_page(page); continue; } @@ -4169,19 +4188,19 @@ int extent_readpages(struct extent_io_tree *tree, if (nr < ARRAY_SIZE(pagepool)) continue; __extent_readpages(tree, pagepool, nr, get_extent, &em_cached, - &bio, 0, &bio_flags, READ, &prev_em_start); + &bio, 0, &bio_flags, &prev_em_start); nr = 0; } if (nr) __extent_readpages(tree, pagepool, nr, get_extent, &em_cached, - &bio, 0, &bio_flags, READ, &prev_em_start); + &bio, 0, &bio_flags, &prev_em_start); if (em_cached) free_extent_map(em_cached); BUG_ON(!list_empty(pages)); if (bio) - return submit_one_bio(READ, bio, 0, bio_flags); + return submit_one_bio(bio, 0, bio_flags); return 0; } @@ -4379,8 +4398,12 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, if (ret < 0) { btrfs_free_path(path); return ret; + } else { + WARN_ON(!ret); + if (ret == 1) + ret = 0; } - WARN_ON(!ret); + path->slots[0]--; btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); found_type = found_key.type; @@ -4591,7 +4614,7 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb) if (mapped) spin_unlock(&page->mapping->private_lock); - /* One for when we alloced the page */ + /* One for when we allocated the page */ put_page(page); } while (index != 0); } @@ -4704,16 +4727,16 @@ err: } struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, - u64 start) + u64 start, u32 nodesize) { unsigned long len; if (!fs_info) { /* * Called only from tests that don't always have a fs_info - * available, but we know that nodesize is 4096 + * available */ - len = 4096; + len = nodesize; } else { len = fs_info->tree_root->nodesize; } @@ -4809,7 +4832,7 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, - u64 start) + u64 start, u32 nodesize) { struct extent_buffer *eb, *exists = NULL; int ret; @@ -4817,12 +4840,12 @@ struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, eb = find_extent_buffer(fs_info, start); if (eb) return eb; - eb = alloc_dummy_extent_buffer(fs_info, start); + eb = alloc_dummy_extent_buffer(fs_info, start, nodesize); if (!eb) return NULL; eb->fs_info = fs_info; again: - ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); + ret = radix_tree_preload(GFP_NOFS); if (ret) goto free_eb; spin_lock(&fs_info->buffer_lock); @@ -4868,18 +4891,25 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, int uptodate = 1; int ret; + if (!IS_ALIGNED(start, fs_info->tree_root->sectorsize)) { + btrfs_err(fs_info, "bad tree block start %llu", start); + return ERR_PTR(-EINVAL); + } + eb = find_extent_buffer(fs_info, start); if (eb) return eb; eb = __alloc_extent_buffer(fs_info, start, len); if (!eb) - return NULL; + return ERR_PTR(-ENOMEM); for (i = 0; i < num_pages; i++, index++) { p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL); - if (!p) + if (!p) { + exists = ERR_PTR(-ENOMEM); goto free_eb; + } spin_lock(&mapping->private_lock); if (PagePrivate(p)) { @@ -4923,9 +4953,11 @@ struct extent_buffer 
*alloc_extent_buffer(struct btrfs_fs_info *fs_info, if (uptodate) set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); again: - ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); - if (ret) + ret = radix_tree_preload(GFP_NOFS); + if (ret) { + exists = ERR_PTR(ret); goto free_eb; + } spin_lock(&fs_info->buffer_lock); ret = radix_tree_insert(&fs_info->buffer_radix, @@ -5203,7 +5235,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, err = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num, &bio_flags, - READ | REQ_META); + REQ_META); if (err) ret = err; } else { @@ -5212,8 +5244,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, } if (bio) { - err = submit_one_bio(READ | REQ_META, bio, mirror_num, - bio_flags); + err = submit_one_bio(bio, mirror_num, bio_flags); if (err) return err; } @@ -5309,6 +5340,11 @@ int read_extent_buffer_to_user(struct extent_buffer *eb, void __user *dstv, return ret; } +/* + * return 0 if the item is found within a page. + * return 1 if the item spans two pages. + * return -EINVAL otherwise. + */ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start, unsigned long min_len, char **map, unsigned long *map_start, @@ -5323,7 +5359,7 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start, PAGE_SHIFT; if (i != end_i) - return -EINVAL; + return 1; if (i == 0) { offset = start_offset; @@ -5751,7 +5787,7 @@ int try_release_extent_buffer(struct page *page) struct extent_buffer *eb; /* - * We need to make sure noboody is attaching this page to an eb right + * We need to make sure nobody is attaching this page to an eb right * now. */ spin_lock(&page->mapping->private_lock); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index b5e0ade90e88..bc2729a7612d 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -63,17 +63,16 @@ struct btrfs_root; struct btrfs_io_bio; struct io_failure_record; -typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw, - struct bio *bio, int mirror_num, - unsigned long bio_flags, u64 bio_offset); +typedef int (extent_submit_bio_hook_t)(struct inode *inode, struct bio *bio, + int mirror_num, unsigned long bio_flags, + u64 bio_offset); struct extent_io_ops { int (*fill_delalloc)(struct inode *inode, struct page *locked_page, u64 start, u64 end, int *page_started, unsigned long *nr_written); int (*writepage_start_hook)(struct page *page, u64 start, u64 end); - int (*writepage_io_hook)(struct page *page, u64 start, u64 end); extent_submit_bio_hook_t *submit_bio_hook; - int (*merge_bio_hook)(int rw, struct page *page, unsigned long offset, + int (*merge_bio_hook)(struct page *page, unsigned long offset, size_t size, struct bio *bio, unsigned long bio_flags); int (*readpage_io_failed_hook)(struct page *page, int failed_mirror); @@ -221,8 +220,7 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, int filled, struct extent_state *cached_state); int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, gfp_t mask, - struct extent_changeset *changeset); + unsigned bits, struct extent_changeset *changeset); int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, int wake, int delete, struct extent_state **cached, gfp_t mask); @@ -241,27 +239,27 @@ static inline int unlock_extent_cached(struct extent_io_tree *tree, u64 start, } static inline int clear_extent_bits(struct extent_io_tree *tree, u64 start, - u64 end, unsigned bits, gfp_t mask) + u64 end, 
unsigned bits) { int wake = 0; if (bits & EXTENT_LOCKED) wake = 1; - return clear_extent_bit(tree, start, end, bits, wake, 0, NULL, mask); + return clear_extent_bit(tree, start, end, bits, wake, 0, NULL, + GFP_NOFS); } int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, gfp_t mask, - struct extent_changeset *changeset); + unsigned bits, struct extent_changeset *changeset); int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, u64 *failed_start, struct extent_state **cached_state, gfp_t mask); static inline int set_extent_bits(struct extent_io_tree *tree, u64 start, - u64 end, unsigned bits, gfp_t mask) + u64 end, unsigned bits) { - return set_extent_bit(tree, start, end, bits, NULL, NULL, mask); + return set_extent_bit(tree, start, end, bits, NULL, NULL, GFP_NOFS); } static inline int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, @@ -279,37 +277,38 @@ static inline int set_extent_dirty(struct extent_io_tree *tree, u64 start, } static inline int clear_extent_dirty(struct extent_io_tree *tree, u64 start, - u64 end, gfp_t mask) + u64 end) { return clear_extent_bit(tree, start, end, EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask); + EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_NOFS); } int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, unsigned clear_bits, - struct extent_state **cached_state, gfp_t mask); + struct extent_state **cached_state); static inline int set_extent_delalloc(struct extent_io_tree *tree, u64 start, - u64 end, struct extent_state **cached_state, gfp_t mask) + u64 end, struct extent_state **cached_state) { return set_extent_bit(tree, start, end, EXTENT_DELALLOC | EXTENT_UPTODATE, - NULL, cached_state, mask); + NULL, cached_state, GFP_NOFS); } static inline int set_extent_defrag(struct extent_io_tree *tree, u64 start, - u64 end, struct extent_state **cached_state, gfp_t mask) + u64 end, struct extent_state **cached_state) { return set_extent_bit(tree, start, end, EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DEFRAG, - NULL, cached_state, mask); + NULL, cached_state, GFP_NOFS); } static inline int set_extent_new(struct extent_io_tree *tree, u64 start, - u64 end, gfp_t mask) + u64 end) { - return set_extent_bit(tree, start, end, EXTENT_NEW, NULL, NULL, mask); + return set_extent_bit(tree, start, end, EXTENT_NEW, NULL, NULL, + GFP_NOFS); } static inline int set_extent_uptodate(struct extent_io_tree *tree, u64 start, @@ -349,7 +348,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, unsigned long len); struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, - u64 start); + u64 start, u32 nodesize); struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src); struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, u64 start); @@ -469,5 +468,5 @@ noinline u64 find_lock_delalloc_range(struct inode *inode, u64 *end, u64 max_bytes); #endif struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, - u64 start); + u64 start, u32 nodesize); #endif diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 318b048eb254..e0715fcfb11e 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -62,7 +62,7 @@ struct extent_map *alloc_extent_map(void) /** * free_extent_map - drop reference count of an extent_map - * @em: extent map being releasead + * 
@em: extent map being released * * Drops the reference out on @em by one and free the structure * if the reference count hits zero. diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 7a7d6e253cfc..62a81ee13a5f 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -248,7 +248,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, BTRFS_DATA_RELOC_TREE_OBJECTID) { set_extent_bits(io_tree, offset, offset + root->sectorsize - 1, - EXTENT_NODATASUM, GFP_NOFS); + EXTENT_NODATASUM); } else { btrfs_info(BTRFS_I(inode)->root->fs_info, "no csum found for inode %llu start %llu", diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index ea9f10bb089c..2234e88cf674 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1534,30 +1534,30 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, reserve_bytes = round_up(write_bytes + sector_offset, root->sectorsize); - if ((BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | - BTRFS_INODE_PREALLOC)) && - check_can_nocow(inode, pos, &write_bytes) > 0) { - /* - * For nodata cow case, no need to reserve - * data space. - */ - only_release_metadata = true; - /* - * our prealloc extent may be smaller than - * write_bytes, so scale down. - */ - num_pages = DIV_ROUND_UP(write_bytes + offset, - PAGE_SIZE); - reserve_bytes = round_up(write_bytes + sector_offset, - root->sectorsize); - goto reserve_metadata; - } - ret = btrfs_check_data_free_space(inode, pos, write_bytes); - if (ret < 0) - break; + if (ret < 0) { + if ((BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | + BTRFS_INODE_PREALLOC)) && + check_can_nocow(inode, pos, &write_bytes) > 0) { + /* + * For nodata cow case, no need to reserve + * data space. + */ + only_release_metadata = true; + /* + * our prealloc extent may be smaller than + * write_bytes, so scale down. + */ + num_pages = DIV_ROUND_UP(write_bytes + offset, + PAGE_SIZE); + reserve_bytes = round_up(write_bytes + + sector_offset, + root->sectorsize); + } else { + break; + } + } -reserve_metadata: ret = btrfs_delalloc_reserve_metadata(inode, reserve_bytes); if (ret) { if (!only_release_metadata) @@ -1596,6 +1596,13 @@ again: copied = btrfs_copy_from_user(pos, write_bytes, pages, i); + num_sectors = BTRFS_BYTES_TO_BLKS(root->fs_info, + reserve_bytes); + dirty_sectors = round_up(copied + sector_offset, + root->sectorsize); + dirty_sectors = BTRFS_BYTES_TO_BLKS(root->fs_info, + dirty_sectors); + /* * if we have trouble faulting in the pages, fall * back to one page at a time @@ -1605,6 +1612,7 @@ again: if (copied == 0) { force_page_uptodate = true; + dirty_sectors = 0; dirty_pages = 0; } else { force_page_uptodate = false; @@ -1615,20 +1623,19 @@ again: /* * If we had a short copy we need to release the excess delaloc * bytes we reserved. We need to increment outstanding_extents - * because btrfs_delalloc_release_space will decrement it, but + * because btrfs_delalloc_release_space and + * btrfs_delalloc_release_metadata will decrement it, but * we still have an outstanding extent for the chunk we actually * managed to copy. 
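The short-copy accounting above is easier to follow in isolation. Below is a minimal user-space sketch of the same sector math; BTRFS_BYTES_TO_BLKS is modeled as a plain division, and the 4096-byte sectorsize plus the sample sizes are assumptions chosen for illustration, not values from the patch.

#include <stdint.h>
#include <stdio.h>

#define SECTORSIZE 4096ULL

static uint64_t round_up(uint64_t x, uint64_t a)   { return (x + a - 1) / a * a; }
static uint64_t round_down(uint64_t x, uint64_t a) { return x / a * a; }
static uint64_t bytes_to_blks(uint64_t b)          { return b / SECTORSIZE; }

int main(void)
{
        uint64_t sector_offset = 512;     /* write starts mid-sector */
        uint64_t write_bytes   = 20000;   /* what we asked to copy */
        uint64_t copied        = 9000;    /* short copy */

        uint64_t reserve_bytes = round_up(write_bytes + sector_offset, SECTORSIZE);
        uint64_t num_sectors   = bytes_to_blks(reserve_bytes);
        uint64_t dirty_sectors = bytes_to_blks(round_up(copied + sector_offset,
                                                        SECTORSIZE));

        if (num_sectors > dirty_sectors) {
                /* round down: partial blocks were still sent through IO */
                uint64_t release = round_down(reserve_bytes - copied, SECTORSIZE);
                printf("reserved %llu bytes, dirtied %llu sectors, release %llu bytes\n",
                       (unsigned long long)reserve_bytes,
                       (unsigned long long)dirty_sectors,
                       (unsigned long long)release);
        }
        return 0;
}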
*/ - num_sectors = BTRFS_BYTES_TO_BLKS(root->fs_info, - reserve_bytes); - dirty_sectors = round_up(copied + sector_offset, - root->sectorsize); - dirty_sectors = BTRFS_BYTES_TO_BLKS(root->fs_info, - dirty_sectors); - if (num_sectors > dirty_sectors) { - release_bytes = (write_bytes - copied) - & ~((u64)root->sectorsize - 1); + /* + * we round down because we don't want to count + * any partial blocks actually sent through the + * IO machines + */ + release_bytes = round_down(release_bytes - copied, + root->sectorsize); if (copied > 0) { spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->outstanding_extents++; @@ -1696,7 +1703,9 @@ again: btrfs_end_write_no_snapshoting(root); btrfs_delalloc_release_metadata(inode, release_bytes); } else { - btrfs_delalloc_release_space(inode, pos, release_bytes); + btrfs_delalloc_release_space(inode, + round_down(pos, root->sectorsize), + release_bytes); } } @@ -2020,7 +2029,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) BTRFS_I(inode)->last_trans <= root->fs_info->last_trans_committed)) { /* - * We'v had everything committed since the last time we were + * We've had everything committed since the last time we were * modified so clear this flag in case it was set for whatever * reason, it's no longer relevant. */ @@ -2368,7 +2377,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) /* Check the aligned pages after the first unaligned page, * if offset != orig_start, which means the first unaligned page - * including serveral following pages are already in holes, + * including several following pages are already in holes, * the extra check can be skipped */ if (offset == orig_start) { /* after truncate page, check hole again */ @@ -2952,7 +2961,7 @@ const struct file_operations btrfs_file_operations = { .fallocate = btrfs_fallocate, .unlocked_ioctl = btrfs_ioctl, #ifdef CONFIG_COMPAT - .compat_ioctl = btrfs_ioctl, + .compat_ioctl = btrfs_compat_ioctl, #endif .copy_file_range = btrfs_copy_file_range, .clone_file_range = btrfs_clone_file_range, diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 5e6062c26129..69d270f6602c 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -29,7 +29,7 @@ #include "inode-map.h" #include "volumes.h" -#define BITS_PER_BITMAP (PAGE_SIZE * 8) +#define BITS_PER_BITMAP (PAGE_SIZE * 8UL) #define MAX_CACHE_BYTES_PER_GIG SZ_32K struct btrfs_trim_range { @@ -1415,11 +1415,11 @@ static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl, u64 offset) { u64 bitmap_start; - u32 bytes_per_bitmap; + u64 bytes_per_bitmap; bytes_per_bitmap = BITS_PER_BITMAP * ctl->unit; bitmap_start = offset - ctl->start; - bitmap_start = div_u64(bitmap_start, bytes_per_bitmap); + bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap); bitmap_start *= bytes_per_bitmap; bitmap_start += ctl->start; @@ -1638,10 +1638,10 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl) u64 bitmap_bytes; u64 extent_bytes; u64 size = block_group->key.offset; - u32 bytes_per_bg = BITS_PER_BITMAP * ctl->unit; - u32 max_bitmaps = div_u64(size + bytes_per_bg - 1, bytes_per_bg); + u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit; + u64 max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg); - max_bitmaps = max_t(u32, max_bitmaps, 1); + max_bitmaps = max_t(u64, max_bitmaps, 1); ASSERT(ctl->total_bitmaps <= max_bitmaps); @@ -1660,7 +1660,7 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl) * sure we don't go over our 
overall goal of MAX_CACHE_BYTES_PER_GIG as * we add more bitmaps. */ - bitmap_bytes = (ctl->total_bitmaps + 1) * PAGE_SIZE; + bitmap_bytes = (ctl->total_bitmaps + 1) * ctl->unit; if (bitmap_bytes >= max_bytes) { ctl->extents_thresh = 0; @@ -1983,7 +1983,7 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl, /* * If this block group has some small extents we don't want to * use up all of our free slots in the cache with them, we want - * to reserve them to larger extents, however if we have plent + * to reserve them to larger extents, however if we have plenty * of cache left then go ahead an dadd them, no sense in adding * the overhead of a bitmap if we don't have to. */ @@ -3662,7 +3662,7 @@ have_info: if (tmp->offset + tmp->bytes < offset) break; if (offset + bytes < tmp->offset) { - n = rb_prev(&info->offset_index); + n = rb_prev(&tmp->offset_index); continue; } info = tmp; @@ -3676,7 +3676,7 @@ have_info: if (offset + bytes < tmp->offset) break; if (tmp->offset + tmp->bytes < offset) { - n = rb_next(&info->offset_index); + n = rb_next(&tmp->offset_index); continue; } info = tmp; diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index 33178c490ace..3af651c2bbc7 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -123,7 +123,7 @@ int btrfs_return_cluster_to_free_space( int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, u64 *trimmed, u64 start, u64 end, u64 minlen); -/* Support functions for runnint our sanity tests */ +/* Support functions for running our sanity tests */ #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS int test_add_free_space_entry(struct btrfs_block_group_cache *cache, u64 offset, u64 bytes, bool bitmap); diff --git a/fs/btrfs/hash.c b/fs/btrfs/hash.c index aae520b2aee5..a97fdc156a03 100644 --- a/fs/btrfs/hash.c +++ b/fs/btrfs/hash.c @@ -24,6 +24,11 @@ int __init btrfs_hash_init(void) return PTR_ERR_OR_ZERO(tfm); } +const char* btrfs_crc32c_impl(void) +{ + return crypto_tfm_alg_driver_name(crypto_shash_tfm(tfm)); +} + void btrfs_hash_exit(void) { crypto_free_shash(tfm); diff --git a/fs/btrfs/hash.h b/fs/btrfs/hash.h index 118a2316e5d3..c3a2ec554361 100644 --- a/fs/btrfs/hash.h +++ b/fs/btrfs/hash.h @@ -22,6 +22,7 @@ int __init btrfs_hash_init(void); void btrfs_hash_exit(void); +const char* btrfs_crc32c_impl(void); u32 btrfs_crc32c(u32 crc, const void *address, unsigned int length); diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index be4d22a5022f..b8acc07ac6c2 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -157,7 +157,7 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, */ if (!btrfs_find_name_in_ext_backref(path, ref_objectid, name, name_len, &extref)) { - btrfs_std_error(root->fs_info, -ENOENT, NULL); + btrfs_handle_fs_error(root->fs_info, -ENOENT, NULL); ret = -EROFS; goto out; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 6b7fe291a174..df731c0ebec7 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -455,7 +455,7 @@ again: /* * skip compression for a small file range(<=blocksize) that - * isn't an inline extent, since it dosen't save disk space at all. + * isn't an inline extent, since it doesn't save disk space at all. */ if (total_compressed <= blocksize && (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size)) @@ -824,6 +824,7 @@ retry: async_extent->ram_size - 1, 0); goto out_free_reserve; } + btrfs_dec_block_group_reservations(root->fs_info, ins.objectid); /* * clear dirty, set writeback and unlock the pages. 
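The free-space-cache type changes above (the UL suffix on BITS_PER_BITMAP, bytes_per_bitmap widened to u64, div64_u64) matter once PAGE_SIZE * 8 * ctl->unit no longer fits in 32 bits. A small stand-alone demonstration; the 64K page size and 64K unit are assumptions chosen to trigger the wrap:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t unit = 65536;   /* e.g. a 64K sectorsize */

        /* old form: 32-bit intermediate, 2^35 wraps to exactly 0 */
        uint32_t narrow = (uint32_t)(65536u * 8u) * (uint32_t)unit;
        /* fixed form: the UL literal promotes the whole product */
        uint64_t wide   = (65536UL * 8UL) * unit;

        printf("32-bit product: %u\n", (unsigned)narrow);          /* 0 */
        printf("64-bit product: %llu\n", (unsigned long long)wide); /* 34359738368 */
        return 0;
}

A bytes_per_bitmap of zero would feed a division by zero in offset_to_bitmap(), which is why the widening matters beyond mere precision.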
@@ -861,6 +862,7 @@ retry: } return; out_free_reserve: + btrfs_dec_block_group_reservations(root->fs_info, ins.objectid); btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); out_free: extent_clear_unlock_delalloc(inode, async_extent->start, @@ -1038,6 +1040,8 @@ static noinline int cow_file_range(struct inode *inode, goto out_drop_extent_cache; } + btrfs_dec_block_group_reservations(root->fs_info, ins.objectid); + if (disk_num_bytes < cur_alloc_size) break; @@ -1066,6 +1070,7 @@ out: out_drop_extent_cache: btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0); out_reserve: + btrfs_dec_block_group_reservations(root->fs_info, ins.objectid); btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); out_unlock: extent_clear_unlock_delalloc(inode, start, end, locked_page, @@ -1377,6 +1382,9 @@ next_slot: */ if (csum_exist_in_range(root, disk_bytenr, num_bytes)) goto out_check; + if (!btrfs_inc_nocow_writers(root->fs_info, + disk_bytenr)) + goto out_check; nocow = 1; } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { extent_end = found_key.offset + @@ -1391,6 +1399,9 @@ out_check: path->slots[0]++; if (!nolock && nocow) btrfs_end_write_no_snapshoting(root); + if (nocow) + btrfs_dec_nocow_writers(root->fs_info, + disk_bytenr); goto next_slot; } if (!nocow) { @@ -1411,6 +1422,9 @@ out_check: if (ret) { if (!nolock && nocow) btrfs_end_write_no_snapshoting(root); + if (nocow) + btrfs_dec_nocow_writers(root->fs_info, + disk_bytenr); goto error; } cow_start = (u64)-1; @@ -1453,6 +1467,8 @@ out_check: ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr, num_bytes, num_bytes, type); + if (nocow) + btrfs_dec_nocow_writers(root->fs_info, disk_bytenr); BUG_ON(ret); /* -ENOMEM */ if (root->root_key.objectid == @@ -1807,7 +1823,7 @@ static void btrfs_clear_bit_hook(struct inode *inode, * extent_io.c merge_bio_hook, this must check the chunk tree to make sure * we don't create bios that span stripes or chunks */ -int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset, +int btrfs_merge_bio_hook(struct page *page, unsigned long offset, size_t size, struct bio *bio, unsigned long bio_flags) { @@ -1822,7 +1838,7 @@ int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset, length = bio->bi_iter.bi_size; map_length = length; - ret = btrfs_map_block(root->fs_info, rw, logical, + ret = btrfs_map_block(root->fs_info, bio_op(bio), logical, &map_length, NULL, 0); /* Will always return 0 with map_multi == NULL */ BUG_ON(ret < 0); @@ -1839,9 +1855,8 @@ int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset, * At IO completion time the cums attached on the ordered extent record * are inserted into the btree */ -static int __btrfs_submit_bio_start(struct inode *inode, int rw, - struct bio *bio, int mirror_num, - unsigned long bio_flags, +static int __btrfs_submit_bio_start(struct inode *inode, struct bio *bio, + int mirror_num, unsigned long bio_flags, u64 bio_offset) { struct btrfs_root *root = BTRFS_I(inode)->root; @@ -1860,14 +1875,14 @@ static int __btrfs_submit_bio_start(struct inode *inode, int rw, * At IO completion time the cums attached on the ordered extent record * are inserted into the btree */ -static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio, +static int __btrfs_submit_bio_done(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags, u64 bio_offset) { struct btrfs_root *root = BTRFS_I(inode)->root; int ret; - ret = btrfs_map_bio(root, rw, bio, mirror_num, 1); + ret = 
btrfs_map_bio(root, bio, mirror_num, 1); if (ret) { bio->bi_error = ret; bio_endio(bio); @@ -1879,7 +1894,7 @@ static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio, * extent_io.c submission hook. This does the right thing for csum calculation * on write, or reading the csums from the tree before a read */ -static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, +static int btrfs_submit_bio_hook(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags, u64 bio_offset) { @@ -1894,7 +1909,7 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, if (btrfs_is_free_space_inode(inode)) metadata = BTRFS_WQ_ENDIO_FREE_SPACE; - if (!(rw & REQ_WRITE)) { + if (bio_op(bio) != REQ_OP_WRITE) { ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata); if (ret) goto out; @@ -1916,7 +1931,7 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, goto mapit; /* we're doing a write, do the async checksumming */ ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, - inode, rw, bio, mirror_num, + inode, bio, mirror_num, bio_flags, bio_offset, __btrfs_submit_bio_start, __btrfs_submit_bio_done); @@ -1928,7 +1943,7 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, } mapit: - ret = btrfs_map_bio(root, rw, bio, mirror_num, 0); + ret = btrfs_map_bio(root, bio, mirror_num, 0); out: if (ret < 0) { @@ -1962,7 +1977,7 @@ int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, { WARN_ON((end & (PAGE_SIZE - 1)) == 0); return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end, - cached_state, GFP_NOFS); + cached_state); } /* see btrfs_writepage_start_hook for details on why this is required */ @@ -3103,8 +3118,7 @@ static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio, if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID && test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) { - clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM, - GFP_NOFS); + clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM); return 0; } @@ -3256,7 +3270,16 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) /* grab metadata reservation from transaction handle */ if (reserve) { ret = btrfs_orphan_reserve_metadata(trans, inode); - BUG_ON(ret); /* -ENOSPC in reservation; Logic error? JDM */ + ASSERT(!ret); + if (ret) { + atomic_dec(&root->orphan_inodes); + clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED, + &BTRFS_I(inode)->runtime_flags); + if (insert) + clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, + &BTRFS_I(inode)->runtime_flags); + return ret; + } } /* insert an orphan item to track this unlinked/truncated file */ @@ -3706,7 +3729,7 @@ cache_index: * and doesn't have an inode ref with the name "bar" anymore. * * Setting last_unlink_trans to last_trans is a pessimistic approach, - * but it guarantees correctness at the expense of ocassional full + * but it guarantees correctness at the expense of occasional full * transaction commits on fsync if our inode is a directory, or if our * inode is not a directory, logging its parent unnecessarily. 
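The rw-to-bio_op() conversions in the submit hooks above all follow one pattern: the operation is read back from the bio itself instead of being tested as a REQ_WRITE bit in a separately threaded rw argument. A simplified user-space model of that split; the struct layout and REQ_OP numbering here are illustrative, not the block layer's actual definitions:

#include <stdio.h>

enum req_op { REQ_OP_READ = 0, REQ_OP_WRITE = 1 };

struct bio_model {
        enum req_op   op;      /* what to do */
        unsigned long flags;   /* how to do it (sync, meta, ...) */
};

static enum req_op bio_op(const struct bio_model *bio)
{
        return bio->op;
}

static void submit_hook(const struct bio_model *bio)
{
        if (bio_op(bio) != REQ_OP_WRITE)
                printf("read path: queue end_io work\n");
        else
                printf("write path: async checksumming\n");
}

int main(void)
{
        struct bio_model r = { REQ_OP_READ, 0 }, w = { REQ_OP_WRITE, 0 };
        submit_hook(&r);
        submit_hook(&w);
        return 0;
}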
*/ @@ -4534,6 +4557,7 @@ delete: BUG_ON(ret); if (btrfs_should_throttle_delayed_refs(trans, root)) btrfs_async_run_delayed_refs(root, + trans->transid, trans->delayed_ref_updates * 2, 0); if (be_nice) { if (truncate_space_check(trans, root, @@ -4962,7 +4986,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) * be instantly completed which will give us extents that need * to be truncated. If we fail to get an orphan inode down we * could have left over extents that were never meant to live, - * so we need to garuntee from this point on that everything + * so we need to guarantee from this point on that everything * will be consistent. */ ret = btrfs_orphan_add(trans, inode); @@ -5232,7 +5256,7 @@ void btrfs_evict_inode(struct inode *inode) } /* - * We can't just steal from the global reserve, we need tomake + * We can't just steal from the global reserve, we need to make * sure there is room to do it, if not we need to commit and try * again. */ @@ -5733,6 +5757,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) int name_len; int is_curr = 0; /* ctx->pos points to the current index? */ bool emitted; + bool put = false; /* FIXME, use a real flag for deciding about the key type */ if (root->fs_info->tree_root == root) @@ -5750,7 +5775,8 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) if (key_type == BTRFS_DIR_INDEX_KEY) { INIT_LIST_HEAD(&ins_list); INIT_LIST_HEAD(&del_list); - btrfs_get_delayed_items(inode, &ins_list, &del_list); + put = btrfs_readdir_get_delayed_items(inode, &ins_list, + &del_list); } key.type = key_type; @@ -5897,8 +5923,8 @@ next: nopos: ret = 0; err: - if (key_type == BTRFS_DIR_INDEX_KEY) - btrfs_put_delayed_items(&ins_list, &del_list); + if (put) + btrfs_readdir_put_delayed_items(inode, &ins_list, &del_list); btrfs_free_path(path); return ret; } @@ -6964,7 +6990,18 @@ insert: * existing will always be non-NULL, since there must be * extent causing the -EEXIST. 
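The duplicate-extent-map branch above reduces to a small insert-or-reuse rule: if the map that raced into the tree covers exactly the same range and block, drop the new copy and carry on with the existing one. A toy sketch with a stand-in struct; the rb-tree and refcounting details are deliberately omitted:

#include <stdio.h>
#include <stdlib.h>

struct map { unsigned long long start, len, block_start; };

/* returns the map the caller should keep using */
static struct map *insert_or_reuse(struct map *existing, struct map *em)
{
        if (existing &&
            existing->start == em->start &&
            existing->len == em->len &&
            existing->block_start == em->block_start) {
                /* same extent raced in (happens with inlines): reuse it */
                free(em);
                return existing;
        }
        return em;   /* overlap cases need the full nearest-extent logic */
}

int main(void)
{
        struct map *in_tree = malloc(sizeof(*in_tree));
        struct map *ours    = malloc(sizeof(*ours));

        *in_tree = (struct map){ 0, 4096, 12288 };
        *ours    = (struct map){ 0, 4096, 12288 };

        printf("kept the %s copy\n",
               insert_or_reuse(in_tree, ours) == in_tree ? "existing" : "new");
        free(in_tree);
        return 0;
}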
*/ - if (start >= extent_map_end(existing) || + if (existing->start == em->start && + extent_map_end(existing) == extent_map_end(em) && + em->block_start == existing->block_start) { + /* + * these two extents are the same, it happens + * with inlines especially + */ + free_extent_map(em); + em = existing; + err = 0; + + } else if (start >= extent_map_end(existing) || start <= existing->start) { /* * The existing extent map is the one nearest to @@ -7129,6 +7166,43 @@ out: return em; } +static struct extent_map *btrfs_create_dio_extent(struct inode *inode, + const u64 start, + const u64 len, + const u64 orig_start, + const u64 block_start, + const u64 block_len, + const u64 orig_block_len, + const u64 ram_bytes, + const int type) +{ + struct extent_map *em = NULL; + int ret; + + down_read(&BTRFS_I(inode)->dio_sem); + if (type != BTRFS_ORDERED_NOCOW) { + em = create_pinned_em(inode, start, len, orig_start, + block_start, block_len, orig_block_len, + ram_bytes, type); + if (IS_ERR(em)) + goto out; + } + ret = btrfs_add_ordered_extent_dio(inode, start, block_start, + len, block_len, type); + if (ret) { + if (em) { + free_extent_map(em); + btrfs_drop_extent_cache(inode, start, + start + len - 1, 0); + } + em = ERR_PTR(ret); + } + out: + up_read(&BTRFS_I(inode)->dio_sem); + + return em; +} + static struct extent_map *btrfs_new_extent_direct(struct inode *inode, u64 start, u64 len) { @@ -7144,41 +7218,13 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode, if (ret) return ERR_PTR(ret); - /* - * Create the ordered extent before the extent map. This is to avoid - * races with the fast fsync path that would lead to it logging file - * extent items that point to disk extents that were not yet written to. - * The fast fsync path collects ordered extents into a local list and - * then collects all the new extent maps, so we must create the ordered - * extent first and make sure the fast fsync path collects any new - * ordered extents after collecting new extent maps as well. - * The fsync path simply can not rely on inode_dio_wait() because it - * causes deadlock with AIO. - */ - ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid, - ins.offset, ins.offset, 0); - if (ret) { + em = btrfs_create_dio_extent(inode, start, ins.offset, start, + ins.objectid, ins.offset, ins.offset, + ins.offset, 0); + btrfs_dec_block_group_reservations(root->fs_info, ins.objectid); + if (IS_ERR(em)) btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); - return ERR_PTR(ret); - } - - em = create_pinned_em(inode, start, ins.offset, start, ins.objectid, - ins.offset, ins.offset, ins.offset, 0); - if (IS_ERR(em)) { - struct btrfs_ordered_extent *oe; - btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); - oe = btrfs_lookup_ordered_extent(inode, start); - ASSERT(oe); - if (WARN_ON(!oe)) - return em; - set_bit(BTRFS_ORDERED_IOERR, &oe->flags); - set_bit(BTRFS_ORDERED_IO_DONE, &oe->flags); - btrfs_remove_ordered_extent(inode, oe); - /* Once for our lookup and once for the ordered extents tree. */ - btrfs_put_ordered_extent(oe); - btrfs_put_ordered_extent(oe); - } return em; } @@ -7408,7 +7454,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, cached_state); /* * We're concerned with the entire range that we're going to be - * doing DIO to, so we need to make sure theres no ordered + * doing DIO to, so we need to make sure there's no ordered * extents in this range. 
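The btrfs_create_dio_extent() helper factored out above pairs its two steps with an unwind: if adding the ordered extent fails, the pinned extent map is dropped before the error propagates, all with dio_sem held shared. A toy user-space rendition of that shape, with a pthread rwlock standing in for the rw_semaphore and placeholder step functions:

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t dio_sem = PTHREAD_RWLOCK_INITIALIZER;

static int  create_pinned_map(void)  { return 0; }   /* pretend success */
static int  add_ordered_extent(void) { return -1; }  /* pretend failure */
static void drop_map(void)           { puts("unwound extent map"); }

static int create_dio_extent(void)
{
        int ret;

        pthread_rwlock_rdlock(&dio_sem);
        ret = create_pinned_map();
        if (ret)
                goto out;
        ret = add_ordered_extent();
        if (ret)
                drop_map();   /* undo step one before reporting failure */
out:
        pthread_rwlock_unlock(&dio_sem);
        return ret;
}

int main(void)
{
        printf("create_dio_extent: %d\n", create_dio_extent());
        return 0;
}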
*/ ordered = btrfs_lookup_ordered_range(inode, lockstart, @@ -7570,7 +7616,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, if (current->journal_info) { /* * Need to pull our outstanding extents and set journal_info to NULL so - * that anything that needs to check if there's a transction doesn't get + * that anything that needs to check if there's a transaction doesn't get * confused. */ dio_data = current->journal_info; @@ -7603,7 +7649,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, * decompress it, so there will be buffering required no matter what we * do, so go ahead and fallback to buffered. * - * We return -ENOTBLK because thats what makes DIO go ahead and go back + * We return -ENOTBLK because that's what makes DIO go ahead and go back * to buffered IO. Don't blame me, this is the price we pay for using * the generic code. */ @@ -7650,24 +7696,21 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, block_start = em->block_start + (start - em->start); if (can_nocow_extent(inode, start, &len, &orig_start, - &orig_block_len, &ram_bytes) == 1) { + &orig_block_len, &ram_bytes) == 1 && + btrfs_inc_nocow_writers(root->fs_info, block_start)) { + struct extent_map *em2; + + em2 = btrfs_create_dio_extent(inode, start, len, + orig_start, block_start, + len, orig_block_len, + ram_bytes, type); + btrfs_dec_nocow_writers(root->fs_info, block_start); if (type == BTRFS_ORDERED_PREALLOC) { free_extent_map(em); - em = create_pinned_em(inode, start, len, - orig_start, - block_start, len, - orig_block_len, - ram_bytes, type); - if (IS_ERR(em)) { - ret = PTR_ERR(em); - goto unlock_err; - } + em = em2; } - - ret = btrfs_add_ordered_extent_dio(inode, start, - block_start, len, len, type); - if (ret) { - free_extent_map(em); + if (em2 && IS_ERR(em2)) { + ret = PTR_ERR(em2); goto unlock_err; } goto unlock; @@ -7746,12 +7789,12 @@ err: } static inline int submit_dio_repair_bio(struct inode *inode, struct bio *bio, - int rw, int mirror_num) + int mirror_num) { struct btrfs_root *root = BTRFS_I(inode)->root; int ret; - BUG_ON(rw & REQ_WRITE); + BUG_ON(bio_op(bio) == REQ_OP_WRITE); bio_get(bio); @@ -7760,7 +7803,7 @@ static inline int submit_dio_repair_bio(struct inode *inode, struct bio *bio, if (ret) goto err; - ret = btrfs_map_bio(root, rw, bio, mirror_num, 0); + ret = btrfs_map_bio(root, bio, mirror_num, 0); err: bio_put(bio); return ret; @@ -7811,7 +7854,7 @@ static int dio_read_error(struct inode *inode, struct bio *failed_bio, int read_mode; int ret; - BUG_ON(failed_bio->bi_rw & REQ_WRITE); + BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); ret = btrfs_get_io_failure_record(inode, start, end, &failrec); if (ret) @@ -7839,13 +7882,13 @@ static int dio_read_error(struct inode *inode, struct bio *failed_bio, free_io_failure(inode, failrec); return -EIO; } + bio_set_op_attrs(bio, REQ_OP_READ, read_mode); btrfs_debug(BTRFS_I(inode)->root->fs_info, "Repair DIO Read Error: submitting new dio read[%#x] to this_mirror=%d, in_validation=%d\n", read_mode, failrec->this_mirror, failrec->in_validation); - ret = submit_dio_repair_bio(inode, bio, read_mode, - failrec->this_mirror); + ret = submit_dio_repair_bio(inode, bio, failrec->this_mirror); if (ret) { free_io_failure(inode, failrec); bio_put(bio); @@ -8135,7 +8178,7 @@ static void btrfs_endio_direct_write(struct bio *bio) bio_put(bio); } -static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw, +static int __btrfs_submit_bio_start_direct_io(struct inode *inode, struct 
bio *bio, int mirror_num, unsigned long bio_flags, u64 offset) { @@ -8153,8 +8196,8 @@ static void btrfs_end_dio_bio(struct bio *bio) if (err) btrfs_warn(BTRFS_I(dip->inode)->root->fs_info, - "direct IO failed ino %llu rw %lu sector %#Lx len %u err no %d", - btrfs_ino(dip->inode), bio->bi_rw, + "direct IO failed ino %llu rw %d,%u sector %#Lx len %u err no %d", + btrfs_ino(dip->inode), bio_op(bio), bio->bi_rw, (unsigned long long)bio->bi_iter.bi_sector, bio->bi_iter.bi_size, err); @@ -8228,11 +8271,11 @@ static inline int btrfs_lookup_and_bind_dio_csum(struct btrfs_root *root, } static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, - int rw, u64 file_offset, int skip_sum, + u64 file_offset, int skip_sum, int async_submit) { struct btrfs_dio_private *dip = bio->bi_private; - int write = rw & REQ_WRITE; + bool write = bio_op(bio) == REQ_OP_WRITE; struct btrfs_root *root = BTRFS_I(inode)->root; int ret; @@ -8253,8 +8296,7 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, if (write && async_submit) { ret = btrfs_wq_submit_bio(root->fs_info, - inode, rw, bio, 0, 0, - file_offset, + inode, bio, 0, 0, file_offset, __btrfs_submit_bio_start_direct_io, __btrfs_submit_bio_done); goto err; @@ -8273,13 +8315,13 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, goto err; } map: - ret = btrfs_map_bio(root, rw, bio, 0, async_submit); + ret = btrfs_map_bio(root, bio, 0, async_submit); err: bio_put(bio); return ret; } -static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, +static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip, int skip_sum) { struct inode *inode = dip->inode; @@ -8298,8 +8340,8 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, int i; map_length = orig_bio->bi_iter.bi_size; - ret = btrfs_map_block(root->fs_info, rw, start_sector << 9, - &map_length, NULL, 0); + ret = btrfs_map_block(root->fs_info, bio_op(orig_bio), + start_sector << 9, &map_length, NULL, 0); if (ret) return -EIO; @@ -8319,6 +8361,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, if (!bio) return -ENOMEM; + bio_set_op_attrs(bio, bio_op(orig_bio), orig_bio->bi_rw); bio->bi_private = dip; bio->bi_end_io = btrfs_end_dio_bio; btrfs_io_bio(bio)->logical = file_offset; @@ -8338,7 +8381,7 @@ next_block: * before we're done setting it up */ atomic_inc(&dip->pending_bios); - ret = __btrfs_submit_dio_bio(bio, inode, rw, + ret = __btrfs_submit_dio_bio(bio, inode, file_offset, skip_sum, async_submit); if (ret) { @@ -8356,12 +8399,13 @@ next_block: start_sector, GFP_NOFS); if (!bio) goto out_err; + bio_set_op_attrs(bio, bio_op(orig_bio), orig_bio->bi_rw); bio->bi_private = dip; bio->bi_end_io = btrfs_end_dio_bio; btrfs_io_bio(bio)->logical = file_offset; map_length = orig_bio->bi_iter.bi_size; - ret = btrfs_map_block(root->fs_info, rw, + ret = btrfs_map_block(root->fs_info, bio_op(orig_bio), start_sector << 9, &map_length, NULL, 0); if (ret) { @@ -8381,7 +8425,7 @@ next_block: } submit: - ret = __btrfs_submit_dio_bio(bio, inode, rw, file_offset, skip_sum, + ret = __btrfs_submit_dio_bio(bio, inode, file_offset, skip_sum, async_submit); if (!ret) return 0; @@ -8401,14 +8445,14 @@ out_err: return 0; } -static void btrfs_submit_direct(int rw, struct bio *dio_bio, - struct inode *inode, loff_t file_offset) +static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode, + loff_t file_offset) { struct btrfs_dio_private *dip = NULL; struct bio *io_bio = NULL; 
struct btrfs_io_bio *btrfs_bio; int skip_sum; - int write = rw & REQ_WRITE; + bool write = (bio_op(dio_bio) == REQ_OP_WRITE); int ret = 0; skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; @@ -8459,7 +8503,7 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio, dio_data->unsubmitted_oe_range_end; } - ret = btrfs_submit_direct_hook(rw, dip, skip_sum); + ret = btrfs_submit_direct_hook(dip, skip_sum); if (!ret) return; @@ -9019,7 +9063,7 @@ static int btrfs_truncate(struct inode *inode) return ret; /* - * Yes ladies and gentelment, this is indeed ugly. The fact is we have + * Yes ladies and gentlemen, this is indeed ugly. The fact is we have * 3 things going on here * * 1) We need to reserve space for our orphan item and the space to @@ -9033,15 +9077,15 @@ static int btrfs_truncate(struct inode *inode) * space reserved in case it uses space during the truncate (thank you * very much snapshotting). * - * And we need these to all be seperate. The fact is we can use alot of + * And we need these to all be separate. The fact is we can use a lot of * space doing the truncate, and we have no earthly idea how much space - * we will use, so we need the truncate reservation to be seperate so it + * we will use, so we need the truncate reservation to be separate so it * doesn't end up using space reserved for updating the inode or * removing the orphan item. We also need to be able to stop the * transaction and start a new one, which means we need to be able to * update the inode several times, and we have no idea of knowing how * many times that will be, so we can't just reserve 1 item for the - * entirety of the opration, so that has to be done seperately as well. + * entirety of the operation, so that has to be done separately as well. * Then there is the orphan item, which does indeed need to be held on * to for the whole operation, and we need nobody to touch this reserved * space except the orphan code. @@ -9230,6 +9274,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) INIT_LIST_HEAD(&ei->delalloc_inodes); INIT_LIST_HEAD(&ei->delayed_iput); RB_CLEAR_NODE(&ei->rb_node); + init_rwsem(&ei->dio_sem); return inode; } @@ -9387,10 +9432,281 @@ static int btrfs_getattr(struct vfsmount *mnt, return 0; } +static int btrfs_rename_exchange(struct inode *old_dir, + struct dentry *old_dentry, + struct inode *new_dir, + struct dentry *new_dentry) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(old_dir)->root; + struct btrfs_root *dest = BTRFS_I(new_dir)->root; + struct inode *new_inode = new_dentry->d_inode; + struct inode *old_inode = old_dentry->d_inode; + struct timespec ctime = CURRENT_TIME; + struct dentry *parent; + u64 old_ino = btrfs_ino(old_inode); + u64 new_ino = btrfs_ino(new_inode); + u64 old_idx = 0; + u64 new_idx = 0; + u64 root_objectid; + int ret; + bool root_log_pinned = false; + bool dest_log_pinned = false; + + /* we only allow rename subvolume link between subvolumes */ + if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest) + return -EXDEV; + + /* close the race window with snapshot create/destroy ioctl */ + if (old_ino == BTRFS_FIRST_FREE_OBJECTID) + down_read(&root->fs_info->subvol_sem); + if (new_ino == BTRFS_FIRST_FREE_OBJECTID) + down_read(&dest->fs_info->subvol_sem); + + /* + * We want to reserve the absolute worst case amount of items. 
So if + * both inodes are subvols and we need to unlink them then that would + * require 4 item modifications, but if they are both normal inodes it + * would require 5 item modifications, so we'll assume their normal + * inodes. So 5 * 2 is 10, plus 2 for the new links, so 12 total items + * should cover the worst case number of items we'll modify. + */ + trans = btrfs_start_transaction(root, 12); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out_notrans; + } + + /* + * We need to find a free sequence number both in the source and + * in the destination directory for the exchange. + */ + ret = btrfs_set_inode_index(new_dir, &old_idx); + if (ret) + goto out_fail; + ret = btrfs_set_inode_index(old_dir, &new_idx); + if (ret) + goto out_fail; + + BTRFS_I(old_inode)->dir_index = 0ULL; + BTRFS_I(new_inode)->dir_index = 0ULL; + + /* Reference for the source. */ + if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { + /* force full log commit if subvolume involved. */ + btrfs_set_log_full_commit(root->fs_info, trans); + } else { + btrfs_pin_log_trans(root); + root_log_pinned = true; + ret = btrfs_insert_inode_ref(trans, dest, + new_dentry->d_name.name, + new_dentry->d_name.len, + old_ino, + btrfs_ino(new_dir), old_idx); + if (ret) + goto out_fail; + } + + /* And now for the dest. */ + if (new_ino == BTRFS_FIRST_FREE_OBJECTID) { + /* force full log commit if subvolume involved. */ + btrfs_set_log_full_commit(dest->fs_info, trans); + } else { + btrfs_pin_log_trans(dest); + dest_log_pinned = true; + ret = btrfs_insert_inode_ref(trans, root, + old_dentry->d_name.name, + old_dentry->d_name.len, + new_ino, + btrfs_ino(old_dir), new_idx); + if (ret) + goto out_fail; + } + + /* Update inode version and ctime/mtime. */ + inode_inc_iversion(old_dir); + inode_inc_iversion(new_dir); + inode_inc_iversion(old_inode); + inode_inc_iversion(new_inode); + old_dir->i_ctime = old_dir->i_mtime = ctime; + new_dir->i_ctime = new_dir->i_mtime = ctime; + old_inode->i_ctime = ctime; + new_inode->i_ctime = ctime; + + if (old_dentry->d_parent != new_dentry->d_parent) { + btrfs_record_unlink_dir(trans, old_dir, old_inode, 1); + btrfs_record_unlink_dir(trans, new_dir, new_inode, 1); + } + + /* src is a subvolume */ + if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { + root_objectid = BTRFS_I(old_inode)->root->root_key.objectid; + ret = btrfs_unlink_subvol(trans, root, old_dir, + root_objectid, + old_dentry->d_name.name, + old_dentry->d_name.len); + } else { /* src is an inode */ + ret = __btrfs_unlink_inode(trans, root, old_dir, + old_dentry->d_inode, + old_dentry->d_name.name, + old_dentry->d_name.len); + if (!ret) + ret = btrfs_update_inode(trans, root, old_inode); + } + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out_fail; + } + + /* dest is a subvolume */ + if (new_ino == BTRFS_FIRST_FREE_OBJECTID) { + root_objectid = BTRFS_I(new_inode)->root->root_key.objectid; + ret = btrfs_unlink_subvol(trans, dest, new_dir, + root_objectid, + new_dentry->d_name.name, + new_dentry->d_name.len); + } else { /* dest is an inode */ + ret = __btrfs_unlink_inode(trans, dest, new_dir, + new_dentry->d_inode, + new_dentry->d_name.name, + new_dentry->d_name.len); + if (!ret) + ret = btrfs_update_inode(trans, dest, new_inode); + } + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out_fail; + } + + ret = btrfs_add_link(trans, new_dir, old_inode, + new_dentry->d_name.name, + new_dentry->d_name.len, 0, old_idx); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out_fail; + } + + ret = btrfs_add_link(trans, 
old_dir, new_inode, + old_dentry->d_name.name, + old_dentry->d_name.len, 0, new_idx); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out_fail; + } + + if (old_inode->i_nlink == 1) + BTRFS_I(old_inode)->dir_index = old_idx; + if (new_inode->i_nlink == 1) + BTRFS_I(new_inode)->dir_index = new_idx; + + if (root_log_pinned) { + parent = new_dentry->d_parent; + btrfs_log_new_name(trans, old_inode, old_dir, parent); + btrfs_end_log_trans(root); + root_log_pinned = false; + } + if (dest_log_pinned) { + parent = old_dentry->d_parent; + btrfs_log_new_name(trans, new_inode, new_dir, parent); + btrfs_end_log_trans(dest); + dest_log_pinned = false; + } +out_fail: + /* + * If we have pinned a log and an error happened, we unpin tasks + * trying to sync the log and force them to fallback to a transaction + * commit if the log currently contains any of the inodes involved in + * this rename operation (to ensure we do not persist a log with an + * inconsistent state for any of these inodes or leading to any + * inconsistencies when replayed). If the transaction was aborted, the + * abortion reason is propagated to userspace when attempting to commit + * the transaction. If the log does not contain any of these inodes, we + * allow the tasks to sync it. + */ + if (ret && (root_log_pinned || dest_log_pinned)) { + if (btrfs_inode_in_log(old_dir, root->fs_info->generation) || + btrfs_inode_in_log(new_dir, root->fs_info->generation) || + btrfs_inode_in_log(old_inode, root->fs_info->generation) || + (new_inode && + btrfs_inode_in_log(new_inode, root->fs_info->generation))) + btrfs_set_log_full_commit(root->fs_info, trans); + + if (root_log_pinned) { + btrfs_end_log_trans(root); + root_log_pinned = false; + } + if (dest_log_pinned) { + btrfs_end_log_trans(dest); + dest_log_pinned = false; + } + } + ret = btrfs_end_transaction(trans, root); +out_notrans: + if (new_ino == BTRFS_FIRST_FREE_OBJECTID) + up_read(&dest->fs_info->subvol_sem); + if (old_ino == BTRFS_FIRST_FREE_OBJECTID) + up_read(&root->fs_info->subvol_sem); + + return ret; +} + +static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *dir, + struct dentry *dentry) +{ + int ret; + struct inode *inode; + u64 objectid; + u64 index; + + ret = btrfs_find_free_ino(root, &objectid); + if (ret) + return ret; + + inode = btrfs_new_inode(trans, root, dir, + dentry->d_name.name, + dentry->d_name.len, + btrfs_ino(dir), + objectid, + S_IFCHR | WHITEOUT_MODE, + &index); + + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + return ret; + } + + inode->i_op = &btrfs_special_inode_operations; + init_special_inode(inode, inode->i_mode, + WHITEOUT_DEV); + + ret = btrfs_init_inode_security(trans, inode, dir, + &dentry->d_name); + if (ret) + goto out; + + ret = btrfs_add_nondir(trans, dir, dentry, + inode, 0, index); + if (ret) + goto out; + + ret = btrfs_update_inode(trans, root, inode); +out: + unlock_new_inode(inode); + if (ret) + inode_dec_link_count(inode); + iput(inode); + + return ret; +} + static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry) + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) { struct btrfs_trans_handle *trans; + unsigned int trans_num_items; struct btrfs_root *root = BTRFS_I(old_dir)->root; struct btrfs_root *dest = BTRFS_I(new_dir)->root; struct inode *new_inode = d_inode(new_dentry); @@ -9399,6 +9715,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, u64 
root_objectid; int ret; u64 old_ino = btrfs_ino(old_inode); + bool log_pinned = false; if (btrfs_ino(new_dir) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) return -EPERM; @@ -9449,15 +9766,21 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, * We want to reserve the absolute worst case amount of items. So if * both inodes are subvols and we need to unlink them then that would * require 4 item modifications, but if they are both normal inodes it - * would require 5 item modifications, so we'll assume their normal + * would require 5 item modifications, so we'll assume they are normal * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items * should cover the worst case number of items we'll modify. + * If our rename has the whiteout flag, we need more 5 units for the + * new inode (1 inode item, 1 inode ref, 2 dir items and 1 xattr item + * when selinux is enabled). */ - trans = btrfs_start_transaction(root, 11); + trans_num_items = 11; + if (flags & RENAME_WHITEOUT) + trans_num_items += 5; + trans = btrfs_start_transaction(root, trans_num_items); if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - goto out_notrans; - } + ret = PTR_ERR(trans); + goto out_notrans; + } if (dest != root) btrfs_record_root_in_trans(trans, dest); @@ -9471,6 +9794,8 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, /* force full log commit if subvolume involved. */ btrfs_set_log_full_commit(root->fs_info, trans); } else { + btrfs_pin_log_trans(root); + log_pinned = true; ret = btrfs_insert_inode_ref(trans, dest, new_dentry->d_name.name, new_dentry->d_name.len, @@ -9478,14 +9803,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, btrfs_ino(new_dir), index); if (ret) goto out_fail; - /* - * this is an ugly little race, but the rename is required - * to make sure that if we crash, the inode is either at the - * old name or the new one. pinning the log transaction lets - * us make sure we don't allow a log commit to come in after - * we unlink the name but before we add the new name back in. - */ - btrfs_pin_log_trans(root); } inode_inc_iversion(old_dir); @@ -9552,12 +9869,46 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (old_inode->i_nlink == 1) BTRFS_I(old_inode)->dir_index = index; - if (old_ino != BTRFS_FIRST_FREE_OBJECTID) { + if (log_pinned) { struct dentry *parent = new_dentry->d_parent; + btrfs_log_new_name(trans, old_inode, old_dir, parent); btrfs_end_log_trans(root); + log_pinned = false; + } + + if (flags & RENAME_WHITEOUT) { + ret = btrfs_whiteout_for_rename(trans, root, old_dir, + old_dentry); + + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out_fail; + } } out_fail: + /* + * If we have pinned the log and an error happened, we unpin tasks + * trying to sync the log and force them to fallback to a transaction + * commit if the log currently contains any of the inodes involved in + * this rename operation (to ensure we do not persist a log with an + * inconsistent state for any of these inodes or leading to any + * inconsistencies when replayed). If the transaction was aborted, the + * abortion reason is propagated to userspace when attempting to commit + * the transaction. If the log does not contain any of these inodes, we + * allow the tasks to sync it. 
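The item budgeting spelled out in the reservation comment above reduces to a small helper. A sketch of the arithmetic; the RENAME_WHITEOUT value matches the uapi constant, but the function itself is invented for illustration:

#include <stdio.h>

#define RENAME_WHITEOUT (1 << 2)   /* as in include/uapi/linux/fs.h */

static unsigned int rename_trans_items(unsigned int flags)
{
        /* 5 modifications per inode * 2 inodes + 1 for the new link */
        unsigned int items = 11;

        /* whiteout: 1 inode item + 1 inode ref + 2 dir items + 1 xattr */
        if (flags & RENAME_WHITEOUT)
                items += 5;
        return items;
}

int main(void)
{
        printf("plain rename:    %u items\n", rename_trans_items(0));
        printf("whiteout rename: %u items\n",
               rename_trans_items(RENAME_WHITEOUT));
        return 0;
}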
+ */ + if (ret && log_pinned) { + if (btrfs_inode_in_log(old_dir, root->fs_info->generation) || + btrfs_inode_in_log(new_dir, root->fs_info->generation) || + btrfs_inode_in_log(old_inode, root->fs_info->generation) || + (new_inode && + btrfs_inode_in_log(new_inode, root->fs_info->generation))) + btrfs_set_log_full_commit(root->fs_info, trans); + + btrfs_end_log_trans(root); + log_pinned = false; + } btrfs_end_transaction(trans, root); out_notrans: if (old_ino == BTRFS_FIRST_FREE_OBJECTID) @@ -9570,10 +9921,14 @@ static int btrfs_rename2(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { - if (flags & ~RENAME_NOREPLACE) + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) return -EINVAL; - return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry); + if (flags & RENAME_EXCHANGE) + return btrfs_rename_exchange(old_dir, old_dentry, new_dir, + new_dentry); + + return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry, flags); } static void btrfs_run_delalloc_work(struct btrfs_work *work) @@ -9942,6 +10297,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, btrfs_end_transaction(trans, root); break; } + btrfs_dec_block_group_reservations(root->fs_info, ins.objectid); last_alloc = ins.offset; ret = insert_reserved_file_extent(trans, inode, @@ -10181,10 +10537,10 @@ static const struct inode_operations btrfs_dir_ro_inode_operations = { static const struct file_operations btrfs_dir_file_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, - .iterate = btrfs_real_readdir, + .iterate_shared = btrfs_real_readdir, .unlocked_ioctl = btrfs_ioctl, #ifdef CONFIG_COMPAT - .compat_ioctl = btrfs_ioctl, + .compat_ioctl = btrfs_compat_ioctl, #endif .release = btrfs_release_file, .fsync = btrfs_sync_file, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 0b8ba717175b..05173563e4a6 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -125,10 +125,10 @@ static unsigned int btrfs_flags_to_ioctl(unsigned int flags) if (flags & BTRFS_INODE_NODATACOW) iflags |= FS_NOCOW_FL; - if ((flags & BTRFS_INODE_COMPRESS) && !(flags & BTRFS_INODE_NOCOMPRESS)) - iflags |= FS_COMPR_FL; - else if (flags & BTRFS_INODE_NOCOMPRESS) + if (flags & BTRFS_INODE_NOCOMPRESS) iflags |= FS_NOCOMP_FL; + else if (flags & BTRFS_INODE_COMPRESS) + iflags |= FS_COMPR_FL; return iflags; } @@ -296,7 +296,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) } } else { /* - * Revert back under same assuptions as above + * Revert back under same assumptions as above */ if (S_ISREG(mode)) { if (inode->i_size == 0) @@ -439,7 +439,7 @@ static noinline int create_subvol(struct inode *dir, { struct btrfs_trans_handle *trans; struct btrfs_key key; - struct btrfs_root_item root_item; + struct btrfs_root_item *root_item; struct btrfs_inode_item *inode_item; struct extent_buffer *leaf; struct btrfs_root *root = BTRFS_I(dir)->root; @@ -455,16 +455,22 @@ static noinline int create_subvol(struct inode *dir, u64 qgroup_reserved; uuid_le new_uuid; + root_item = kzalloc(sizeof(*root_item), GFP_KERNEL); + if (!root_item) + return -ENOMEM; + ret = btrfs_find_free_objectid(root->fs_info->tree_root, &objectid); if (ret) - return ret; + goto fail_free; /* * Don't create subvolume whose level is not zero. Or qgroup will be - * screwed up since it assume subvolme qgroup's level to be 0. + * screwed up since it assumes subvolume qgroup's level to be 0. 
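The create_subvol() change above moves its btrfs_root_item (a few hundred bytes) off the kernel stack onto the heap and funnels every early exit through a label that frees it. The shape of that refactor as a user-space skeleton; the struct size and the step function are stand-ins, not the real definitions:

#include <stdlib.h>

struct root_item_model { char payload[439]; };   /* size is an assumption */

static int find_free_objectid(unsigned long long *out)
{
        *out = 257;   /* placeholder; would seed the new root's key */
        return 0;
}

static int create_subvol_model(void)
{
        struct root_item_model *root_item;
        unsigned long long objectid;
        int ret;

        root_item = calloc(1, sizeof(*root_item));   /* was on the stack */
        if (!root_item)
                return -1;

        ret = find_free_objectid(&objectid);
        if (ret)
                goto fail_free;

        /* ... fill in root_item and insert it ... */

fail_free:
        free(root_item);   /* shared exit: freed on success and failure */
        return ret;
}

int main(void) { return create_subvol_model() ? 1 : 0; }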
*/ - if (btrfs_qgroup_level(objectid)) - return -ENOSPC; + if (btrfs_qgroup_level(objectid)) { + ret = -ENOSPC; + goto fail_free; + } btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP); /* @@ -474,14 +480,14 @@ static noinline int create_subvol(struct inode *dir, ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 8, &qgroup_reserved, false); if (ret) - return ret; + goto fail_free; trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved); - return ret; + goto fail_free; } trans->block_rsv = &block_rsv; trans->bytes_reserved = block_rsv.size; @@ -509,47 +515,45 @@ static noinline int create_subvol(struct inode *dir, BTRFS_UUID_SIZE); btrfs_mark_buffer_dirty(leaf); - memset(&root_item, 0, sizeof(root_item)); - - inode_item = &root_item.inode; + inode_item = &root_item->inode; btrfs_set_stack_inode_generation(inode_item, 1); btrfs_set_stack_inode_size(inode_item, 3); btrfs_set_stack_inode_nlink(inode_item, 1); btrfs_set_stack_inode_nbytes(inode_item, root->nodesize); btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755); - btrfs_set_root_flags(&root_item, 0); - btrfs_set_root_limit(&root_item, 0); + btrfs_set_root_flags(root_item, 0); + btrfs_set_root_limit(root_item, 0); btrfs_set_stack_inode_flags(inode_item, BTRFS_INODE_ROOT_ITEM_INIT); - btrfs_set_root_bytenr(&root_item, leaf->start); - btrfs_set_root_generation(&root_item, trans->transid); - btrfs_set_root_level(&root_item, 0); - btrfs_set_root_refs(&root_item, 1); - btrfs_set_root_used(&root_item, leaf->len); - btrfs_set_root_last_snapshot(&root_item, 0); + btrfs_set_root_bytenr(root_item, leaf->start); + btrfs_set_root_generation(root_item, trans->transid); + btrfs_set_root_level(root_item, 0); + btrfs_set_root_refs(root_item, 1); + btrfs_set_root_used(root_item, leaf->len); + btrfs_set_root_last_snapshot(root_item, 0); - btrfs_set_root_generation_v2(&root_item, - btrfs_root_generation(&root_item)); + btrfs_set_root_generation_v2(root_item, + btrfs_root_generation(root_item)); uuid_le_gen(&new_uuid); - memcpy(root_item.uuid, new_uuid.b, BTRFS_UUID_SIZE); - btrfs_set_stack_timespec_sec(&root_item.otime, cur_time.tv_sec); - btrfs_set_stack_timespec_nsec(&root_item.otime, cur_time.tv_nsec); - root_item.ctime = root_item.otime; - btrfs_set_root_ctransid(&root_item, trans->transid); - btrfs_set_root_otransid(&root_item, trans->transid); + memcpy(root_item->uuid, new_uuid.b, BTRFS_UUID_SIZE); + btrfs_set_stack_timespec_sec(&root_item->otime, cur_time.tv_sec); + btrfs_set_stack_timespec_nsec(&root_item->otime, cur_time.tv_nsec); + root_item->ctime = root_item->otime; + btrfs_set_root_ctransid(root_item, trans->transid); + btrfs_set_root_otransid(root_item, trans->transid); btrfs_tree_unlock(leaf); free_extent_buffer(leaf); leaf = NULL; - btrfs_set_root_dirid(&root_item, new_dirid); + btrfs_set_root_dirid(root_item, new_dirid); key.objectid = objectid; key.offset = 0; key.type = BTRFS_ROOT_ITEM_KEY; ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key, - &root_item); + root_item); if (ret) goto fail; @@ -601,12 +605,13 @@ static noinline int create_subvol(struct inode *dir, BUG_ON(ret); ret = btrfs_uuid_tree_add(trans, root->fs_info->uuid_root, - root_item.uuid, BTRFS_UUID_KEY_SUBVOL, + root_item->uuid, BTRFS_UUID_KEY_SUBVOL, objectid); if (ret) btrfs_abort_transaction(trans, root, ret); fail: + kfree(root_item); trans->block_rsv = NULL; trans->bytes_reserved = 0; btrfs_subvolume_release_metadata(root, &block_rsv, 
qgroup_reserved); @@ -629,6 +634,10 @@ fail: d_instantiate(dentry, inode); } return ret; + +fail_free: + kfree(root_item); + return ret; } static void btrfs_wait_for_no_snapshoting_writes(struct btrfs_root *root) @@ -681,7 +690,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, if (ret) goto dec_and_free; - btrfs_wait_ordered_extents(root, -1); + btrfs_wait_ordered_extents(root, -1, 0, (u64)-1); btrfs_init_block_rsv(&pending_snapshot->block_rsv, BTRFS_BLOCK_RSV_TEMP); @@ -771,7 +780,7 @@ free_pending: * a. be owner of dir, or * b. be owner of victim, or * c. have CAP_FOWNER capability - * 6. If the victim is append-only or immutable we can't do antyhing with + * 6. If the victim is append-only or immutable we can't do anything with * links pointing to it. * 7. If we were asked to remove a directory and victim isn't one - ENOTDIR. * 8. If we were asked to remove a non-directory and victim isn't one - EISDIR. @@ -837,11 +846,9 @@ static noinline int btrfs_mksubvol(struct path *parent, struct dentry *dentry; int error; - inode_lock_nested(dir, I_MUTEX_PARENT); - // XXX: should've been - // mutex_lock_killable_nested(&dir->i_mutex, I_MUTEX_PARENT); - // if (error == -EINTR) - // return error; + error = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT); + if (error == -EINTR) + return error; dentry = lookup_one_len(name, parent->dentry, namelen); error = PTR_ERR(dentry); @@ -1230,7 +1237,7 @@ again: set_extent_defrag(&BTRFS_I(inode)->io_tree, page_start, page_end - 1, - &cached_state, GFP_NOFS); + &cached_state); unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end - 1, &cached_state, @@ -2368,11 +2375,9 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, goto out; - inode_lock_nested(dir, I_MUTEX_PARENT); - // XXX: should've been - // err = mutex_lock_killable_nested(&dir->i_mutex, I_MUTEX_PARENT); - // if (err == -EINTR) - // goto out_drop_write; + err = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT); + if (err == -EINTR) + goto out_drop_write; dentry = lookup_one_len(vol_args->name, parent, namelen); if (IS_ERR(dentry)) { err = PTR_ERR(dentry); @@ -2562,7 +2567,7 @@ out_dput: dput(dentry); out_unlock_dir: inode_unlock(dir); -//out_drop_write: +out_drop_write: mnt_drop_write_file(file); out: kfree(vol_args); @@ -2671,10 +2676,10 @@ out: return ret; } -static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) +static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) { struct btrfs_root *root = BTRFS_I(file_inode(file))->root; - struct btrfs_ioctl_vol_args *vol_args; + struct btrfs_ioctl_vol_args_v2 *vol_args; int ret; if (!capable(CAP_SYS_ADMIN)) @@ -2690,7 +2695,9 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) goto err_drop; } - vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; + /* Check for compatibility reject unknown flags */ + if (vol_args->flags & ~BTRFS_VOL_ARG_V2_FLAGS_SUPPORTED) + return -EOPNOTSUPP; if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, 1)) { @@ -2699,13 +2706,23 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) } mutex_lock(&root->fs_info->volume_mutex); - ret = btrfs_rm_device(root, vol_args->name); + if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) { + ret = btrfs_rm_device(root, NULL, vol_args->devid); + } else { + vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0'; + ret = btrfs_rm_device(root, vol_args->name, 0); + } mutex_unlock(&root->fs_info->volume_mutex); 
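Both device-remove paths above funnel through the same single-runner guard: atomic_xchg() claims the mutually-exclusive-operation slot and atomic_set(0) releases it. The same idea in C11 atomics; the field and function names are modeled on the btrfs ones, not shared with them:

#include <stdatomic.h>
#include <stdio.h>

static atomic_int excl_op_running;

static int try_exclusive_op(void)
{
        /* nonzero exchange result means someone already holds the slot */
        if (atomic_exchange(&excl_op_running, 1))
                return -1;   /* cf. BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS */

        printf("removing device...\n");

        atomic_store(&excl_op_running, 0);
        return 0;
}

int main(void)
{
        atomic_exchange(&excl_op_running, 1);             /* simulate a racer */
        printf("while busy: %d\n", try_exclusive_op());   /* -1 */
        atomic_store(&excl_op_running, 0);
        printf("when free:  %d\n", try_exclusive_op());   /*  0 */
        return 0;
}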
atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); - if (!ret) - btrfs_info(root->fs_info, "disk deleted %s",vol_args->name); - + if (!ret) { + if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) + btrfs_info(root->fs_info, "device deleted: id %llu", + vol_args->devid); + else + btrfs_info(root->fs_info, "device deleted: %s", + vol_args->name); + } out: kfree(vol_args); err_drop: @@ -2713,6 +2730,47 @@ err_drop: return ret; } +static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) +{ + struct btrfs_root *root = BTRFS_I(file_inode(file))->root; + struct btrfs_ioctl_vol_args *vol_args; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + ret = mnt_want_write_file(file); + if (ret) + return ret; + + if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, + 1)) { + ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; + goto out_drop_write; + } + + vol_args = memdup_user(arg, sizeof(*vol_args)); + if (IS_ERR(vol_args)) { + ret = PTR_ERR(vol_args); + goto out; + } + + vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; + mutex_lock(&root->fs_info->volume_mutex); + ret = btrfs_rm_device(root, vol_args->name, 0); + mutex_unlock(&root->fs_info->volume_mutex); + + if (!ret) + btrfs_info(root->fs_info, "disk deleted %s",vol_args->name); + kfree(vol_args); +out: + atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); +out_drop_write: + mnt_drop_write_file(file); + + return ret; +} + static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg) { struct btrfs_ioctl_fs_info_args *fi_args; @@ -3472,13 +3530,16 @@ static int btrfs_clone(struct inode *src, struct inode *inode, u64 last_dest_end = destoff; ret = -ENOMEM; - buf = vmalloc(root->nodesize); - if (!buf) - return ret; + buf = kmalloc(root->nodesize, GFP_KERNEL | __GFP_NOWARN); + if (!buf) { + buf = vmalloc(root->nodesize); + if (!buf) + return ret; + } path = btrfs_alloc_path(); if (!path) { - vfree(buf); + kvfree(buf); return ret; } @@ -3779,7 +3840,7 @@ process_slot: out: btrfs_free_path(path); - vfree(buf); + kvfree(buf); return ret; } @@ -4380,7 +4441,7 @@ static long btrfs_ioctl_dev_replace(struct btrfs_root *root, void __user *arg) 1)) { ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; } else { - ret = btrfs_dev_replace_start(root, p); + ret = btrfs_dev_replace_by_ioctl(root, p); atomic_set( &root->fs_info->mutually_exclusive_operation_running, 0); @@ -4589,7 +4650,7 @@ again: } /* - * mut. excl. ops lock is locked. Three possibilites: + * mut. excl. ops lock is locked. 
Three possibilities: * (1) some other op is running * (2) balance is running * (3) balance is paused -- special case (think resume) @@ -4851,8 +4912,8 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg) /* update qgroup status and info */ err = btrfs_run_qgroups(trans, root->fs_info); if (err < 0) - btrfs_std_error(root->fs_info, ret, - "failed to update qgroup status and info\n"); + btrfs_handle_fs_error(root->fs_info, err, + "failed to update qgroup status and info"); err = btrfs_end_transaction(trans, root); if (err && !ret) ret = err; @@ -5398,9 +5459,15 @@ static int btrfs_ioctl_set_features(struct file *file, void __user *arg) if (ret) return ret; + ret = mnt_want_write_file(file); + if (ret) + return ret; + trans = btrfs_start_transaction(root, 0); - if (IS_ERR(trans)) - return PTR_ERR(trans); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out_drop_write; + } spin_lock(&root->fs_info->super_lock); newflags = btrfs_super_compat_flags(super_block); @@ -5419,7 +5486,11 @@ static int btrfs_ioctl_set_features(struct file *file, void __user *arg) btrfs_set_super_incompat_flags(super_block, newflags); spin_unlock(&root->fs_info->super_lock); - return btrfs_commit_transaction(trans, root); + ret = btrfs_commit_transaction(trans, root); +out_drop_write: + mnt_drop_write_file(file); + + return ret; } long btrfs_ioctl(struct file *file, unsigned int @@ -5463,6 +5534,8 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_add_dev(root, argp); case BTRFS_IOC_RM_DEV: return btrfs_ioctl_rm_dev(file, argp); + case BTRFS_IOC_RM_DEV_V2: + return btrfs_ioctl_rm_dev_v2(file, argp); case BTRFS_IOC_FS_INFO: return btrfs_ioctl_fs_info(root, argp); case BTRFS_IOC_DEV_INFO: @@ -5494,7 +5567,7 @@ long btrfs_ioctl(struct file *file, unsigned int ret = btrfs_sync_fs(file_inode(file)->i_sb, 1); /* * The transaction thread may want to do more work, - * namely it pokes the cleaner ktread that will start + * namely it pokes the cleaner kthread that will start * processing uncleaned subvols. */ wake_up_process(root->fs_info->transaction_kthread); @@ -5556,3 +5629,24 @@ long btrfs_ioctl(struct file *file, unsigned int return -ENOTTY; } + +#ifdef CONFIG_COMPAT +long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + switch (cmd) { + case FS_IOC32_GETFLAGS: + cmd = FS_IOC_GETFLAGS; + break; + case FS_IOC32_SETFLAGS: + cmd = FS_IOC_SETFLAGS; + break; + case FS_IOC32_GETVERSION: + cmd = FS_IOC_GETVERSION; + break; + default: + return -ENOIOCTLCMD; + } + + return btrfs_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); +} +#endif diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 0de7da5a610d..aca8264f4a49 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -661,14 +661,15 @@ static void btrfs_run_ordered_extent_work(struct btrfs_work *work) * wait for all the ordered extents in a root. This is done when balancing * space between drives. 
*/ -int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr) +int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr, + const u64 range_start, const u64 range_len) { - struct list_head splice, works; + LIST_HEAD(splice); + LIST_HEAD(skipped); + LIST_HEAD(works); struct btrfs_ordered_extent *ordered, *next; int count = 0; - - INIT_LIST_HEAD(&splice); - INIT_LIST_HEAD(&works); + const u64 range_end = range_start + range_len; mutex_lock(&root->ordered_extent_mutex); spin_lock(&root->ordered_extent_lock); @@ -676,6 +677,14 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr) while (!list_empty(&splice) && nr) { ordered = list_first_entry(&splice, struct btrfs_ordered_extent, root_extent_list); + + if (range_end <= ordered->start || + ordered->start + ordered->disk_len <= range_start) { + list_move_tail(&ordered->root_extent_list, &skipped); + cond_resched_lock(&root->ordered_extent_lock); + continue; + } + list_move_tail(&ordered->root_extent_list, &root->ordered_extents); atomic_inc(&ordered->refs); @@ -694,6 +703,7 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr) nr--; count++; } + list_splice_tail(&skipped, &root->ordered_extents); list_splice_tail(&splice, &root->ordered_extents); spin_unlock(&root->ordered_extent_lock); @@ -708,11 +718,13 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr) return count; } -void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr) +int btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr, + const u64 range_start, const u64 range_len) { struct btrfs_root *root; struct list_head splice; int done; + int total_done = 0; INIT_LIST_HEAD(&splice); @@ -728,8 +740,10 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr) &fs_info->ordered_roots); spin_unlock(&fs_info->ordered_root_lock); - done = btrfs_wait_ordered_extents(root, nr); + done = btrfs_wait_ordered_extents(root, nr, + range_start, range_len); btrfs_put_fs_root(root); + total_done += done; spin_lock(&fs_info->ordered_root_lock); if (nr != -1) { @@ -740,6 +754,8 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr) list_splice_tail(&splice, &fs_info->ordered_roots); spin_unlock(&fs_info->ordered_root_lock); mutex_unlock(&fs_info->ordered_operations_mutex); + + return total_done; } /* @@ -952,6 +968,7 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, struct rb_node *prev = NULL; struct btrfs_ordered_extent *test; int ret = 1; + u64 orig_offset = offset; spin_lock_irq(&tree->lock); if (ordered) { @@ -967,7 +984,7 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, /* truncate file */ if (disk_i_size > i_size) { - BTRFS_I(inode)->disk_i_size = i_size; + BTRFS_I(inode)->disk_i_size = orig_offset; ret = 0; goto out; } diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 23c96059cef2..451507776ff5 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -58,7 +58,7 @@ struct btrfs_ordered_sum { #define BTRFS_ORDERED_COMPRESSED 3 /* writing a zlib compressed extent */ -#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */ +#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to preallocated extent */ #define BTRFS_ORDERED_DIRECT 5 /* set when we're doing DIO with this extent */ @@ -197,8 +197,10 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, struct btrfs_ordered_extent *ordered); int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum, int len); -int 
btrfs_wait_ordered_extents(struct btrfs_root *root, int nr); -void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr); +int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr, + const u64 range_start, const u64 range_len); +int btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr, + const u64 range_start, const u64 range_len); void btrfs_get_logged_extents(struct inode *inode, struct list_head *logged_list, const loff_t start, diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 9e119552ed32..9d4c05b14f6e 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -85,7 +85,7 @@ struct btrfs_qgroup { /* * temp variables for accounting operations - * Refer to qgroup_shared_accouting() for details. + * Refer to qgroup_shared_accounting() for details. */ u64 old_refcnt; u64 new_refcnt; @@ -499,7 +499,7 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info) } /* * we call btrfs_free_qgroup_config() when umounting - * filesystem and disabling quota, so we set qgroup_ulit + * filesystem and disabling quota, so we set qgroup_ulist * to be null here to avoid double free. */ ulist_free(fs_info->qgroup_ulist); @@ -1036,7 +1036,7 @@ static void qgroup_dirty(struct btrfs_fs_info *fs_info, /* * The easy accounting, if we are adding/removing the only ref for an extent - * then this qgroup and all of the parent qgroups get their refrence and + * then this qgroup and all of the parent qgroups get their reference and * exclusive counts adjusted. * * Caller should hold fs_info->qgroup_lock. @@ -1436,7 +1436,7 @@ int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans, /* * No need to do lock, since this function will only be called in - * btrfs_commmit_transaction(). + * btrfs_commit_transaction(). */ node = rb_first(&delayed_refs->dirty_extent_root); while (node) { @@ -1557,7 +1557,7 @@ static int qgroup_update_refcnt(struct btrfs_fs_info *fs_info, * A: cur_old_roots < nr_old_roots (not exclusive before) * !A: cur_old_roots == nr_old_roots (possible exclusive before) * B: cur_new_roots < nr_new_roots (not exclusive now) - * !B: cur_new_roots == nr_new_roots (possible exclsuive now) + * !B: cur_new_roots == nr_new_roots (possible exclusive now) * * Results: * +: Possible sharing -> exclusive -: Possible exclusive -> sharing @@ -1851,7 +1851,7 @@ out: } /* - * Copy the acounting information between qgroups. This is necessary + * Copy the accounting information between qgroups. This is necessary * when a snapshot or a subvolume is created. Throwing an error will * cause a transaction abort so we take extra care here to only error * when a readonly fs is a reasonable outcome. @@ -2340,7 +2340,7 @@ out: mutex_unlock(&fs_info->qgroup_rescan_lock); /* - * only update status, since the previous part has alreay updated the + * only update status, since the previous part has already updated the * qgroup info. 
*/ trans = btrfs_start_transaction(fs_info->quota_root, 1); @@ -2542,8 +2542,7 @@ int btrfs_qgroup_reserve_data(struct inode *inode, u64 start, u64 len) changeset.bytes_changed = 0; changeset.range_changed = ulist_alloc(GFP_NOFS); ret = set_record_extent_bits(&BTRFS_I(inode)->io_tree, start, - start + len -1, EXTENT_QGROUP_RESERVED, GFP_NOFS, - &changeset); + start + len -1, EXTENT_QGROUP_RESERVED, &changeset); trace_btrfs_qgroup_reserve_data(inode, start, len, changeset.bytes_changed, QGROUP_RESERVE); @@ -2580,8 +2579,7 @@ static int __btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len, return -ENOMEM; ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, start, - start + len -1, EXTENT_QGROUP_RESERVED, GFP_NOFS, - &changeset); + start + len -1, EXTENT_QGROUP_RESERVED, &changeset); if (ret < 0) goto out; @@ -2672,7 +2670,7 @@ void btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes) } /* - * Check qgroup reserved space leaking, normally at destory inode + * Check qgroup reserved space leaking, normally at destroy inode * time */ void btrfs_qgroup_check_reserved_leak(struct inode *inode) @@ -2688,7 +2686,7 @@ void btrfs_qgroup_check_reserved_leak(struct inode *inode) return; ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, 0, (u64)-1, - EXTENT_QGROUP_RESERVED, GFP_NOFS, &changeset); + EXTENT_QGROUP_RESERVED, &changeset); WARN_ON(ret < 0); if (WARN_ON(changeset.bytes_changed)) { diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 0b7792e02dd5..cd8d302a1f61 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -576,7 +576,7 @@ static int rbio_can_merge(struct btrfs_raid_bio *last, * we can't merge with cached rbios, since the * idea is that when we merge the destination * rbio is going to run our IO for us. We can - * steal from cached rbio's though, other functions + * steal from cached rbios though, other functions * handle that. 
*/ if (test_bit(RBIO_CACHE_BIT, &last->flags) || @@ -1320,7 +1320,9 @@ write_data: bio->bi_private = rbio; bio->bi_end_io = raid_write_end_io; - submit_bio(WRITE, bio); + bio_set_op_attrs(bio, REQ_OP_WRITE, 0); + + submit_bio(bio); } return; @@ -1573,11 +1575,12 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) bio->bi_private = rbio; bio->bi_end_io = raid_rmw_end_io; + bio_set_op_attrs(bio, REQ_OP_READ, 0); btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); - submit_bio(READ, bio); + submit_bio(bio); } /* the actual write will happen once the reads are done */ return 0; @@ -2097,11 +2100,12 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) bio->bi_private = rbio; bio->bi_end_io = raid_recover_end_io; + bio_set_op_attrs(bio, REQ_OP_READ, 0); btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); - submit_bio(READ, bio); + submit_bio(bio); } out: return 0; @@ -2368,7 +2372,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, run_xor(pointers + 1, nr_data - 1, PAGE_SIZE); } - /* Check scrubbing pairty and repair it */ + /* Check scrubbing parity and repair it */ p = rbio_stripe_page(rbio, rbio->scrubp, pagenr); parity = kmap(p); if (memcmp(parity, pointers[rbio->scrubp], PAGE_SIZE)) @@ -2433,7 +2437,9 @@ submit_write: bio->bi_private = rbio; bio->bi_end_io = raid_write_end_io; - submit_bio(WRITE, bio); + bio_set_op_attrs(bio, REQ_OP_WRITE, 0); + + submit_bio(bio); } return; @@ -2493,7 +2499,7 @@ static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio) /* * Here means we got one corrupted data stripe and one * corrupted parity on RAID6, if the corrupted parity - * is scrubbing parity, luckly, use the other one to repair + * is scrubbing parity, luckily, use the other one to repair * the data, or we can not repair the data stripe. */ if (failp != rbio->scrubp) @@ -2610,11 +2616,12 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) bio->bi_private = rbio; bio->bi_end_io = raid56_parity_scrub_end_io; + bio_set_op_attrs(bio, REQ_OP_READ, 0); btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); - submit_bio(READ, bio); + submit_bio(bio); } /* the actual write will happen once the reads are done */ return; diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 298631eaee78..8428db7cd88f 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -761,12 +761,14 @@ static void __reada_start_machine(struct btrfs_fs_info *fs_info) do { enqueued = 0; + mutex_lock(&fs_devices->device_list_mutex); list_for_each_entry(device, &fs_devices->devices, dev_list) { if (atomic_read(&device->reada_in_flight) < MAX_IN_FLIGHT) enqueued += reada_start_machine_dev(fs_info, device); } + mutex_unlock(&fs_devices->device_list_mutex); total += enqueued; } while (enqueued && total < 10000); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 08ef890deca6..0477dca154ed 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -668,8 +668,8 @@ int find_inline_backref(struct extent_buffer *leaf, int slot, * roots of b-trees that reference the tree block. * * the basic idea of this function is check backrefs of a given block - * to find upper level blocks that refernece the block, and then check - * bakcrefs of these upper level blocks recursively. the recursion stop + * to find upper level blocks that reference the block, and then check + * backrefs of these upper level blocks recursively. the recursion stop * when tree root is reached or backrefs for the block is cached. 
* * NOTE: if we find backrefs for a block are cached, we know backrefs @@ -1160,7 +1160,7 @@ out: if (!RB_EMPTY_NODE(&upper->rb_node)) continue; - /* Add this guy's upper edges to the list to proces */ + /* Add this guy's upper edges to the list to process */ list_for_each_entry(edge, &upper->upper, list[LOWER]) list_add_tail(&edge->list[UPPER], &list); if (list_empty(&upper->upper)) @@ -2396,7 +2396,7 @@ again: } /* - * we keep the old last snapshod transid in rtranid when we + * we keep the old last snapshot transid in rtranid when we * created the relocation tree. */ last_snap = btrfs_root_rtransid(&reloc_root->root_item); @@ -2418,7 +2418,7 @@ again: } out: if (ret) { - btrfs_std_error(root->fs_info, ret, NULL); + btrfs_handle_fs_error(root->fs_info, ret, NULL); if (!list_empty(&reloc_roots)) free_reloc_roots(&reloc_roots); @@ -2616,7 +2616,7 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans, * only one thread can access block_rsv at this point, * so we don't need hold lock to protect block_rsv. * we expand more reservation size here to allow enough - * space for relocation and we will return eailer in + * space for relocation and we will return earlier in * enospc case. */ rc->block_rsv->size = tmp + rc->extent_root->nodesize * @@ -2814,7 +2814,7 @@ static void mark_block_processed(struct reloc_control *rc, u64 bytenr, u32 blocksize) { set_extent_bits(&rc->processed_blocks, bytenr, bytenr + blocksize - 1, - EXTENT_DIRTY, GFP_NOFS); + EXTENT_DIRTY); } static void __mark_block_processed(struct reloc_control *rc, @@ -3182,7 +3182,7 @@ static int relocate_file_extent_cluster(struct inode *inode, page_start + offset == cluster->boundary[nr]) { set_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, - EXTENT_BOUNDARY, GFP_NOFS); + EXTENT_BOUNDARY); nr++; } @@ -4059,8 +4059,7 @@ restart: } btrfs_release_path(path); - clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY, - GFP_NOFS); + clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY); if (trans) { btrfs_end_transaction_throttle(trans, rc->extent_root); @@ -4254,12 +4253,11 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) btrfs_info(extent_root->fs_info, "relocating block group %llu flags %llu", rc->block_group->key.objectid, rc->block_group->flags); - ret = btrfs_start_delalloc_roots(fs_info, 0, -1); - if (ret < 0) { - err = ret; - goto out; - } - btrfs_wait_ordered_roots(fs_info, -1); + btrfs_wait_block_group_reservations(rc->block_group); + btrfs_wait_nocow_writers(rc->block_group); + btrfs_wait_ordered_roots(fs_info, -1, + rc->block_group->key.objectid, + rc->block_group->key.offset); while (1) { mutex_lock(&fs_info->cleaner_mutex); @@ -4592,7 +4590,7 @@ int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, /* * called before creating snapshot. 
it calculates metadata reservation - * requried for relocating tree blocks in the snapshot + * required for relocating tree blocks in the snapshot */ void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending, u64 *bytes_to_reserve) diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 9fcd6dfc3266..f1c30861d062 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -71,9 +71,9 @@ static void btrfs_read_root_item(struct extent_buffer *eb, int slot, * search_key: the key to search * path: the path we search * root_item: the root item of the tree we look for - * root_key: the reak key of the tree we look for + * root_key: the root key of the tree we look for * - * If ->offset of 'seach_key' is -1ULL, it means we are not sure the offset + * If ->offset of 'search_key' is -1ULL, it means we are not sure the offset * of the search key, just lookup the root with the highest offset for a * given objectid. * @@ -284,7 +284,7 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root) trans = btrfs_join_transaction(tree_root); if (IS_ERR(trans)) { err = PTR_ERR(trans); - btrfs_std_error(tree_root->fs_info, err, + btrfs_handle_fs_error(tree_root->fs_info, err, "Failed to start trans to delete " "orphan item"); break; @@ -293,7 +293,7 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root) root_key.objectid); btrfs_end_transaction(trans, tree_root); if (err) { - btrfs_std_error(tree_root->fs_info, err, + btrfs_handle_fs_error(tree_root->fs_info, err, "Failed to delete root orphan " "item"); break; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 4678f03e878e..e08b6bc676e3 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -745,7 +745,7 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx) * sure we read the bad mirror. */ ret = set_extent_bits(&BTRFS_I(inode)->io_tree, offset, end, - EXTENT_DAMAGED, GFP_NOFS); + EXTENT_DAMAGED); if (ret) { /* set_extent_bits should give proper error */ WARN_ON(ret > 0); @@ -763,7 +763,7 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx) end, EXTENT_DAMAGED, 0, NULL); if (!corrected) clear_extent_bits(&BTRFS_I(inode)->io_tree, offset, end, - EXTENT_DAMAGED, GFP_NOFS); + EXTENT_DAMAGED); } out: @@ -1044,7 +1044,7 @@ nodatasum_case: /* * !is_metadata and !have_csum, this means that the data - * might not be COW'ed, that it might be modified + * might not be COWed, that it might be modified * concurrently. The general strategy to work on the * commit root does not help in the case when COW is not * used. @@ -1125,7 +1125,7 @@ nodatasum_case: * the 2nd page of mirror #1 faces I/O errors, and the 2nd page * of mirror #2 is readable but the final checksum test fails, * then the 2nd page of mirror #3 could be tried, whether now - * the final checksum succeedes. But this would be a rare + * the final checksum succeeds. But this would be a rare * exception and is therefore not implemented. At least it is * avoided that the good copy is overwritten. 
* A more useful improvement would be to pick the sectors @@ -1350,7 +1350,7 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock, recover->bbio = bbio; recover->map_length = mapped_length; - BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO); + BUG_ON(page_index >= SCRUB_MAX_PAGES_PER_BLOCK); nmirrors = min(scrub_nr_raid_mirrors(bbio), BTRFS_MAX_MIRRORS); @@ -1504,8 +1504,9 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info, sblock->no_io_error_seen = 0; } else { bio->bi_iter.bi_sector = page->physical >> 9; + bio_set_op_attrs(bio, REQ_OP_READ, 0); - if (btrfsic_submit_bio_wait(READ, bio)) + if (btrfsic_submit_bio_wait(bio)) sblock->no_io_error_seen = 0; } @@ -1583,6 +1584,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, return -EIO; bio->bi_bdev = page_bad->dev->bdev; bio->bi_iter.bi_sector = page_bad->physical >> 9; + bio_set_op_attrs(bio, REQ_OP_WRITE, 0); ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0); if (PAGE_SIZE != ret) { @@ -1590,7 +1592,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, return -EIO; } - if (btrfsic_submit_bio_wait(WRITE, bio)) { + if (btrfsic_submit_bio_wait(bio)) { btrfs_dev_stat_inc_and_print(page_bad->dev, BTRFS_DEV_STAT_WRITE_ERRS); btrfs_dev_replace_stats_inc( @@ -1684,6 +1686,7 @@ again: bio->bi_end_io = scrub_wr_bio_end_io; bio->bi_bdev = sbio->dev->bdev; bio->bi_iter.bi_sector = sbio->physical >> 9; + bio_set_op_attrs(bio, REQ_OP_WRITE, 0); sbio->err = 0; } else if (sbio->physical + sbio->page_count * PAGE_SIZE != spage->physical_for_dev_replace || @@ -1731,7 +1734,7 @@ static void scrub_wr_submit(struct scrub_ctx *sctx) * orders the requests before sending them to the driver which * doubled the write performance on spinning disks when measured * with Linux 3.5 */ - btrfsic_submit_bio(WRITE, sbio->bio); + btrfsic_submit_bio(sbio->bio); } static void scrub_wr_bio_end_io(struct bio *bio) @@ -2041,7 +2044,7 @@ static void scrub_submit(struct scrub_ctx *sctx) sbio = sctx->bios[sctx->curr]; sctx->curr = -1; scrub_pending_bio_inc(sctx); - btrfsic_submit_bio(READ, sbio->bio); + btrfsic_submit_bio(sbio->bio); } static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx, @@ -2088,6 +2091,7 @@ again: bio->bi_end_io = scrub_bio_end_io; bio->bi_bdev = sbio->dev->bdev; bio->bi_iter.bi_sector = sbio->physical >> 9; + bio_set_op_attrs(bio, REQ_OP_READ, 0); sbio->err = 0; } else if (sbio->physical + sbio->page_count * PAGE_SIZE != spage->physical || @@ -2127,6 +2131,8 @@ static void scrub_missing_raid56_end_io(struct bio *bio) if (bio->bi_error) sblock->no_io_error_seen = 0; + bio_put(bio); + btrfs_queue_work(fs_info->scrub_workers, &sblock->work); } @@ -2179,7 +2185,7 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock) struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; u64 length = sblock->page_count * PAGE_SIZE; u64 logical = sblock->pagev[0]->logical; - struct btrfs_bio *bbio; + struct btrfs_bio *bbio = NULL; struct bio *bio; struct btrfs_raid_bio *rbio; int ret; @@ -2860,7 +2866,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, int extent_mirror_num; int stop_loop = 0; - nsectors = map->stripe_len / root->sectorsize; + nsectors = div_u64(map->stripe_len, root->sectorsize); bitmap_len = scrub_calc_parity_bitmap_len(nsectors); sparity = kzalloc(sizeof(struct scrub_parity) + 2 * bitmap_len, GFP_NOFS); @@ -2980,6 +2986,7 @@ again: extent_len); mapped_length = extent_len; + bbio = NULL; ret = btrfs_map_block(fs_info, 
READ, extent_logical, &mapped_length, &bbio, 0); if (!ret) { @@ -3070,7 +3077,6 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, int slot; u64 nstripes; struct extent_buffer *l; - struct btrfs_key key; u64 physical; u64 logical; u64 logic_end; @@ -3079,7 +3085,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, int mirror_num; struct reada_control *reada1; struct reada_control *reada2; - struct btrfs_key key_start; + struct btrfs_key key; struct btrfs_key key_end; u64 increment = map->stripe_len; u64 offset; @@ -3158,21 +3164,21 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, scrub_blocked_if_needed(fs_info); /* FIXME it might be better to start readahead at commit root */ - key_start.objectid = logical; - key_start.type = BTRFS_EXTENT_ITEM_KEY; - key_start.offset = (u64)0; + key.objectid = logical; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = (u64)0; key_end.objectid = logic_end; key_end.type = BTRFS_METADATA_ITEM_KEY; key_end.offset = (u64)-1; - reada1 = btrfs_reada_add(root, &key_start, &key_end); + reada1 = btrfs_reada_add(root, &key, &key_end); - key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID; - key_start.type = BTRFS_EXTENT_CSUM_KEY; - key_start.offset = logical; + key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; + key.type = BTRFS_EXTENT_CSUM_KEY; + key.offset = logical; key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID; key_end.type = BTRFS_EXTENT_CSUM_KEY; key_end.offset = logic_end; - reada2 = btrfs_reada_add(csum_root, &key_start, &key_end); + reada2 = btrfs_reada_add(csum_root, &key, &key_end); if (!IS_ERR(reada1)) btrfs_reada_wait(reada1); @@ -3580,6 +3586,46 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, */ scrub_pause_on(fs_info); ret = btrfs_inc_block_group_ro(root, cache); + if (!ret && is_dev_replace) { + /* + * If we are doing a device replace wait for any tasks + * that started dellaloc right before we set the block + * group to RO mode, as they might have just allocated + * an extent from it or decided they could do a nocow + * write. And if any such tasks did that, wait for their + * ordered extents to complete and then commit the + * current transaction, so that we can later see the new + * extent items in the extent tree - the ordered extents + * create delayed data references (for cow writes) when + * they complete, which will be run and insert the + * corresponding extent items into the extent tree when + * we commit the transaction they used when running + * inode.c:btrfs_finish_ordered_io(). We later use + * the commit root of the extent tree to find extents + * to copy from the srcdev into the tgtdev, and we don't + * want to miss any new extents. 
+ */ + btrfs_wait_block_group_reservations(cache); + btrfs_wait_nocow_writers(cache); + ret = btrfs_wait_ordered_roots(fs_info, -1, + cache->key.objectid, + cache->key.offset); + if (ret > 0) { + struct btrfs_trans_handle *trans; + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) + ret = PTR_ERR(trans); + else + ret = btrfs_commit_transaction(trans, + root); + if (ret) { + scrub_pause_off(fs_info); + btrfs_put_block_group(cache); + break; + } + } + } scrub_pause_off(fs_info); if (ret == 0) { @@ -3600,9 +3646,11 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, break; } + btrfs_dev_replace_lock(&fs_info->dev_replace, 1); dev_replace->cursor_right = found_key.offset + length; dev_replace->cursor_left = found_key.offset; dev_replace->item_needs_writeback = 1; + btrfs_dev_replace_unlock(&fs_info->dev_replace, 1); ret = scrub_chunk(sctx, scrub_dev, chunk_offset, length, found_key.offset, cache, is_dev_replace); @@ -3638,6 +3686,11 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, scrub_pause_off(fs_info); + btrfs_dev_replace_lock(&fs_info->dev_replace, 1); + dev_replace->cursor_left = dev_replace->cursor_right; + dev_replace->item_needs_writeback = 1; + btrfs_dev_replace_unlock(&fs_info->dev_replace, 1); + if (ro_set) btrfs_dec_block_group_ro(root, cache); @@ -3675,9 +3728,6 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, ret = -ENOMEM; break; } - - dev_replace->cursor_left = dev_replace->cursor_right; - dev_replace->item_needs_writeback = 1; skip: key.offset = found_key.offset + length; btrfs_release_path(path); @@ -4390,6 +4440,7 @@ static int write_page_nocow(struct scrub_ctx *sctx, bio->bi_iter.bi_size = 0; bio->bi_iter.bi_sector = physical_for_dev_replace >> 9; bio->bi_bdev = dev->bdev; + bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_SYNC); ret = bio_add_page(bio, page, PAGE_SIZE, 0); if (ret != PAGE_SIZE) { leave_with_eio: @@ -4398,7 +4449,7 @@ leave_with_eio: return -EIO; } - if (btrfsic_submit_bio_wait(WRITE_SYNC, bio)) + if (btrfsic_submit_bio_wait(bio)) goto leave_with_eio; bio_put(bio); diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 8d358c547c59..b71dd298385c 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1831,7 +1831,7 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen, /* * If we have a parent root we need to verify that the parent dir was - * not delted and then re-created, if it was then we have no overwrite + * not deleted and then re-created, if it was then we have no overwrite * and we can just unlink this entry. */ if (sctx->parent_root) { @@ -4192,9 +4192,9 @@ static int __process_new_xattr(int num, struct btrfs_key *di_key, return -ENOMEM; /* - * This hack is needed because empty acl's are stored as zero byte + * This hack is needed because empty acls are stored as zero byte * data in xattrs. Problem with that is, that receiving these zero byte - * acl's will fail later. To fix this, we send a dummy acl list that + * acls will fail later. To fix this, we send a dummy acl list that * only contains the version number and no entries. 
*/ if (!strncmp(name, XATTR_NAME_POSIX_ACL_ACCESS, name_len) || @@ -5939,6 +5939,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) u32 i; u64 *clone_sources_tmp = NULL; int clone_sources_to_rollback = 0; + unsigned alloc_size; int sort_clone_roots = 0; int index; @@ -5978,6 +5979,12 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) goto out; } + if (arg->clone_sources_count > + ULLONG_MAX / sizeof(*arg->clone_sources)) { + ret = -EINVAL; + goto out; + } + if (!access_ok(VERIFY_READ, arg->clone_sources, sizeof(*arg->clone_sources) * arg->clone_sources_count)) { @@ -6022,40 +6029,53 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) sctx->clone_roots_cnt = arg->clone_sources_count; sctx->send_max_size = BTRFS_SEND_BUF_SIZE; - sctx->send_buf = vmalloc(sctx->send_max_size); + sctx->send_buf = kmalloc(sctx->send_max_size, GFP_KERNEL | __GFP_NOWARN); if (!sctx->send_buf) { - ret = -ENOMEM; - goto out; + sctx->send_buf = vmalloc(sctx->send_max_size); + if (!sctx->send_buf) { + ret = -ENOMEM; + goto out; + } } - sctx->read_buf = vmalloc(BTRFS_SEND_READ_SIZE); + sctx->read_buf = kmalloc(BTRFS_SEND_READ_SIZE, GFP_KERNEL | __GFP_NOWARN); if (!sctx->read_buf) { - ret = -ENOMEM; - goto out; + sctx->read_buf = vmalloc(BTRFS_SEND_READ_SIZE); + if (!sctx->read_buf) { + ret = -ENOMEM; + goto out; + } } sctx->pending_dir_moves = RB_ROOT; sctx->waiting_dir_moves = RB_ROOT; sctx->orphan_dirs = RB_ROOT; - sctx->clone_roots = vzalloc(sizeof(struct clone_root) * - (arg->clone_sources_count + 1)); + alloc_size = sizeof(struct clone_root) * (arg->clone_sources_count + 1); + + sctx->clone_roots = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN); if (!sctx->clone_roots) { - ret = -ENOMEM; - goto out; + sctx->clone_roots = vzalloc(alloc_size); + if (!sctx->clone_roots) { + ret = -ENOMEM; + goto out; + } } + alloc_size = arg->clone_sources_count * sizeof(*arg->clone_sources); + if (arg->clone_sources_count) { - clone_sources_tmp = vmalloc(arg->clone_sources_count * - sizeof(*arg->clone_sources)); + clone_sources_tmp = kmalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN); if (!clone_sources_tmp) { - ret = -ENOMEM; - goto out; + clone_sources_tmp = vmalloc(alloc_size); + if (!clone_sources_tmp) { + ret = -ENOMEM; + goto out; + } } ret = copy_from_user(clone_sources_tmp, arg->clone_sources, - arg->clone_sources_count * - sizeof(*arg->clone_sources)); + alloc_size); if (ret) { ret = -EFAULT; goto out; @@ -6089,7 +6109,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) sctx->clone_roots[i].root = clone_root; clone_sources_to_rollback = i + 1; } - vfree(clone_sources_tmp); + kvfree(clone_sources_tmp); clone_sources_tmp = NULL; } @@ -6207,15 +6227,15 @@ out: btrfs_root_dec_send_in_progress(sctx->parent_root); kfree(arg); - vfree(clone_sources_tmp); + kvfree(clone_sources_tmp); if (sctx) { if (sctx->send_filp) fput(sctx->send_filp); - vfree(sctx->clone_roots); - vfree(sctx->send_buf); - vfree(sctx->read_buf); + kvfree(sctx->clone_roots); + kvfree(sctx->send_buf); + kvfree(sctx->read_buf); name_cache_free(sctx); diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c index e05619f241be..875c757e73e2 100644 --- a/fs/btrfs/struct-funcs.c +++ b/fs/btrfs/struct-funcs.c @@ -36,7 +36,7 @@ static inline void put_unaligned_le8(u8 val, void *p) * * The end result is that anyone who #includes ctree.h gets a * declaration for the btrfs_set_foo functions and btrfs_foo functions, - * which are wappers of btrfs_set_token_#bits functions and + * which are wrappers of 
btrfs_set_token_#bits functions and * btrfs_get_token_#bits functions, which are defined in this file. * * These setget functions do all the extent_buffer related mapping diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 00b8f37cc306..60e7179ed4b7 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -97,15 +97,6 @@ const char *btrfs_decode_error(int errno) return errstr; } -static void save_error_info(struct btrfs_fs_info *fs_info) -{ - /* - * today we only save the error info into ram. Long term we'll - * also send it down to the disk - */ - set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state); -} - /* btrfs handle error by forcing the filesystem readonly */ static void btrfs_handle_error(struct btrfs_fs_info *fs_info) { @@ -121,7 +112,7 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info) * Note that a running device replace operation is not * canceled here although there is no way to update * the progress. It would add the risk of a deadlock, - * therefore the canceling is ommited. The only penalty + * therefore the canceling is omitted. The only penalty * is that some I/O remains active until the procedure * completes. The next time when the filesystem is * mounted writeable again, the device replace @@ -131,11 +122,11 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info) } /* - * __btrfs_std_error decodes expected errors from the caller and + * __btrfs_handle_fs_error decodes expected errors from the caller and * invokes the approciate error response. */ __cold -void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, +void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function, unsigned int line, int errno, const char *fmt, ...) { struct super_block *sb = fs_info->sb; @@ -170,8 +161,13 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, } #endif + /* + * Today we only save the error info to memory. Long term we'll + * also send it down to the disk + */ + set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state); + /* Don't go through full error handling during mount */ - save_error_info(fs_info); if (sb->s_flags & MS_BORN) btrfs_handle_error(fs_info); } @@ -239,7 +235,7 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, trans->aborted = errno; /* Nothing used. The other threads that have joined this * transaction may be able to continue. 
*/ - if (!trans->blocks_used && list_empty(&trans->new_bgs)) { + if (!trans->dirty && list_empty(&trans->new_bgs)) { const char *errstr; errstr = btrfs_decode_error(errno); @@ -252,7 +248,7 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, /* Wake up anybody who may be waiting on this transaction */ wake_up(&root->fs_info->transaction_wait); wake_up(&root->fs_info->transaction_blocked_wait); - __btrfs_std_error(root->fs_info, function, line, errno, NULL); + __btrfs_handle_fs_error(root->fs_info, function, line, errno, NULL); } /* * __btrfs_panic decodes unexpected, fatal errors from the caller, @@ -1160,7 +1156,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait) return 0; } - btrfs_wait_ordered_roots(fs_info, -1); + btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1); trans = btrfs_attach_transaction_barrier(root); if (IS_ERR(trans)) { @@ -1488,10 +1484,10 @@ static int setup_security_options(struct btrfs_fs_info *fs_info, memcpy(&fs_info->security_opts, sec_opts, sizeof(*sec_opts)); } else { /* - * Since SELinux(the only one supports security_mnt_opts) does - * NOT support changing context during remount/mount same sb, - * This must be the same or part of the same security options, - * just free it. + * Since SELinux (the only one supporting security_mnt_opts) + * does NOT support changing context during remount/mount of + * the same sb, this must be the same or part of the same + * security options, just free it. */ security_free_mnt_opts(sec_opts); } @@ -1669,8 +1665,8 @@ static inline void btrfs_remount_cleanup(struct btrfs_fs_info *fs_info, unsigned long old_opts) { /* - * We need cleanup all defragable inodes if the autodefragment is - * close or the fs is R/O. + * We need to cleanup all defragable inodes if the autodefragment is + * close or the filesystem is read only. */ if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) && (!btrfs_raw_test_opt(fs_info->mount_opt, AUTO_DEFRAG) || @@ -1811,6 +1807,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) } } sb->s_flags &= ~MS_RDONLY; + + fs_info->open = 1; } out: wake_up_process(fs_info->transaction_kthread); @@ -1881,7 +1879,7 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) int ret; /* - * We aren't under the device list lock, so this is racey-ish, but good + * We aren't under the device list lock, so this is racy-ish, but good * enough for our purposes. */ nr_devices = fs_info->fs_devices->open_devices; @@ -1900,7 +1898,7 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) if (!devices_info) return -ENOMEM; - /* calc min stripe number for data space alloction */ + /* calc min stripe number for data space allocation */ type = btrfs_get_alloc_profile(root, 1); if (type & BTRFS_BLOCK_GROUP_RAID0) { min_stripes = 2; @@ -1936,7 +1934,7 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) avail_space *= BTRFS_STRIPE_LEN; /* - * In order to avoid overwritting the superblock on the drive, + * In order to avoid overwriting the superblock on the drive, * btrfs starts at an offset of at least 1MB when doing chunk * allocation. 
*/ @@ -2051,9 +2049,10 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; int ret; u64 thresh = 0; + int mixed = 0; /* - * holding chunk_muext to avoid allocating new chunks, holding + * holding chunk_mutex to avoid allocating new chunks, holding * device_list_mutex to avoid the device being removed */ rcu_read_lock(); @@ -2076,8 +2075,17 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) } } } - if (found->flags & BTRFS_BLOCK_GROUP_METADATA) - total_free_meta += found->disk_total - found->disk_used; + + /* + * Metadata in mixed block goup profiles are accounted in data + */ + if (!mixed && found->flags & BTRFS_BLOCK_GROUP_METADATA) { + if (found->flags & BTRFS_BLOCK_GROUP_DATA) + mixed = 1; + else + total_free_meta += found->disk_total - + found->disk_used; + } total_used += found->disk_used; } @@ -2090,7 +2098,11 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) /* Account global block reserve as used, it's in logical size already */ spin_lock(&block_rsv->lock); - buf->f_bfree -= block_rsv->size >> bits; + /* Mixed block groups accounting is not byte-accurate, avoid overflow */ + if (buf->f_bfree >= block_rsv->size >> bits) + buf->f_bfree -= block_rsv->size >> bits; + else + buf->f_bfree = 0; spin_unlock(&block_rsv->lock); buf->f_bavail = div_u64(total_free_data, factor); @@ -2115,7 +2127,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) */ thresh = 4 * 1024 * 1024; - if (total_free_meta - thresh < block_rsv->size) + if (!mixed && total_free_meta - thresh < block_rsv->size) buf->f_bavail = 0; buf->f_type = BTRFS_SUPER_MAGIC; @@ -2293,7 +2305,7 @@ static void btrfs_interface_exit(void) static void btrfs_print_mod_info(void) { - printk(KERN_INFO "Btrfs loaded" + printk(KERN_INFO "Btrfs loaded, crc32c=%s" #ifdef CONFIG_BTRFS_DEBUG ", debug=on" #endif @@ -2303,33 +2315,48 @@ static void btrfs_print_mod_info(void) #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY ", integrity-checker=on" #endif - "\n"); + "\n", + btrfs_crc32c_impl()); } static int btrfs_run_sanity_tests(void) { - int ret; - + int ret, i; + u32 sectorsize, nodesize; + u32 test_sectorsize[] = { + PAGE_SIZE, + }; ret = btrfs_init_test_fs(); if (ret) return ret; - - ret = btrfs_test_free_space_cache(); - if (ret) - goto out; - ret = btrfs_test_extent_buffer_operations(); - if (ret) - goto out; - ret = btrfs_test_extent_io(); - if (ret) - goto out; - ret = btrfs_test_inodes(); - if (ret) - goto out; - ret = btrfs_test_qgroups(); - if (ret) - goto out; - ret = btrfs_test_free_space_tree(); + for (i = 0; i < ARRAY_SIZE(test_sectorsize); i++) { + sectorsize = test_sectorsize[i]; + for (nodesize = sectorsize; + nodesize <= BTRFS_MAX_METADATA_BLOCKSIZE; + nodesize <<= 1) { + pr_info("BTRFS: selftest: sectorsize: %u nodesize: %u\n", + sectorsize, nodesize); + ret = btrfs_test_free_space_cache(sectorsize, nodesize); + if (ret) + goto out; + ret = btrfs_test_extent_buffer_operations(sectorsize, + nodesize); + if (ret) + goto out; + ret = btrfs_test_extent_io(sectorsize, nodesize); + if (ret) + goto out; + ret = btrfs_test_inodes(sectorsize, nodesize); + if (ret) + goto out; + ret = btrfs_test_qgroups(sectorsize, nodesize); + if (ret) + goto out; + ret = btrfs_test_free_space_tree(sectorsize, nodesize); + if (ret) + goto out; + } + } out: btrfs_destroy_test_fs(); return ret; diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 539e7b5e3f86..4879656bda3c 100644 --- a/fs/btrfs/sysfs.c +++ 
b/fs/btrfs/sysfs.c @@ -120,6 +120,9 @@ static ssize_t btrfs_feature_attr_store(struct kobject *kobj, if (!fs_info) return -EPERM; + if (fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + ret = kstrtoul(skip_spaces(buf), 0, &val); if (ret) return ret; @@ -364,7 +367,13 @@ static ssize_t btrfs_label_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = to_fs_info(kobj); char *label = fs_info->super_copy->label; - return snprintf(buf, PAGE_SIZE, label[0] ? "%s\n" : "%s", label); + ssize_t ret; + + spin_lock(&fs_info->super_lock); + ret = snprintf(buf, PAGE_SIZE, label[0] ? "%s\n" : "%s", label); + spin_unlock(&fs_info->super_lock); + + return ret; } static ssize_t btrfs_label_store(struct kobject *kobj, @@ -374,6 +383,9 @@ static ssize_t btrfs_label_store(struct kobject *kobj, struct btrfs_fs_info *fs_info = to_fs_info(kobj); size_t p_len; + if (!fs_info) + return -EPERM; + if (fs_info->sb->s_flags & MS_RDONLY) return -EROFS; diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index f54bf450bad3..02223f3f78f4 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -68,7 +68,7 @@ int btrfs_init_test_fs(void) if (IS_ERR(test_mnt)) { printk(KERN_ERR "btrfs: cannot mount test file system\n"); unregister_filesystem(&test_type); - return ret; + return PTR_ERR(test_mnt); } return 0; } @@ -175,7 +175,7 @@ void btrfs_free_dummy_root(struct btrfs_root *root) } struct btrfs_block_group_cache * -btrfs_alloc_dummy_block_group(unsigned long length) +btrfs_alloc_dummy_block_group(unsigned long length, u32 sectorsize) { struct btrfs_block_group_cache *cache; @@ -192,8 +192,8 @@ btrfs_alloc_dummy_block_group(unsigned long length) cache->key.objectid = 0; cache->key.offset = length; cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; - cache->sectorsize = 4096; - cache->full_stripe_len = 4096; + cache->sectorsize = sectorsize; + cache->full_stripe_len = sectorsize; INIT_LIST_HEAD(&cache->list); INIT_LIST_HEAD(&cache->cluster_list); diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h index 054b8c73c951..66fb6b701eb7 100644 --- a/fs/btrfs/tests/btrfs-tests.h +++ b/fs/btrfs/tests/btrfs-tests.h @@ -26,27 +26,28 @@ struct btrfs_root; struct btrfs_trans_handle; -int btrfs_test_free_space_cache(void); -int btrfs_test_extent_buffer_operations(void); -int btrfs_test_extent_io(void); -int btrfs_test_inodes(void); -int btrfs_test_qgroups(void); -int btrfs_test_free_space_tree(void); +int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize); +int btrfs_test_extent_buffer_operations(u32 sectorsize, u32 nodesize); +int btrfs_test_extent_io(u32 sectorsize, u32 nodesize); +int btrfs_test_inodes(u32 sectorsize, u32 nodesize); +int btrfs_test_qgroups(u32 sectorsize, u32 nodesize); +int btrfs_test_free_space_tree(u32 sectorsize, u32 nodesize); int btrfs_init_test_fs(void); void btrfs_destroy_test_fs(void); struct inode *btrfs_new_test_inode(void); struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void); void btrfs_free_dummy_root(struct btrfs_root *root); struct btrfs_block_group_cache * -btrfs_alloc_dummy_block_group(unsigned long length); +btrfs_alloc_dummy_block_group(unsigned long length, u32 sectorsize); void btrfs_free_dummy_block_group(struct btrfs_block_group_cache *cache); void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans); #else -static inline int btrfs_test_free_space_cache(void) +static inline int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize) { return 0; } -static inline int btrfs_test_extent_buffer_operations(void) +static 
inline int btrfs_test_extent_buffer_operations(u32 sectorsize, + u32 nodesize) { return 0; } @@ -57,19 +58,19 @@ static inline int btrfs_init_test_fs(void) static inline void btrfs_destroy_test_fs(void) { } -static inline int btrfs_test_extent_io(void) +static inline int btrfs_test_extent_io(u32 sectorsize, u32 nodesize) { return 0; } -static inline int btrfs_test_inodes(void) +static inline int btrfs_test_inodes(u32 sectorsize, u32 nodesize) { return 0; } -static inline int btrfs_test_qgroups(void) +static inline int btrfs_test_qgroups(u32 sectorsize, u32 nodesize) { return 0; } -static inline int btrfs_test_free_space_tree(void) +static inline int btrfs_test_free_space_tree(u32 sectorsize, u32 nodesize) { return 0; } diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c index f51963a8f929..4f8cbd1ec5ee 100644 --- a/fs/btrfs/tests/extent-buffer-tests.c +++ b/fs/btrfs/tests/extent-buffer-tests.c @@ -22,7 +22,7 @@ #include "../extent_io.h" #include "../disk-io.h" -static int test_btrfs_split_item(void) +static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) { struct btrfs_path *path; struct btrfs_root *root; @@ -40,7 +40,7 @@ static int test_btrfs_split_item(void) test_msg("Running btrfs_split_item tests\n"); - root = btrfs_alloc_dummy_root(); + root = btrfs_alloc_dummy_root(sectorsize, nodesize); if (IS_ERR(root)) { test_msg("Could not allocate root\n"); return PTR_ERR(root); @@ -53,7 +53,8 @@ static int test_btrfs_split_item(void) return -ENOMEM; } - path->nodes[0] = eb = alloc_dummy_extent_buffer(NULL, 4096); + path->nodes[0] = eb = alloc_dummy_extent_buffer(NULL, nodesize, + nodesize); if (!eb) { test_msg("Could not allocate dummy buffer\n"); ret = -ENOMEM; @@ -222,8 +223,8 @@ out: return ret; } -int btrfs_test_extent_buffer_operations(void) +int btrfs_test_extent_buffer_operations(u32 sectorsize, u32 nodesize) { - test_msg("Running extent buffer operation tests"); - return test_btrfs_split_item(); + test_msg("Running extent buffer operation tests\n"); + return test_btrfs_split_item(sectorsize, nodesize); } diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index 70948b13bc81..d19ab0317283 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -21,6 +21,7 @@ #include <linux/slab.h> #include <linux/sizes.h> #include "btrfs-tests.h" +#include "../ctree.h" #include "../extent_io.h" #define PROCESS_UNLOCK (1 << 0) @@ -65,7 +66,7 @@ static noinline int process_page_range(struct inode *inode, u64 start, u64 end, return count; } -static int test_find_delalloc(void) +static int test_find_delalloc(u32 sectorsize) { struct inode *inode; struct extent_io_tree tmp; @@ -113,7 +114,7 @@ static int test_find_delalloc(void) * |--- delalloc ---| * |--- search ---| */ - set_extent_delalloc(&tmp, 0, 4095, NULL, GFP_KERNEL); + set_extent_delalloc(&tmp, 0, sectorsize - 1, NULL); start = 0; end = 0; found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, @@ -122,9 +123,9 @@ static int test_find_delalloc(void) test_msg("Should have found at least one delalloc\n"); goto out_bits; } - if (start != 0 || end != 4095) { - test_msg("Expected start 0 end 4095, got start %Lu end %Lu\n", - start, end); + if (start != 0 || end != (sectorsize - 1)) { + test_msg("Expected start 0 end %u, got start %llu end %llu\n", + sectorsize - 1, start, end); goto out_bits; } unlock_extent(&tmp, start, end); @@ -144,7 +145,7 @@ static int test_find_delalloc(void) test_msg("Couldn't find the locked page\n"); goto 
out_bits; } - set_extent_delalloc(&tmp, 4096, max_bytes - 1, NULL, GFP_KERNEL); + set_extent_delalloc(&tmp, sectorsize, max_bytes - 1, NULL); start = test_start; end = 0; found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, @@ -172,11 +173,11 @@ static int test_find_delalloc(void) * |--- delalloc ---| * |--- search ---| */ - test_start = max_bytes + 4096; + test_start = max_bytes + sectorsize; locked_page = find_lock_page(inode->i_mapping, test_start >> PAGE_SHIFT); if (!locked_page) { - test_msg("Could'nt find the locked page\n"); + test_msg("Couldn't find the locked page\n"); goto out_bits; } start = test_start; @@ -199,7 +200,7 @@ static int test_find_delalloc(void) * * We are re-using our test_start from above since it works out well. */ - set_extent_delalloc(&tmp, max_bytes, total_dirty - 1, NULL, GFP_KERNEL); + set_extent_delalloc(&tmp, max_bytes, total_dirty - 1, NULL); start = test_start; end = 0; found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, @@ -262,7 +263,7 @@ static int test_find_delalloc(void) } ret = 0; out_bits: - clear_extent_bits(&tmp, 0, total_dirty - 1, (unsigned)-1, GFP_KERNEL); + clear_extent_bits(&tmp, 0, total_dirty - 1, (unsigned)-1); out: if (locked_page) put_page(locked_page); @@ -272,6 +273,16 @@ out: return ret; } +/** + * test_bit_in_byte - Determine whether a bit is set in a byte + * @nr: bit number to test + * @addr: Address to start counting from + */ +static inline int test_bit_in_byte(int nr, const u8 *addr) +{ + return 1UL & (addr[nr / BITS_PER_BYTE] >> (nr & (BITS_PER_BYTE - 1))); +} + static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb, unsigned long len) { @@ -298,25 +309,29 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb, return -EINVAL; } - bitmap_set(bitmap, (PAGE_SIZE - sizeof(long) / 2) * BITS_PER_BYTE, - sizeof(long) * BITS_PER_BYTE); - extent_buffer_bitmap_set(eb, PAGE_SIZE - sizeof(long) / 2, 0, - sizeof(long) * BITS_PER_BYTE); - if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) { - test_msg("Setting straddling pages failed\n"); - return -EINVAL; - } + /* Straddling pages test */ + if (len > PAGE_SIZE) { + bitmap_set(bitmap, + (PAGE_SIZE - sizeof(long) / 2) * BITS_PER_BYTE, + sizeof(long) * BITS_PER_BYTE); + extent_buffer_bitmap_set(eb, PAGE_SIZE - sizeof(long) / 2, 0, + sizeof(long) * BITS_PER_BYTE); + if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) { + test_msg("Setting straddling pages failed\n"); + return -EINVAL; + } - bitmap_set(bitmap, 0, len * BITS_PER_BYTE); - bitmap_clear(bitmap, - (PAGE_SIZE - sizeof(long) / 2) * BITS_PER_BYTE, - sizeof(long) * BITS_PER_BYTE); - extent_buffer_bitmap_set(eb, 0, 0, len * BITS_PER_BYTE); - extent_buffer_bitmap_clear(eb, PAGE_SIZE - sizeof(long) / 2, 0, - sizeof(long) * BITS_PER_BYTE); - if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) { - test_msg("Clearing straddling pages failed\n"); - return -EINVAL; + bitmap_set(bitmap, 0, len * BITS_PER_BYTE); + bitmap_clear(bitmap, + (PAGE_SIZE - sizeof(long) / 2) * BITS_PER_BYTE, + sizeof(long) * BITS_PER_BYTE); + extent_buffer_bitmap_set(eb, 0, 0, len * BITS_PER_BYTE); + extent_buffer_bitmap_clear(eb, PAGE_SIZE - sizeof(long) / 2, 0, + sizeof(long) * BITS_PER_BYTE); + if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) { + test_msg("Clearing straddling pages failed\n"); + return -EINVAL; + } } /* @@ -333,7 +348,7 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb, for (i = 0; i < len * BITS_PER_BYTE; i++) { int bit, bit1; - bit = 
!!test_bit(i, bitmap); + bit = !!test_bit_in_byte(i, (u8 *)bitmap); bit1 = !!extent_buffer_test_bit(eb, 0, i); if (bit1 != bit) { test_msg("Testing bit pattern failed\n"); @@ -351,15 +366,22 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb, return 0; } -static int test_eb_bitmaps(void) +static int test_eb_bitmaps(u32 sectorsize, u32 nodesize) { - unsigned long len = PAGE_SIZE * 4; + unsigned long len; unsigned long *bitmap; struct extent_buffer *eb; int ret; test_msg("Running extent buffer bitmap tests\n"); + /* + * In ppc64, sectorsize can be 64K, thus 4 * 64K will be larger than + * BTRFS_MAX_METADATA_BLOCKSIZE. + */ + len = (sectorsize < BTRFS_MAX_METADATA_BLOCKSIZE) + ? sectorsize * 4 : sectorsize; + bitmap = kmalloc(len, GFP_KERNEL); if (!bitmap) { test_msg("Couldn't allocate test bitmap\n"); @@ -379,7 +401,7 @@ static int test_eb_bitmaps(void) /* Do it over again with an extent buffer which isn't page-aligned. */ free_extent_buffer(eb); - eb = __alloc_dummy_extent_buffer(NULL, PAGE_SIZE / 2, len); + eb = __alloc_dummy_extent_buffer(NULL, nodesize / 2, len); if (!eb) { test_msg("Couldn't allocate test extent buffer\n"); kfree(bitmap); @@ -393,17 +415,17 @@ out: return ret; } -int btrfs_test_extent_io(void) +int btrfs_test_extent_io(u32 sectorsize, u32 nodesize) { int ret; test_msg("Running extent I/O tests\n"); - ret = test_find_delalloc(); + ret = test_find_delalloc(sectorsize); if (ret) goto out; - ret = test_eb_bitmaps(); + ret = test_eb_bitmaps(sectorsize, nodesize); out: test_msg("Extent I/O tests finished\n"); return ret; diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c index 514247515312..3956bb2ff84c 100644 --- a/fs/btrfs/tests/free-space-tests.c +++ b/fs/btrfs/tests/free-space-tests.c @@ -22,10 +22,10 @@ #include "../disk-io.h" #include "../free-space-cache.h" -#define BITS_PER_BITMAP (PAGE_SIZE * 8) +#define BITS_PER_BITMAP (PAGE_SIZE * 8UL) /* - * This test just does basic sanity checking, making sure we can add an exten + * This test just does basic sanity checking, making sure we can add an extent * entry and remove space from either end and the middle, and make sure we can * remove space that covers adjacent extent entries. */ @@ -99,7 +99,8 @@ static int test_extents(struct btrfs_block_group_cache *cache) return 0; } -static int test_bitmaps(struct btrfs_block_group_cache *cache) +static int test_bitmaps(struct btrfs_block_group_cache *cache, + u32 sectorsize) { u64 next_bitmap_offset; int ret; @@ -139,7 +140,7 @@ static int test_bitmaps(struct btrfs_block_group_cache *cache) * The first bitmap we have starts at offset 0 so the next one is just * at the end of the first bitmap. 
*/ - next_bitmap_offset = (u64)(BITS_PER_BITMAP * 4096); + next_bitmap_offset = (u64)(BITS_PER_BITMAP * sectorsize); /* Test a bit straddling two bitmaps */ ret = test_add_free_space_entry(cache, next_bitmap_offset - SZ_2M, @@ -167,9 +168,10 @@ static int test_bitmaps(struct btrfs_block_group_cache *cache) } /* This is the high grade jackassery */ -static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache) +static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache, + u32 sectorsize) { - u64 bitmap_offset = (u64)(BITS_PER_BITMAP * 4096); + u64 bitmap_offset = (u64)(BITS_PER_BITMAP * sectorsize); int ret; test_msg("Running bitmap and extent tests\n"); @@ -396,11 +398,13 @@ static int check_cache_empty(struct btrfs_block_group_cache *cache) * wasn't optimal as they could be spread all over the block group while under * concurrency (extra overhead and fragmentation). * - * This stealing approach is benefical, since we always prefer to allocate from - * extent entries, both for clustered and non-clustered allocation requests. + * This stealing approach is beneficial, since we always prefer to allocate + * from extent entries, both for clustered and non-clustered allocation + * requests. */ static int -test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) +test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache, + u32 sectorsize) { int ret; u64 offset; @@ -538,7 +542,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) * The goal is to test that the bitmap entry space stealing doesn't * steal this space region. */ - ret = btrfs_add_free_space(cache, SZ_128M + SZ_16M, 4096); + ret = btrfs_add_free_space(cache, SZ_128M + SZ_16M, sectorsize); if (ret) { test_msg("Error adding free space: %d\n", ret); return ret; @@ -596,8 +600,8 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) return -ENOENT; } - if (cache->free_space_ctl->free_space != (SZ_1M + 4096)) { - test_msg("Cache free space is not 1Mb + 4Kb\n"); + if (cache->free_space_ctl->free_space != (SZ_1M + sectorsize)) { + test_msg("Cache free space is not 1Mb + %u\n", sectorsize); return -EINVAL; } @@ -610,22 +614,25 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) return -EINVAL; } - /* All that remains is a 4Kb free space region in a bitmap. Confirm. */ + /* + * All that remains is a sectorsize free space region in a bitmap. + * Confirm. + */ ret = check_num_extents_and_bitmaps(cache, 1, 1); if (ret) return ret; - if (cache->free_space_ctl->free_space != 4096) { - test_msg("Cache free space is not 4Kb\n"); + if (cache->free_space_ctl->free_space != sectorsize) { + test_msg("Cache free space is not %u\n", sectorsize); return -EINVAL; } offset = btrfs_find_space_for_alloc(cache, - 0, 4096, 0, + 0, sectorsize, 0, &max_extent_size); if (offset != (SZ_128M + SZ_16M)) { - test_msg("Failed to allocate 4Kb from space cache, returned offset is: %llu\n", - offset); + test_msg("Failed to allocate %u, returned offset : %llu\n", + sectorsize, offset); return -EINVAL; } @@ -732,7 +739,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) * The goal is to test that the bitmap entry space stealing doesn't * steal this space region. 
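The 8UL suffix added to BITS_PER_BITMAP above matters exactly at multiplies like next_bitmap_offset: with a plain int constant, BITS_PER_BITMAP * sectorsize is a 32-bit multiply that wraps on 64K-page configurations before any cast to u64 can help. A minimal standalone demonstration (values chosen to match a 64K-page ppc64 setup; not kernel code):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint32_t page_size = 65536;     /* 64K pages */
        uint32_t sectorsize = 65536;

        /* old macro: 32-bit arithmetic, 524288 * 65536 == 2^35 wraps to 0 */
        uint32_t wrapped = (page_size * 8) * sectorsize;

        /* new macro: the UL suffix widens the multiply before it can wrap
         * (on LP64 targets, where unsigned long is 64 bits) */
        uint64_t widened = (page_size * 8UL) * sectorsize;

        printf("wrapped=%u widened=%llu\n",
               wrapped, (unsigned long long)widened);   /* 0 vs 2^35 */
        return 0;
}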
*/ - ret = btrfs_add_free_space(cache, SZ_32M, 8192); + ret = btrfs_add_free_space(cache, SZ_32M, 2 * sectorsize); if (ret) { test_msg("Error adding free space: %d\n", ret); return ret; @@ -756,7 +763,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) /* * Confirm that our extent entry didn't stole all free space from the - * bitmap, because of the small 8Kb free space region. + * bitmap, because of the small 2 * sectorsize free space region. */ ret = check_num_extents_and_bitmaps(cache, 2, 1); if (ret) @@ -782,8 +789,8 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) return -ENOENT; } - if (cache->free_space_ctl->free_space != (SZ_1M + 8192)) { - test_msg("Cache free space is not 1Mb + 8Kb\n"); + if (cache->free_space_ctl->free_space != (SZ_1M + 2 * sectorsize)) { + test_msg("Cache free space is not 1Mb + %u\n", 2 * sectorsize); return -EINVAL; } @@ -795,21 +802,25 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) return -EINVAL; } - /* All that remains is a 8Kb free space region in a bitmap. Confirm. */ + /* + * All that remains is 2 * sectorsize free space region + * in a bitmap. Confirm. + */ ret = check_num_extents_and_bitmaps(cache, 1, 1); if (ret) return ret; - if (cache->free_space_ctl->free_space != 8192) { - test_msg("Cache free space is not 8Kb\n"); + if (cache->free_space_ctl->free_space != 2 * sectorsize) { + test_msg("Cache free space is not %u\n", 2 * sectorsize); return -EINVAL; } offset = btrfs_find_space_for_alloc(cache, - 0, 8192, 0, + 0, 2 * sectorsize, 0, &max_extent_size); if (offset != SZ_32M) { - test_msg("Failed to allocate 8Kb from space cache, returned offset is: %llu\n", + test_msg("Failed to allocate %u, offset: %llu\n", + 2 * sectorsize, offset); return -EINVAL; } @@ -824,7 +835,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) return 0; } -int btrfs_test_free_space_cache(void) +int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize) { struct btrfs_block_group_cache *cache; struct btrfs_root *root = NULL; @@ -832,13 +843,19 @@ int btrfs_test_free_space_cache(void) test_msg("Running btrfs free space cache tests\n"); - cache = btrfs_alloc_dummy_block_group(1024 * 1024 * 1024); + /* + * For ppc64 (with 64k page size), bytes per bitmap might be + * larger than 1G. To make bitmap test available in ppc64, + * alloc dummy block group whose size cross bitmaps. + */ + cache = btrfs_alloc_dummy_block_group(BITS_PER_BITMAP * sectorsize + + PAGE_SIZE, sectorsize); if (!cache) { test_msg("Couldn't run the tests\n"); return 0; } - root = btrfs_alloc_dummy_root(); + root = btrfs_alloc_dummy_root(sectorsize, nodesize); if (IS_ERR(root)) { ret = PTR_ERR(root); goto out; @@ -854,14 +871,14 @@ int btrfs_test_free_space_cache(void) ret = test_extents(cache); if (ret) goto out; - ret = test_bitmaps(cache); + ret = test_bitmaps(cache, sectorsize); if (ret) goto out; - ret = test_bitmaps_and_extents(cache); + ret = test_bitmaps_and_extents(cache, sectorsize); if (ret) goto out; - ret = test_steal_space_from_bitmap_to_extent(cache); + ret = test_steal_space_from_bitmap_to_extent(cache, sectorsize); out: btrfs_free_dummy_block_group(cache); btrfs_free_dummy_root(root); diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c index 7cea4462acd5..aac507085ab0 100644 --- a/fs/btrfs/tests/free-space-tree-tests.c +++ b/fs/btrfs/tests/free-space-tree-tests.c @@ -16,6 +16,7 @@ * Boston, MA 021110-1307, USA. 
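The btrfs_test_free_space_cache() hunk above sizes the dummy block group as BITS_PER_BITMAP * sectorsize + PAGE_SIZE precisely so it crosses a bitmap boundary on every configuration. To see why the old fixed 1G group stops working, here is the arithmetic for a 64K-page, 64K-sector setup (standalone sketch, illustrative values):

#include <stdio.h>

#define BITS_PER_BYTE 8

int main(void)
{
        unsigned long long page_size = 65536;   /* 64K pages, e.g. ppc64 */
        unsigned long long sectorsize = 65536;
        unsigned long long bits_per_bitmap = page_size * BITS_PER_BYTE;
        unsigned long long bytes_per_bitmap = bits_per_bitmap * sectorsize;

        /* 2^35 bytes == 32 GiB: far more than a fixed 1G group, so a 1G
         * group would never exercise the bitmap-boundary paths here */
        printf("one bitmap covers %llu bytes (%llu GiB)\n",
               bytes_per_bitmap, bytes_per_bitmap >> 30);
        printf("dummy block group: %llu bytes\n",
               bytes_per_bitmap + page_size);
        return 0;
}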
*/ +#include <linux/types.h> #include "btrfs-tests.h" #include "../ctree.h" #include "../disk-io.h" @@ -30,7 +31,7 @@ struct free_space_extent { * The test cases align their operations to this in order to hit some of the * edge cases in the bitmap code. */ -#define BITMAP_RANGE (BTRFS_FREE_SPACE_BITMAP_BITS * 4096) +#define BITMAP_RANGE (BTRFS_FREE_SPACE_BITMAP_BITS * PAGE_SIZE) static int __check_free_space_extents(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, @@ -439,7 +440,8 @@ typedef int (*test_func_t)(struct btrfs_trans_handle *, struct btrfs_block_group_cache *, struct btrfs_path *); -static int run_test(test_func_t test_func, int bitmaps) +static int run_test(test_func_t test_func, int bitmaps, + u32 sectorsize, u32 nodesize) { struct btrfs_root *root = NULL; struct btrfs_block_group_cache *cache = NULL; @@ -447,7 +449,7 @@ static int run_test(test_func_t test_func, int bitmaps) struct btrfs_path *path = NULL; int ret; - root = btrfs_alloc_dummy_root(); + root = btrfs_alloc_dummy_root(sectorsize, nodesize); if (IS_ERR(root)) { test_msg("Couldn't allocate dummy root\n"); ret = PTR_ERR(root); @@ -466,7 +468,8 @@ static int run_test(test_func_t test_func, int bitmaps) root->fs_info->free_space_root = root; root->fs_info->tree_root = root; - root->node = alloc_test_extent_buffer(root->fs_info, 4096); + root->node = alloc_test_extent_buffer(root->fs_info, + nodesize, nodesize); if (!root->node) { test_msg("Couldn't allocate dummy buffer\n"); ret = -ENOMEM; @@ -474,9 +477,9 @@ static int run_test(test_func_t test_func, int bitmaps) } btrfs_set_header_level(root->node, 0); btrfs_set_header_nritems(root->node, 0); - root->alloc_bytenr += 8192; + root->alloc_bytenr += 2 * nodesize; - cache = btrfs_alloc_dummy_block_group(8 * BITMAP_RANGE); + cache = btrfs_alloc_dummy_block_group(8 * BITMAP_RANGE, sectorsize); if (!cache) { test_msg("Couldn't allocate dummy block group cache\n"); ret = -ENOMEM; @@ -534,17 +537,18 @@ out: return ret; } -static int run_test_both_formats(test_func_t test_func) +static int run_test_both_formats(test_func_t test_func, + u32 sectorsize, u32 nodesize) { int ret; - ret = run_test(test_func, 0); + ret = run_test(test_func, 0, sectorsize, nodesize); if (ret) return ret; - return run_test(test_func, 1); + return run_test(test_func, 1, sectorsize, nodesize); } -int btrfs_test_free_space_tree(void) +int btrfs_test_free_space_tree(u32 sectorsize, u32 nodesize) { test_func_t tests[] = { test_empty_block_group, @@ -561,9 +565,11 @@ int btrfs_test_free_space_tree(void) test_msg("Running free space tree tests\n"); for (i = 0; i < ARRAY_SIZE(tests); i++) { - int ret = run_test_both_formats(tests[i]); + int ret = run_test_both_formats(tests[i], sectorsize, + nodesize); if (ret) { - test_msg("%pf failed\n", tests[i]); + test_msg("%pf : sectorsize %u failed\n", + tests[i], sectorsize); return ret; } } diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index 863a6a3af1f8..29648c0a39f1 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -16,6 +16,7 @@ * Boston, MA 021110-1307, USA. 
*/ +#include <linux/types.h> #include "btrfs-tests.h" #include "../ctree.h" #include "../btrfs_inode.h" @@ -86,19 +87,19 @@ static void insert_inode_item_key(struct btrfs_root *root) * diagram of how the extents will look though this may not be possible we still * want to make sure everything acts normally (the last number is not inclusive) * - * [0 - 5][5 - 6][6 - 10][10 - 4096][ 4096 - 8192 ][8192 - 12288] - * [hole ][inline][ hole ][ regular ][regular1 split][ hole ] + * [0 - 5][5 - 6][ 6 - 4096 ][ 4096 - 4100][4100 - 8195][8195 - 12291] + * [hole ][inline][hole but no extent][ hole ][ regular ][regular1 split] * - * [ 12288 - 20480][20480 - 24576][ 24576 - 28672 ][28672 - 36864][36864 - 45056] - * [regular1 split][ prealloc1 ][prealloc1 written][ prealloc1 ][ compressed ] + * [12291 - 16387][16387 - 24579][24579 - 28675][ 28675 - 32771][32771 - 36867 ] + * [ hole ][regular1 split][ prealloc ][ prealloc1 ][prealloc1 written] * - * [45056 - 49152][49152-53248][53248-61440][61440-65536][ 65536+81920 ] - * [ compressed1 ][ regular ][compressed1][ regular ][ hole but no extent] + * [36867 - 45059][45059 - 53251][53251 - 57347][57347 - 61443][61443- 69635] + * [ prealloc1 ][ compressed ][ compressed1 ][ regular ][ compressed1] * - * [81920-86016] - * [ regular ] + * [69635-73731][ 73731 - 86019 ][86019-90115] + * [ regular ][ hole but no extent][ regular ] */ -static void setup_file_extents(struct btrfs_root *root) +static void setup_file_extents(struct btrfs_root *root, u32 sectorsize) { int slot = 0; u64 disk_bytenr = SZ_1M; @@ -119,7 +120,7 @@ static void setup_file_extents(struct btrfs_root *root) insert_extent(root, offset, 1, 1, 0, 0, 0, BTRFS_FILE_EXTENT_INLINE, 0, slot); slot++; - offset = 4096; + offset = sectorsize; /* Now another hole */ insert_extent(root, offset, 4, 4, 0, 0, 0, BTRFS_FILE_EXTENT_REG, 0, @@ -128,99 +129,106 @@ static void setup_file_extents(struct btrfs_root *root) offset += 4; /* Now for a regular extent */ - insert_extent(root, offset, 4095, 4095, 0, disk_bytenr, 4096, - BTRFS_FILE_EXTENT_REG, 0, slot); + insert_extent(root, offset, sectorsize - 1, sectorsize - 1, 0, + disk_bytenr, sectorsize, BTRFS_FILE_EXTENT_REG, 0, slot); slot++; - disk_bytenr += 4096; - offset += 4095; + disk_bytenr += sectorsize; + offset += sectorsize - 1; /* * Now for 3 extents that were split from a hole punch so we test * offsets properly. 
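The updated extent-layout diagram above is easiest to sanity-check by replaying the offset arithmetic that setup_file_extents() performs; for a 4K sectorsize it reproduces the 4100/8195/12291 boundaries in the comment. A standalone sketch covering just the first few steps (plain C, illustrative only):

#include <stdio.h>

int main(void)
{
        unsigned int sectorsize = 4096;
        unsigned long long offset = 0;

        offset += 5;                    /* [0,5) hole */
        offset += 1;                    /* [5,6) inline */
        offset = sectorsize;            /* hole-but-no-extent up to 4096 */
        offset += 4;                    /* [4096,4100) hole */
        printf("regular extent starts at %llu\n", offset);     /* 4100 */
        offset += sectorsize - 1;       /* regular, len sectorsize - 1 */
        printf("regular1 split starts at %llu\n", offset);     /* 8195 */
        offset += sectorsize;           /* first split piece */
        printf("hole piece starts at %llu\n", offset);         /* 12291 */
        return 0;
}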
*/ - insert_extent(root, offset, 4096, 16384, 0, disk_bytenr, 16384, - BTRFS_FILE_EXTENT_REG, 0, slot); + insert_extent(root, offset, sectorsize, 4 * sectorsize, 0, disk_bytenr, + 4 * sectorsize, BTRFS_FILE_EXTENT_REG, 0, slot); slot++; - offset += 4096; - insert_extent(root, offset, 4096, 4096, 0, 0, 0, BTRFS_FILE_EXTENT_REG, - 0, slot); + offset += sectorsize; + insert_extent(root, offset, sectorsize, sectorsize, 0, 0, 0, + BTRFS_FILE_EXTENT_REG, 0, slot); slot++; - offset += 4096; - insert_extent(root, offset, 8192, 16384, 8192, disk_bytenr, 16384, + offset += sectorsize; + insert_extent(root, offset, 2 * sectorsize, 4 * sectorsize, + 2 * sectorsize, disk_bytenr, 4 * sectorsize, BTRFS_FILE_EXTENT_REG, 0, slot); slot++; - offset += 8192; - disk_bytenr += 16384; + offset += 2 * sectorsize; + disk_bytenr += 4 * sectorsize; /* Now for a unwritten prealloc extent */ - insert_extent(root, offset, 4096, 4096, 0, disk_bytenr, 4096, - BTRFS_FILE_EXTENT_PREALLOC, 0, slot); + insert_extent(root, offset, sectorsize, sectorsize, 0, disk_bytenr, + sectorsize, BTRFS_FILE_EXTENT_PREALLOC, 0, slot); slot++; - offset += 4096; + offset += sectorsize; /* * We want to jack up disk_bytenr a little more so the em stuff doesn't * merge our records. */ - disk_bytenr += 8192; + disk_bytenr += 2 * sectorsize; /* * Now for a partially written prealloc extent, basically the same as * the hole punch example above. Ram_bytes never changes when you mark * extents written btw. */ - insert_extent(root, offset, 4096, 16384, 0, disk_bytenr, 16384, - BTRFS_FILE_EXTENT_PREALLOC, 0, slot); + insert_extent(root, offset, sectorsize, 4 * sectorsize, 0, disk_bytenr, + 4 * sectorsize, BTRFS_FILE_EXTENT_PREALLOC, 0, slot); slot++; - offset += 4096; - insert_extent(root, offset, 4096, 16384, 4096, disk_bytenr, 16384, - BTRFS_FILE_EXTENT_REG, 0, slot); + offset += sectorsize; + insert_extent(root, offset, sectorsize, 4 * sectorsize, sectorsize, + disk_bytenr, 4 * sectorsize, BTRFS_FILE_EXTENT_REG, 0, + slot); slot++; - offset += 4096; - insert_extent(root, offset, 8192, 16384, 8192, disk_bytenr, 16384, + offset += sectorsize; + insert_extent(root, offset, 2 * sectorsize, 4 * sectorsize, + 2 * sectorsize, disk_bytenr, 4 * sectorsize, BTRFS_FILE_EXTENT_PREALLOC, 0, slot); slot++; - offset += 8192; - disk_bytenr += 16384; + offset += 2 * sectorsize; + disk_bytenr += 4 * sectorsize; /* Now a normal compressed extent */ - insert_extent(root, offset, 8192, 8192, 0, disk_bytenr, 4096, - BTRFS_FILE_EXTENT_REG, BTRFS_COMPRESS_ZLIB, slot); + insert_extent(root, offset, 2 * sectorsize, 2 * sectorsize, 0, + disk_bytenr, sectorsize, BTRFS_FILE_EXTENT_REG, + BTRFS_COMPRESS_ZLIB, slot); slot++; - offset += 8192; + offset += 2 * sectorsize; /* No merges */ - disk_bytenr += 8192; + disk_bytenr += 2 * sectorsize; /* Now a split compressed extent */ - insert_extent(root, offset, 4096, 16384, 0, disk_bytenr, 4096, - BTRFS_FILE_EXTENT_REG, BTRFS_COMPRESS_ZLIB, slot); + insert_extent(root, offset, sectorsize, 4 * sectorsize, 0, disk_bytenr, + sectorsize, BTRFS_FILE_EXTENT_REG, + BTRFS_COMPRESS_ZLIB, slot); slot++; - offset += 4096; - insert_extent(root, offset, 4096, 4096, 0, disk_bytenr + 4096, 4096, + offset += sectorsize; + insert_extent(root, offset, sectorsize, sectorsize, 0, + disk_bytenr + sectorsize, sectorsize, BTRFS_FILE_EXTENT_REG, 0, slot); slot++; - offset += 4096; - insert_extent(root, offset, 8192, 16384, 8192, disk_bytenr, 4096, + offset += sectorsize; + insert_extent(root, offset, 2 * sectorsize, 4 * sectorsize, + 2 * sectorsize, 
disk_bytenr, sectorsize, BTRFS_FILE_EXTENT_REG, BTRFS_COMPRESS_ZLIB, slot); slot++; - offset += 8192; - disk_bytenr += 8192; + offset += 2 * sectorsize; + disk_bytenr += 2 * sectorsize; /* Now extents that have a hole but no hole extent */ - insert_extent(root, offset, 4096, 4096, 0, disk_bytenr, 4096, - BTRFS_FILE_EXTENT_REG, 0, slot); + insert_extent(root, offset, sectorsize, sectorsize, 0, disk_bytenr, + sectorsize, BTRFS_FILE_EXTENT_REG, 0, slot); slot++; - offset += 16384; - disk_bytenr += 4096; - insert_extent(root, offset, 4096, 4096, 0, disk_bytenr, 4096, - BTRFS_FILE_EXTENT_REG, 0, slot); + offset += 4 * sectorsize; + disk_bytenr += sectorsize; + insert_extent(root, offset, sectorsize, sectorsize, 0, disk_bytenr, + sectorsize, BTRFS_FILE_EXTENT_REG, 0, slot); } static unsigned long prealloc_only = 0; static unsigned long compressed_only = 0; static unsigned long vacancy_only = 0; -static noinline int test_btrfs_get_extent(void) +static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) { struct inode *inode = NULL; struct btrfs_root *root = NULL; @@ -240,7 +248,7 @@ static noinline int test_btrfs_get_extent(void) BTRFS_I(inode)->location.objectid = BTRFS_FIRST_FREE_OBJECTID; BTRFS_I(inode)->location.offset = 0; - root = btrfs_alloc_dummy_root(); + root = btrfs_alloc_dummy_root(sectorsize, nodesize); if (IS_ERR(root)) { test_msg("Couldn't allocate root\n"); goto out; @@ -256,7 +264,7 @@ static noinline int test_btrfs_get_extent(void) goto out; } - root->node = alloc_dummy_extent_buffer(NULL, 4096); + root->node = alloc_dummy_extent_buffer(NULL, nodesize, nodesize); if (!root->node) { test_msg("Couldn't allocate dummy buffer\n"); goto out; @@ -264,7 +272,7 @@ static noinline int test_btrfs_get_extent(void) /* * We will just free a dummy node if it's ref count is 2 so we need an - * extra ref so our searches don't accidently release our page. + * extra ref so our searches don't accidentally release our page. */ extent_buffer_get(root->node); btrfs_set_header_nritems(root->node, 0); @@ -273,7 +281,7 @@ static noinline int test_btrfs_get_extent(void) /* First with no extents */ BTRFS_I(inode)->root = root; - em = btrfs_get_extent(inode, NULL, 0, 0, 4096, 0); + em = btrfs_get_extent(inode, NULL, 0, 0, sectorsize, 0); if (IS_ERR(em)) { em = NULL; test_msg("Got an error when we shouldn't have\n"); @@ -295,7 +303,7 @@ static noinline int test_btrfs_get_extent(void) * setup_file_extents, so if you change anything there you need to * update the comment and update the expected values below. 
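One expected value below deserves a word: the inline extent's data starts at offset 5 (after the 5-byte hole) and btrfs maps inline extents out to the end of their block, so the expected extent-map length becomes sectorsize - 5, which is where the old hard-coded 4091 came from. A trivial check of the arithmetic (standalone, illustrative):

#include <stdio.h>

int main(void)
{
        unsigned int sectorsize = 4096;
        unsigned long long start = 5;   /* inline data follows the 5-byte hole */

        /* 4091 on 4K sectors, 65531 on 64K sectors */
        printf("inline em: start=%llu len=%u\n", start, sectorsize - 5);
        return 0;
}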
*/ - setup_file_extents(root); + setup_file_extents(root, sectorsize); em = btrfs_get_extent(inode, NULL, 0, 0, (u64)-1, 0); if (IS_ERR(em)) { @@ -318,7 +326,7 @@ static noinline int test_btrfs_get_extent(void) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -327,7 +335,8 @@ static noinline int test_btrfs_get_extent(void) test_msg("Expected an inline, got %llu\n", em->block_start); goto out; } - if (em->start != offset || em->len != 4091) { + + if (em->start != offset || em->len != (sectorsize - 5)) { test_msg("Unexpected extent wanted start %llu len 1, got start " "%llu len %llu\n", offset, em->start, em->len); goto out; @@ -344,7 +353,7 @@ static noinline int test_btrfs_get_extent(void) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -366,7 +375,7 @@ static noinline int test_btrfs_get_extent(void) free_extent_map(em); /* Regular extent */ - em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -375,7 +384,7 @@ static noinline int test_btrfs_get_extent(void) test_msg("Expected a real extent, got %llu\n", em->block_start); goto out; } - if (em->start != offset || em->len != 4095) { + if (em->start != offset || em->len != sectorsize - 1) { test_msg("Unexpected extent wanted start %llu len 4095, got " "start %llu len %llu\n", offset, em->start, em->len); goto out; @@ -393,7 +402,7 @@ static noinline int test_btrfs_get_extent(void) free_extent_map(em); /* The next 3 are split extents */ - em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -402,9 +411,10 @@ static noinline int test_btrfs_get_extent(void) test_msg("Expected a real extent, got %llu\n", em->block_start); goto out; } - if (em->start != offset || em->len != 4096) { - test_msg("Unexpected extent wanted start %llu len 4096, got " - "start %llu len %llu\n", offset, em->start, em->len); + if (em->start != offset || em->len != sectorsize) { + test_msg("Unexpected extent start %llu len %u, " + "got start %llu len %llu\n", + offset, sectorsize, em->start, em->len); goto out; } if (em->flags != 0) { @@ -421,7 +431,7 @@ static noinline int test_btrfs_get_extent(void) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -430,9 +440,10 @@ static noinline int test_btrfs_get_extent(void) test_msg("Expected a hole, got %llu\n", em->block_start); goto out; } - if (em->start != offset || em->len != 4096) { - test_msg("Unexpected extent wanted start %llu len 4096, got " - "start %llu len %llu\n", offset, em->start, em->len); + if (em->start != offset || em->len != sectorsize) { + test_msg("Unexpected extent wanted start %llu len %u, " + "got start %llu len %llu\n", + offset, sectorsize, em->start, em->len); goto out; } if (em->flags != 0) { @@ -442,7 
+453,7 @@ static noinline int test_btrfs_get_extent(void) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -451,9 +462,10 @@ static noinline int test_btrfs_get_extent(void) test_msg("Expected a real extent, got %llu\n", em->block_start); goto out; } - if (em->start != offset || em->len != 8192) { - test_msg("Unexpected extent wanted start %llu len 8192, got " - "start %llu len %llu\n", offset, em->start, em->len); + if (em->start != offset || em->len != 2 * sectorsize) { + test_msg("Unexpected extent wanted start %llu len %u, " + "got start %llu len %llu\n", + offset, 2 * sectorsize, em->start, em->len); goto out; } if (em->flags != 0) { @@ -475,7 +487,7 @@ static noinline int test_btrfs_get_extent(void) free_extent_map(em); /* Prealloc extent */ - em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -484,9 +496,10 @@ static noinline int test_btrfs_get_extent(void) test_msg("Expected a real extent, got %llu\n", em->block_start); goto out; } - if (em->start != offset || em->len != 4096) { - test_msg("Unexpected extent wanted start %llu len 4096, got " - "start %llu len %llu\n", offset, em->start, em->len); + if (em->start != offset || em->len != sectorsize) { + test_msg("Unexpected extent wanted start %llu len %u, " + "got start %llu len %llu\n", + offset, sectorsize, em->start, em->len); goto out; } if (em->flags != prealloc_only) { @@ -503,7 +516,7 @@ static noinline int test_btrfs_get_extent(void) free_extent_map(em); /* The next 3 are a half written prealloc extent */ - em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -512,9 +525,10 @@ static noinline int test_btrfs_get_extent(void) test_msg("Expected a real extent, got %llu\n", em->block_start); goto out; } - if (em->start != offset || em->len != 4096) { - test_msg("Unexpected extent wanted start %llu len 4096, got " - "start %llu len %llu\n", offset, em->start, em->len); + if (em->start != offset || em->len != sectorsize) { + test_msg("Unexpected extent wanted start %llu len %u, " + "got start %llu len %llu\n", + offset, sectorsize, em->start, em->len); goto out; } if (em->flags != prealloc_only) { @@ -532,7 +546,7 @@ static noinline int test_btrfs_get_extent(void) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -541,9 +555,10 @@ static noinline int test_btrfs_get_extent(void) test_msg("Expected a real extent, got %llu\n", em->block_start); goto out; } - if (em->start != offset || em->len != 4096) { - test_msg("Unexpected extent wanted start %llu len 4096, got " - "start %llu len %llu\n", offset, em->start, em->len); + if (em->start != offset || em->len != sectorsize) { + test_msg("Unexpected extent wanted start %llu len %u, " + "got start %llu len %llu\n", + offset, sectorsize, em->start, em->len); goto out; } if (em->flags != 0) { @@ -564,7 +579,7 @@ static noinline int test_btrfs_get_extent(void) offset = em->start + em->len; 
free_extent_map(em); - em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -573,9 +588,10 @@ static noinline int test_btrfs_get_extent(void) test_msg("Expected a real extent, got %llu\n", em->block_start); goto out; } - if (em->start != offset || em->len != 8192) { - test_msg("Unexpected extent wanted start %llu len 8192, got " - "start %llu len %llu\n", offset, em->start, em->len); + if (em->start != offset || em->len != 2 * sectorsize) { + test_msg("Unexpected extent wanted start %llu len %u, " + "got start %llu len %llu\n", + offset, 2 * sectorsize, em->start, em->len); goto out; } if (em->flags != prealloc_only) { @@ -598,7 +614,7 @@ static noinline int test_btrfs_get_extent(void) free_extent_map(em); /* Now for the compressed extent */ - em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -607,9 +623,10 @@ static noinline int test_btrfs_get_extent(void) test_msg("Expected a real extent, got %llu\n", em->block_start); goto out; } - if (em->start != offset || em->len != 8192) { - test_msg("Unexpected extent wanted start %llu len 8192, got " - "start %llu len %llu\n", offset, em->start, em->len); + if (em->start != offset || em->len != 2 * sectorsize) { + test_msg("Unexpected extent wanted start %llu len %u," + "got start %llu len %llu\n", + offset, 2 * sectorsize, em->start, em->len); goto out; } if (em->flags != compressed_only) { @@ -631,7 +648,7 @@ static noinline int test_btrfs_get_extent(void) free_extent_map(em); /* Split compressed extent */ - em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -640,9 +657,10 @@ static noinline int test_btrfs_get_extent(void) test_msg("Expected a real extent, got %llu\n", em->block_start); goto out; } - if (em->start != offset || em->len != 4096) { - test_msg("Unexpected extent wanted start %llu len 4096, got " - "start %llu len %llu\n", offset, em->start, em->len); + if (em->start != offset || em->len != sectorsize) { + test_msg("Unexpected extent wanted start %llu len %u," + "got start %llu len %llu\n", + offset, sectorsize, em->start, em->len); goto out; } if (em->flags != compressed_only) { @@ -665,7 +683,7 @@ static noinline int test_btrfs_get_extent(void) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -674,9 +692,10 @@ static noinline int test_btrfs_get_extent(void) test_msg("Expected a real extent, got %llu\n", em->block_start); goto out; } - if (em->start != offset || em->len != 4096) { - test_msg("Unexpected extent wanted start %llu len 4096, got " - "start %llu len %llu\n", offset, em->start, em->len); + if (em->start != offset || em->len != sectorsize) { + test_msg("Unexpected extent wanted start %llu len %u, " + "got start %llu len %llu\n", + offset, sectorsize, em->start, em->len); goto out; } if (em->flags != 0) { @@ -691,7 +710,7 @@ static noinline int test_btrfs_get_extent(void) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); 
+ em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -701,9 +720,10 @@ static noinline int test_btrfs_get_extent(void) disk_bytenr, em->block_start); goto out; } - if (em->start != offset || em->len != 8192) { - test_msg("Unexpected extent wanted start %llu len 8192, got " - "start %llu len %llu\n", offset, em->start, em->len); + if (em->start != offset || em->len != 2 * sectorsize) { + test_msg("Unexpected extent wanted start %llu len %u, " + "got start %llu len %llu\n", + offset, 2 * sectorsize, em->start, em->len); goto out; } if (em->flags != compressed_only) { @@ -725,7 +745,7 @@ static noinline int test_btrfs_get_extent(void) free_extent_map(em); /* A hole between regular extents but no hole extent */ - em = btrfs_get_extent(inode, NULL, 0, offset + 6, 4096, 0); + em = btrfs_get_extent(inode, NULL, 0, offset + 6, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -734,9 +754,10 @@ static noinline int test_btrfs_get_extent(void) test_msg("Expected a real extent, got %llu\n", em->block_start); goto out; } - if (em->start != offset || em->len != 4096) { - test_msg("Unexpected extent wanted start %llu len 4096, got " - "start %llu len %llu\n", offset, em->start, em->len); + if (em->start != offset || em->len != sectorsize) { + test_msg("Unexpected extent wanted start %llu len %u, " + "got start %llu len %llu\n", + offset, sectorsize, em->start, em->len); goto out; } if (em->flags != 0) { @@ -765,9 +786,10 @@ static noinline int test_btrfs_get_extent(void) * length of the actual hole, if this changes we'll have to change this * test. */ - if (em->start != offset || em->len != 12288) { - test_msg("Unexpected extent wanted start %llu len 12288, got " - "start %llu len %llu\n", offset, em->start, em->len); + if (em->start != offset || em->len != 3 * sectorsize) { + test_msg("Unexpected extent wanted start %llu len %u, " + "got start %llu len %llu\n", + offset, 3 * sectorsize, em->start, em->len); goto out; } if (em->flags != vacancy_only) { @@ -783,7 +805,7 @@ static noinline int test_btrfs_get_extent(void) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -792,9 +814,10 @@ static noinline int test_btrfs_get_extent(void) test_msg("Expected a real extent, got %llu\n", em->block_start); goto out; } - if (em->start != offset || em->len != 4096) { - test_msg("Unexpected extent wanted start %llu len 4096, got " - "start %llu len %llu\n", offset, em->start, em->len); + if (em->start != offset || em->len != sectorsize) { + test_msg("Unexpected extent wanted start %llu len %u," + "got start %llu len %llu\n", + offset, sectorsize, em->start, em->len); goto out; } if (em->flags != 0) { @@ -815,7 +838,7 @@ out: return ret; } -static int test_hole_first(void) +static int test_hole_first(u32 sectorsize, u32 nodesize) { struct inode *inode = NULL; struct btrfs_root *root = NULL; @@ -832,7 +855,7 @@ static int test_hole_first(void) BTRFS_I(inode)->location.objectid = BTRFS_FIRST_FREE_OBJECTID; BTRFS_I(inode)->location.offset = 0; - root = btrfs_alloc_dummy_root(); + root = btrfs_alloc_dummy_root(sectorsize, nodesize); if (IS_ERR(root)) { test_msg("Couldn't allocate root\n"); goto out; @@ -844,7 +867,7 @@ static int test_hole_first(void) goto out; } - root->node = 
alloc_dummy_extent_buffer(NULL, 4096); + root->node = alloc_dummy_extent_buffer(NULL, nodesize, nodesize); if (!root->node) { test_msg("Couldn't allocate dummy buffer\n"); goto out; @@ -861,9 +884,9 @@ static int test_hole_first(void) * btrfs_get_extent. */ insert_inode_item_key(root); - insert_extent(root, 4096, 4096, 4096, 0, 4096, 4096, - BTRFS_FILE_EXTENT_REG, 0, 1); - em = btrfs_get_extent(inode, NULL, 0, 0, 8192, 0); + insert_extent(root, sectorsize, sectorsize, sectorsize, 0, sectorsize, + sectorsize, BTRFS_FILE_EXTENT_REG, 0, 1); + em = btrfs_get_extent(inode, NULL, 0, 0, 2 * sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -872,9 +895,10 @@ static int test_hole_first(void) test_msg("Expected a hole, got %llu\n", em->block_start); goto out; } - if (em->start != 0 || em->len != 4096) { - test_msg("Unexpected extent wanted start 0 len 4096, got start " - "%llu len %llu\n", em->start, em->len); + if (em->start != 0 || em->len != sectorsize) { + test_msg("Unexpected extent wanted start 0 len %u, " + "got start %llu len %llu\n", + sectorsize, em->start, em->len); goto out; } if (em->flags != vacancy_only) { @@ -884,18 +908,19 @@ static int test_hole_first(void) } free_extent_map(em); - em = btrfs_get_extent(inode, NULL, 0, 4096, 8192, 0); + em = btrfs_get_extent(inode, NULL, 0, sectorsize, 2 * sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; } - if (em->block_start != 4096) { + if (em->block_start != sectorsize) { test_msg("Expected a real extent, got %llu\n", em->block_start); goto out; } - if (em->start != 4096 || em->len != 4096) { - test_msg("Unexpected extent wanted start 4096 len 4096, got " - "start %llu len %llu\n", em->start, em->len); + if (em->start != sectorsize || em->len != sectorsize) { + test_msg("Unexpected extent wanted start %u len %u, " + "got start %llu len %llu\n", + sectorsize, sectorsize, em->start, em->len); goto out; } if (em->flags != 0) { @@ -912,7 +937,7 @@ out: return ret; } -static int test_extent_accounting(void) +static int test_extent_accounting(u32 sectorsize, u32 nodesize) { struct inode *inode = NULL; struct btrfs_root *root = NULL; @@ -924,7 +949,7 @@ static int test_extent_accounting(void) return ret; } - root = btrfs_alloc_dummy_root(); + root = btrfs_alloc_dummy_root(sectorsize, nodesize); if (IS_ERR(root)) { test_msg("Couldn't allocate root\n"); goto out; @@ -954,10 +979,11 @@ static int test_extent_accounting(void) goto out; } - /* [BTRFS_MAX_EXTENT_SIZE][4k] */ + /* [BTRFS_MAX_EXTENT_SIZE][sectorsize] */ BTRFS_I(inode)->outstanding_extents++; ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE, - BTRFS_MAX_EXTENT_SIZE + 4095, NULL); + BTRFS_MAX_EXTENT_SIZE + sectorsize - 1, + NULL); if (ret) { test_msg("btrfs_set_extent_delalloc returned %d\n", ret); goto out; @@ -969,10 +995,10 @@ static int test_extent_accounting(void) goto out; } - /* [BTRFS_MAX_EXTENT_SIZE/2][4K HOLE][the rest] */ + /* [BTRFS_MAX_EXTENT_SIZE/2][sectorsize HOLE][the rest] */ ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, BTRFS_MAX_EXTENT_SIZE >> 1, - (BTRFS_MAX_EXTENT_SIZE >> 1) + 4095, + (BTRFS_MAX_EXTENT_SIZE >> 1) + sectorsize - 1, EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE | EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_KERNEL); @@ -987,10 +1013,11 @@ static int test_extent_accounting(void) goto out; } - /* [BTRFS_MAX_EXTENT_SIZE][4K] */ + /* [BTRFS_MAX_EXTENT_SIZE][sectorsize] */ BTRFS_I(inode)->outstanding_extents++; ret = btrfs_set_extent_delalloc(inode, 
BTRFS_MAX_EXTENT_SIZE >> 1, - (BTRFS_MAX_EXTENT_SIZE >> 1) + 4095, + (BTRFS_MAX_EXTENT_SIZE >> 1) + + sectorsize - 1, NULL); if (ret) { test_msg("btrfs_set_extent_delalloc returned %d\n", ret); @@ -1004,16 +1031,17 @@ static int test_extent_accounting(void) } /* - * [BTRFS_MAX_EXTENT_SIZE+4K][4K HOLE][BTRFS_MAX_EXTENT_SIZE+4K] + * [BTRFS_MAX_EXTENT_SIZE+sectorsize][sectorsize HOLE][BTRFS_MAX_EXTENT_SIZE+sectorsize] * * I'm artificially adding 2 to outstanding_extents because in the * buffered IO case we'd add things up as we go, but I don't feel like * doing that here, this isn't the interesting case we want to test. */ BTRFS_I(inode)->outstanding_extents += 2; - ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE + 8192, - (BTRFS_MAX_EXTENT_SIZE << 1) + 12287, - NULL); + ret = btrfs_set_extent_delalloc(inode, + BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize, + (BTRFS_MAX_EXTENT_SIZE << 1) + 3 * sectorsize - 1, + NULL); if (ret) { test_msg("btrfs_set_extent_delalloc returned %d\n", ret); goto out; @@ -1025,10 +1053,13 @@ static int test_extent_accounting(void) goto out; } - /* [BTRFS_MAX_EXTENT_SIZE+4k][4k][BTRFS_MAX_EXTENT_SIZE+4k] */ + /* + * [BTRFS_MAX_EXTENT_SIZE+sectorsize][sectorsize][BTRFS_MAX_EXTENT_SIZE+sectorsize] + */ BTRFS_I(inode)->outstanding_extents++; - ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE+4096, - BTRFS_MAX_EXTENT_SIZE+8191, NULL); + ret = btrfs_set_extent_delalloc(inode, + BTRFS_MAX_EXTENT_SIZE + sectorsize, + BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, NULL); if (ret) { test_msg("btrfs_set_extent_delalloc returned %d\n", ret); goto out; @@ -1042,8 +1073,8 @@ static int test_extent_accounting(void) /* [BTRFS_MAX_EXTENT_SIZE+4k][4K HOLE][BTRFS_MAX_EXTENT_SIZE+4k] */ ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, - BTRFS_MAX_EXTENT_SIZE+4096, - BTRFS_MAX_EXTENT_SIZE+8191, + BTRFS_MAX_EXTENT_SIZE + sectorsize, + BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0, NULL, GFP_KERNEL); @@ -1063,8 +1094,9 @@ static int test_extent_accounting(void) * might fail and I'd rather satisfy my paranoia at this point. */ BTRFS_I(inode)->outstanding_extents++; - ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE+4096, - BTRFS_MAX_EXTENT_SIZE+8191, NULL); + ret = btrfs_set_extent_delalloc(inode, + BTRFS_MAX_EXTENT_SIZE + sectorsize, + BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, NULL); if (ret) { test_msg("btrfs_set_extent_delalloc returned %d\n", ret); goto out; @@ -1103,7 +1135,7 @@ out: return ret; } -int btrfs_test_inodes(void) +int btrfs_test_inodes(u32 sectorsize, u32 nodesize) { int ret; @@ -1112,13 +1144,13 @@ int btrfs_test_inodes(void) set_bit(EXTENT_FLAG_PREALLOC, &prealloc_only); test_msg("Running btrfs_get_extent tests\n"); - ret = test_btrfs_get_extent(); + ret = test_btrfs_get_extent(sectorsize, nodesize); if (ret) return ret; test_msg("Running hole first btrfs_get_extent test\n"); - ret = test_hole_first(); + ret = test_hole_first(sectorsize, nodesize); if (ret) return ret; test_msg("Running outstanding_extents tests\n"); - return test_extent_accounting(); + return test_extent_accounting(sectorsize, nodesize); } diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c index 8ea5d34bc5a2..57a12c0d680b 100644 --- a/fs/btrfs/tests/qgroup-tests.c +++ b/fs/btrfs/tests/qgroup-tests.c @@ -16,6 +16,7 @@ * Boston, MA 021110-1307, USA. 
*/ +#include <linux/types.h> #include "btrfs-tests.h" #include "../ctree.h" #include "../transaction.h" @@ -216,7 +217,8 @@ static int remove_extent_ref(struct btrfs_root *root, u64 bytenr, return ret; } -static int test_no_shared_qgroup(struct btrfs_root *root) +static int test_no_shared_qgroup(struct btrfs_root *root, + u32 sectorsize, u32 nodesize) { struct btrfs_trans_handle trans; struct btrfs_fs_info *fs_info = root->fs_info; @@ -227,29 +229,30 @@ static int test_no_shared_qgroup(struct btrfs_root *root) btrfs_init_dummy_trans(&trans); test_msg("Qgroup basic add\n"); - ret = btrfs_create_qgroup(NULL, fs_info, 5); + ret = btrfs_create_qgroup(NULL, fs_info, BTRFS_FS_TREE_OBJECTID); if (ret) { test_msg("Couldn't create a qgroup %d\n", ret); return ret; } /* - * Since the test trans doesn't havee the complicated delayed refs, + * Since the test trans doesn't have the complicated delayed refs, * we can only call btrfs_qgroup_account_extent() directly to test * quota. */ - ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots); if (ret) { ulist_free(old_roots); test_msg("Couldn't find old roots: %d\n", ret); return ret; } - ret = insert_normal_tree_ref(root, 4096, 4096, 0, 5); + ret = insert_normal_tree_ref(root, nodesize, nodesize, 0, + BTRFS_FS_TREE_OBJECTID); if (ret) return ret; - ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots); if (ret) { ulist_free(old_roots); ulist_free(new_roots); @@ -257,32 +260,33 @@ static int test_no_shared_qgroup(struct btrfs_root *root) return ret; } - ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096, - old_roots, new_roots); + ret = btrfs_qgroup_account_extent(&trans, fs_info, nodesize, + nodesize, old_roots, new_roots); if (ret) { test_msg("Couldn't account space for a qgroup %d\n", ret); return ret; } - if (btrfs_verify_qgroup_counts(fs_info, 5, 4096, 4096)) { + if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FS_TREE_OBJECTID, + nodesize, nodesize)) { test_msg("Qgroup counts didn't match expected values\n"); return -EINVAL; } old_roots = NULL; new_roots = NULL; - ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots); if (ret) { ulist_free(old_roots); test_msg("Couldn't find old roots: %d\n", ret); return ret; } - ret = remove_extent_item(root, 4096, 4096); + ret = remove_extent_item(root, nodesize, nodesize); if (ret) return -EINVAL; - ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots); if (ret) { ulist_free(old_roots); ulist_free(new_roots); @@ -290,14 +294,14 @@ static int test_no_shared_qgroup(struct btrfs_root *root) return ret; } - ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096, - old_roots, new_roots); + ret = btrfs_qgroup_account_extent(&trans, fs_info, nodesize, + nodesize, old_roots, new_roots); if (ret) { test_msg("Couldn't account space for a qgroup %d\n", ret); return -EINVAL; } - if (btrfs_verify_qgroup_counts(fs_info, 5, 0, 0)) { + if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FS_TREE_OBJECTID, 0, 0)) { test_msg("Qgroup counts didn't match expected values\n"); return -EINVAL; } @@ -310,7 +314,8 @@ static int test_no_shared_qgroup(struct btrfs_root *root) * right, also remove one of the roots and make sure the exclusive count is * adjusted properly. 
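test_multiple_refs() below encodes the core qgroup invariant: once a second root takes a reference on the same nodesize tree block, both qgroups report it as referenced but neither as exclusive, and dropping the extra ref flips the remaining owner back to fully exclusive. A toy model of the counters the btrfs_verify_qgroup_counts() calls expect (userspace sketch, hypothetical struct):

#include <stdio.h>

struct qgroup { unsigned long long rfer, excl; };

int main(void)
{
        unsigned int nodesize = 16384;
        struct qgroup fs_tree = { nodesize, nodesize }; /* sole owner */
        struct qgroup first_free = { 0, 0 };

        /* add_tree_ref(): the block becomes shared */
        first_free.rfer = nodesize;
        fs_tree.excl = 0;
        printf("shared: fs_tree %llu/%llu, first_free %llu/%llu\n",
               fs_tree.rfer, fs_tree.excl, first_free.rfer, first_free.excl);

        /* remove_extent_ref() from first_free: fs_tree exclusive again */
        first_free.rfer = 0;
        fs_tree.excl = nodesize;
        printf("after drop: fs_tree %llu/%llu, first_free %llu/%llu\n",
               fs_tree.rfer, fs_tree.excl, first_free.rfer, first_free.excl);
        return 0;
}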
*/ -static int test_multiple_refs(struct btrfs_root *root) +static int test_multiple_refs(struct btrfs_root *root, + u32 sectorsize, u32 nodesize) { struct btrfs_trans_handle trans; struct btrfs_fs_info *fs_info = root->fs_info; @@ -322,25 +327,29 @@ static int test_multiple_refs(struct btrfs_root *root) test_msg("Qgroup multiple refs test\n"); - /* We have 5 created already from the previous test */ - ret = btrfs_create_qgroup(NULL, fs_info, 256); + /* + * We have BTRFS_FS_TREE_OBJECTID created already from the + * previous test. + */ + ret = btrfs_create_qgroup(NULL, fs_info, BTRFS_FIRST_FREE_OBJECTID); if (ret) { test_msg("Couldn't create a qgroup %d\n", ret); return ret; } - ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots); if (ret) { ulist_free(old_roots); test_msg("Couldn't find old roots: %d\n", ret); return ret; } - ret = insert_normal_tree_ref(root, 4096, 4096, 0, 5); + ret = insert_normal_tree_ref(root, nodesize, nodesize, 0, + BTRFS_FS_TREE_OBJECTID); if (ret) return ret; - ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots); if (ret) { ulist_free(old_roots); ulist_free(new_roots); @@ -348,30 +357,32 @@ static int test_multiple_refs(struct btrfs_root *root) return ret; } - ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096, - old_roots, new_roots); + ret = btrfs_qgroup_account_extent(&trans, fs_info, nodesize, + nodesize, old_roots, new_roots); if (ret) { test_msg("Couldn't account space for a qgroup %d\n", ret); return ret; } - if (btrfs_verify_qgroup_counts(fs_info, 5, 4096, 4096)) { + if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FS_TREE_OBJECTID, + nodesize, nodesize)) { test_msg("Qgroup counts didn't match expected values\n"); return -EINVAL; } - ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots); if (ret) { ulist_free(old_roots); test_msg("Couldn't find old roots: %d\n", ret); return ret; } - ret = add_tree_ref(root, 4096, 4096, 0, 256); + ret = add_tree_ref(root, nodesize, nodesize, 0, + BTRFS_FIRST_FREE_OBJECTID); if (ret) return ret; - ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots); if (ret) { ulist_free(old_roots); ulist_free(new_roots); @@ -379,35 +390,38 @@ static int test_multiple_refs(struct btrfs_root *root) return ret; } - ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096, - old_roots, new_roots); + ret = btrfs_qgroup_account_extent(&trans, fs_info, nodesize, + nodesize, old_roots, new_roots); if (ret) { test_msg("Couldn't account space for a qgroup %d\n", ret); return ret; } - if (btrfs_verify_qgroup_counts(fs_info, 5, 4096, 0)) { + if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FS_TREE_OBJECTID, + nodesize, 0)) { test_msg("Qgroup counts didn't match expected values\n"); return -EINVAL; } - if (btrfs_verify_qgroup_counts(fs_info, 256, 4096, 0)) { + if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FIRST_FREE_OBJECTID, + nodesize, 0)) { test_msg("Qgroup counts didn't match expected values\n"); return -EINVAL; } - ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots); if (ret) { ulist_free(old_roots); test_msg("Couldn't find old roots: %d\n", ret); return ret; } - ret = remove_extent_ref(root, 4096, 4096, 0, 256); + ret = 
remove_extent_ref(root, nodesize, nodesize, 0, + BTRFS_FIRST_FREE_OBJECTID); if (ret) return ret; - ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots); if (ret) { ulist_free(old_roots); ulist_free(new_roots); @@ -415,19 +429,21 @@ static int test_multiple_refs(struct btrfs_root *root) return ret; } - ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096, - old_roots, new_roots); + ret = btrfs_qgroup_account_extent(&trans, fs_info, nodesize, + nodesize, old_roots, new_roots); if (ret) { test_msg("Couldn't account space for a qgroup %d\n", ret); return ret; } - if (btrfs_verify_qgroup_counts(fs_info, 256, 0, 0)) { + if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FIRST_FREE_OBJECTID, + 0, 0)) { test_msg("Qgroup counts didn't match expected values\n"); return -EINVAL; } - if (btrfs_verify_qgroup_counts(fs_info, 5, 4096, 4096)) { + if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FS_TREE_OBJECTID, + nodesize, nodesize)) { test_msg("Qgroup counts didn't match expected values\n"); return -EINVAL; } @@ -435,13 +451,13 @@ static int test_multiple_refs(struct btrfs_root *root) return 0; } -int btrfs_test_qgroups(void) +int btrfs_test_qgroups(u32 sectorsize, u32 nodesize) { struct btrfs_root *root; struct btrfs_root *tmp_root; int ret = 0; - root = btrfs_alloc_dummy_root(); + root = btrfs_alloc_dummy_root(sectorsize, nodesize); if (IS_ERR(root)) { test_msg("Couldn't allocate root\n"); return PTR_ERR(root); @@ -468,7 +484,8 @@ int btrfs_test_qgroups(void) * Can't use bytenr 0, some things freak out * *cough*backref walking code*cough* */ - root->node = alloc_test_extent_buffer(root->fs_info, 4096); + root->node = alloc_test_extent_buffer(root->fs_info, nodesize, + nodesize); if (!root->node) { test_msg("Couldn't allocate dummy buffer\n"); ret = -ENOMEM; @@ -476,16 +493,16 @@ int btrfs_test_qgroups(void) } btrfs_set_header_level(root->node, 0); btrfs_set_header_nritems(root->node, 0); - root->alloc_bytenr += 8192; + root->alloc_bytenr += 2 * nodesize; - tmp_root = btrfs_alloc_dummy_root(); + tmp_root = btrfs_alloc_dummy_root(sectorsize, nodesize); if (IS_ERR(tmp_root)) { test_msg("Couldn't allocate a fs root\n"); ret = PTR_ERR(tmp_root); goto out; } - tmp_root->root_key.objectid = 5; + tmp_root->root_key.objectid = BTRFS_FS_TREE_OBJECTID; root->fs_info->fs_root = tmp_root; ret = btrfs_insert_fs_root(root->fs_info, tmp_root); if (ret) { @@ -493,14 +510,14 @@ int btrfs_test_qgroups(void) goto out; } - tmp_root = btrfs_alloc_dummy_root(); + tmp_root = btrfs_alloc_dummy_root(sectorsize, nodesize); if (IS_ERR(tmp_root)) { test_msg("Couldn't allocate a fs root\n"); ret = PTR_ERR(tmp_root); goto out; } - tmp_root->root_key.objectid = 256; + tmp_root->root_key.objectid = BTRFS_FIRST_FREE_OBJECTID; ret = btrfs_insert_fs_root(root->fs_info, tmp_root); if (ret) { test_msg("Couldn't insert fs root %d\n", ret); @@ -508,10 +525,10 @@ int btrfs_test_qgroups(void) } test_msg("Running qgroup tests\n"); - ret = test_no_shared_qgroup(root); + ret = test_no_shared_qgroup(root, sectorsize, nodesize); if (ret) goto out; - ret = test_multiple_refs(root); + ret = test_multiple_refs(root, sectorsize, nodesize); out: btrfs_free_dummy_root(root); return ret; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 43885e51b882..948aa186b353 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -311,10 +311,11 @@ loop: * when the transaction commits */ static int record_root_in_trans(struct btrfs_trans_handle *trans, 
- struct btrfs_root *root) + struct btrfs_root *root, + int force) { - if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) && - root->last_trans < trans->transid) { + if ((test_bit(BTRFS_ROOT_REF_COWS, &root->state) && + root->last_trans < trans->transid) || force) { WARN_ON(root == root->fs_info->extent_root); WARN_ON(root->commit_root != root->node); @@ -331,7 +332,7 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans, smp_wmb(); spin_lock(&root->fs_info->fs_roots_radix_lock); - if (root->last_trans == trans->transid) { + if (root->last_trans == trans->transid && !force) { spin_unlock(&root->fs_info->fs_roots_radix_lock); return 0; } @@ -402,7 +403,7 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, return 0; mutex_lock(&root->fs_info->reloc_mutex); - record_root_in_trans(trans, root); + record_root_in_trans(trans, root, 0); mutex_unlock(&root->fs_info->reloc_mutex); return 0; @@ -817,6 +818,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, { struct btrfs_transaction *cur_trans = trans->transaction; struct btrfs_fs_info *info = root->fs_info; + u64 transid = trans->transid; unsigned long cur = trans->delayed_ref_updates; int lock = (trans->type != TRANS_JOIN_NOLOCK); int err = 0; @@ -904,7 +906,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, kmem_cache_free(btrfs_trans_handle_cachep, trans); if (must_run_delayed_refs) { - btrfs_async_run_delayed_refs(root, cur, + btrfs_async_run_delayed_refs(root, cur, transid, must_run_delayed_refs == 1); } return err; @@ -943,7 +945,7 @@ int btrfs_write_marked_extents(struct btrfs_root *root, err = convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, - mark, &cached_state, GFP_NOFS); + mark, &cached_state); /* * convert_extent_bit can return -ENOMEM, which is most of the * time a temporary error. So when it happens, ignore the error @@ -1311,6 +1313,92 @@ int btrfs_defrag_root(struct btrfs_root *root) } /* + * Do all special snapshot related qgroup dirty hack. + * + * Will do all needed qgroup inherit and dirty hack like switch commit + * roots inside one transaction and write all btree into disk, to make + * qgroup works. + */ +static int qgroup_account_snapshot(struct btrfs_trans_handle *trans, + struct btrfs_root *src, + struct btrfs_root *parent, + struct btrfs_qgroup_inherit *inherit, + u64 dst_objectid) +{ + struct btrfs_fs_info *fs_info = src->fs_info; + int ret; + + /* + * Save some performance in the case that qgroups are not + * enabled. If this check races with the ioctl, rescan will + * kick in anyway. 
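The quota_enabled check that follows takes qgroup_ioctl_lock only long enough to read the flag, then drops it before any heavy work, accepting that a racing enable is harmless because a rescan repairs the accounting afterwards. The same snapshot-then-drop pattern in a self-contained userspace form (pthread analogue, names hypothetical):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t feature_lock = PTHREAD_MUTEX_INITIALIZER;
static bool feature_enabled;    /* stands in for fs_info->quota_enabled */

static int maybe_account(void)
{
        bool enabled;

        /* lock only to read a consistent value, then drop it: a racing
         * enable is tolerated because a later rescan fixes the numbers */
        pthread_mutex_lock(&feature_lock);
        enabled = feature_enabled;
        pthread_mutex_unlock(&feature_lock);

        if (!enabled)
                return 0;       /* fast path, skip the expensive work */

        /* ... expensive accounting would run here ... */
        return 0;
}

int main(void)
{
        printf("ret=%d\n", maybe_account());
        return 0;
}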
+ */ + mutex_lock(&fs_info->qgroup_ioctl_lock); + if (!fs_info->quota_enabled) { + mutex_unlock(&fs_info->qgroup_ioctl_lock); + return 0; + } + mutex_unlock(&fs_info->qgroup_ioctl_lock); + + /* + * We are going to commit transaction, see btrfs_commit_transaction() + * comment for reason locking tree_log_mutex + */ + mutex_lock(&fs_info->tree_log_mutex); + + ret = commit_fs_roots(trans, src); + if (ret) + goto out; + ret = btrfs_qgroup_prepare_account_extents(trans, fs_info); + if (ret < 0) + goto out; + ret = btrfs_qgroup_account_extents(trans, fs_info); + if (ret < 0) + goto out; + + /* Now qgroup are all updated, we can inherit it to new qgroups */ + ret = btrfs_qgroup_inherit(trans, fs_info, + src->root_key.objectid, dst_objectid, + inherit); + if (ret < 0) + goto out; + + /* + * Now we do a simplified commit transaction, which will: + * 1) commit all subvolume and extent tree + * To ensure all subvolume and extent tree have a valid + * commit_root to accounting later insert_dir_item() + * 2) write all btree blocks onto disk + * This is to make sure later btree modification will be cowed + * Or commit_root can be populated and cause wrong qgroup numbers + * In this simplified commit, we don't really care about other trees + * like chunk and root tree, as they won't affect qgroup. + * And we don't write super to avoid half committed status. + */ + ret = commit_cowonly_roots(trans, src); + if (ret) + goto out; + switch_commit_roots(trans->transaction, fs_info); + ret = btrfs_write_and_wait_transaction(trans, src); + if (ret) + btrfs_handle_fs_error(fs_info, ret, + "Error while writing out transaction for qgroup"); + +out: + mutex_unlock(&fs_info->tree_log_mutex); + + /* + * Force parent root to be updated, as we recorded it before so its + * last_trans == cur_transid. + * Or it won't be committed again onto disk after later + * insert_dir_item() + */ + if (!ret) + record_root_in_trans(trans, parent, 1); + return ret; +} + +/* * new snapshots need to be created at a very specific time in the * transaction commit. This does the actual creation. * @@ -1383,7 +1471,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, dentry = pending->dentry; parent_inode = pending->dir; parent_root = BTRFS_I(parent_inode)->root; - record_root_in_trans(trans, parent_root); + record_root_in_trans(trans, parent_root, 0); cur_time = current_fs_time(parent_inode->i_sb); @@ -1420,7 +1508,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, goto fail; } - record_root_in_trans(trans, root); + record_root_in_trans(trans, root, 0); btrfs_set_root_last_snapshot(&root->root_item, trans->transid); memcpy(new_root_item, &root->root_item, sizeof(*new_root_item)); btrfs_check_and_init_root_item(new_root_item); @@ -1516,6 +1604,17 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, goto fail; } + /* + * Do special qgroup accounting for snapshot, as we do some qgroup + * snapshot hack to do fast snapshot. + * To co-operate with that hack, we do hack again. 
+ * Or snapshot will be greatly slowed down by a subtree qgroup rescan + */ + ret = qgroup_account_snapshot(trans, root, parent_root, + pending->inherit, objectid); + if (ret < 0) + goto fail; + ret = btrfs_insert_dir_item(trans, parent_root, dentry->d_name.name, dentry->d_name.len, parent_inode, &key, @@ -1559,23 +1658,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, goto fail; } - /* - * account qgroup counters before qgroup_inherit() - */ - ret = btrfs_qgroup_prepare_account_extents(trans, fs_info); - if (ret) - goto fail; - ret = btrfs_qgroup_account_extents(trans, fs_info); - if (ret) - goto fail; - ret = btrfs_qgroup_inherit(trans, fs_info, - root->root_key.objectid, - objectid, pending->inherit); - if (ret) { - btrfs_abort_transaction(trans, root, ret); - goto fail; - } - fail: pending->error = ret; dir_item_existed: @@ -1821,7 +1903,7 @@ static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info) static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info) { if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT)) - btrfs_wait_ordered_roots(fs_info, -1); + btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1); } static inline void @@ -2145,7 +2227,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, ret = btrfs_write_and_wait_transaction(trans, root); if (ret) { - btrfs_std_error(root->fs_info, ret, + btrfs_handle_fs_error(root->fs_info, ret, "Error while writing out transaction"); mutex_unlock(&root->fs_info->tree_log_mutex); goto scrub_continue; diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 72be51f7ca2f..c5abee4f01ad 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -110,7 +110,6 @@ struct btrfs_trans_handle { u64 chunk_bytes_reserved; unsigned long use_count; unsigned long blocks_reserved; - unsigned long blocks_used; unsigned long delayed_ref_updates; struct btrfs_transaction *transaction; struct btrfs_block_rsv *block_rsv; @@ -121,6 +120,7 @@ struct btrfs_trans_handle { bool can_flush_pending_bgs; bool reloc_reserved; bool sync; + bool dirty; unsigned int type; /* * this root is only needed to validate that the root passed to @@ -144,7 +144,7 @@ struct btrfs_pending_snapshot { /* block reservation for the operation */ struct btrfs_block_rsv block_rsv; u64 qgroup_reserved; - /* extra metadata reseration for relocation */ + /* extra metadata reservation for relocation */ int error; bool readonly; struct list_head list; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index e692eea87af6..c05f69a8ec42 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2330,7 +2330,7 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, break; /* for regular files, make sure corresponding - * orhpan item exist. extents past the new EOF + * orphan item exist. extents past the new EOF * will be truncated later by orphan cleanup. 
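A few hunks below, walk_down_log_tree() stops treating a failed btrfs_find_create_tree_block() as a bare NULL and instead propagates the errno encoded in the returned pointer. The kernel's ERR_PTR convention, reduced to a standalone sketch (userspace re-implementation for illustration):

#include <stdio.h>

#define MAX_ERRNO       4095

static inline void *ERR_PTR(long error) { return (void *)error; }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int IS_ERR(const void *ptr)
{
        /* error pointers live in the top 4095 values of the address space */
        return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

int main(void)
{
        void *next = ERR_PTR(-12);      /* as if allocation failed: -ENOMEM */

        if (IS_ERR(next))       /* a NULL check would mask the real errno */
                printf("propagating %ld\n", PTR_ERR(next));
        return 0;
}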
*/ if (S_ISREG(mode)) { @@ -2422,8 +2422,8 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, root_owner = btrfs_header_owner(parent); next = btrfs_find_create_tree_block(root, bytenr); - if (!next) - return -ENOMEM; + if (IS_ERR(next)) + return PTR_ERR(next); if (*level == 1) { ret = wc->process_func(root, next, wc, ptr_gen); @@ -3001,7 +3001,7 @@ static void free_log_tree(struct btrfs_trans_handle *trans, break; clear_extent_bits(&log->dirty_log_pages, start, end, - EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS); + EXTENT_DIRTY | EXTENT_NEW); } /* @@ -4141,6 +4141,7 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, INIT_LIST_HEAD(&extents); + down_write(&BTRFS_I(inode)->dio_sem); write_lock(&tree->lock); test_gen = root->fs_info->last_trans_committed; @@ -4169,13 +4170,20 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, } list_sort(NULL, &extents, extent_cmp); + btrfs_get_logged_extents(inode, logged_list, start, end); /* - * Collect any new ordered extents within the range. This is to - * prevent logging file extent items without waiting for the disk - * location they point to being written. We do this only to deal - * with races against concurrent lockless direct IO writes. + * Some ordered extents started by fsync might have completed + * before we could collect them into the list logged_list, which + * means they're gone, not in our logged_list nor in the inode's + * ordered tree. We want the application/user space to know an + * error happened while attempting to persist file data so that + * it can take proper action. If such error happened, we leave + * without writing to the log tree and the fsync must report the + * file data write error and not commit the current transaction. */ - btrfs_get_logged_extents(inode, logged_list, start, end); + ret = btrfs_inode_check_errors(inode); + if (ret) + ctx->io_err = ret; process: while (!list_empty(&extents)) { em = list_entry(extents.next, struct extent_map, list); @@ -4202,6 +4210,7 @@ process: } WARN_ON(!list_empty(&extents)); write_unlock(&tree->lock); + up_write(&BTRFS_I(inode)->dio_sem); btrfs_release_path(path); return ret; @@ -4623,23 +4632,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, mutex_lock(&BTRFS_I(inode)->log_mutex); /* - * Collect ordered extents only if we are logging data. This is to - * ensure a subsequent request to log this inode in LOG_INODE_ALL mode - * will process the ordered extents if they still exists at the time, - * because when we collect them we test and set for the flag - * BTRFS_ORDERED_LOGGED to prevent multiple log requests to process the - * same ordered extents. The consequence for the LOG_INODE_ALL log mode - * not processing the ordered extents is that we end up logging the - * corresponding file extent items, based on the extent maps in the - * inode's extent_map_tree's modified_list, without logging the - * respective checksums (since the may still be only attached to the - * ordered extents and have not been inserted in the csum tree by - * btrfs_finish_ordered_io() yet). - */ - if (inode_only == LOG_INODE_ALL) - btrfs_get_logged_extents(inode, &logged_list, start, end); - - /* * a brute force approach to making sure we get the most uptodate * copies of everything. 
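The relocated comment above explains the intent of this hunk: btrfs_log_changed_extents() now records a data write error in ctx->io_err instead of aborting the log, so the fsync path can report the failure to user space rather than commit a transaction that silently lost file data. A minimal userspace sketch of that defer-the-error pattern, assuming nothing of the btrfs API (struct log_ctx and both function names below are illustrative only):

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* illustrative stand-in for struct btrfs_log_ctx */
struct log_ctx {
	int io_err;		/* first data write error, 0 if none */
};

/* flush file data; record the error for the caller instead of aborting */
static int log_changed_extents(int fd, struct log_ctx *ctx)
{
	if (fsync(fd) < 0)
		ctx->io_err = -errno;
	return 0;		/* the logging step itself may still succeed */
}

int main(void)
{
	struct log_ctx ctx = { 0 };
	int fd = open("testfile", O_CREAT | O_WRONLY, 0644);

	if (fd < 0)
		return 1;
	(void)write(fd, "data", 4);
	log_changed_extents(fd, &ctx);
	if (ctx.io_err)		/* the caller must fail, not commit */
		fprintf(stderr, "fsync: %s\n", strerror(-ctx.io_err));
	close(fd);
	return ctx.io_err ? 1 : 0;
}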
*/ @@ -4846,21 +4838,6 @@ log_extents: goto out_unlock; } if (fast_search) { - /* - * Some ordered extents started by fsync might have completed - * before we collected the ordered extents in logged_list, which - * means they're gone, not in our logged_list nor in the inode's - * ordered tree. We want the application/user space to know an - * error happened while attempting to persist file data so that - * it can take proper action. If such error happened, we leave - * without writing to the log tree and the fsync must report the - * file data write error and not commit the current transaction. - */ - err = btrfs_inode_check_errors(inode); - if (err) { - ctx->io_err = err; - goto out_unlock; - } ret = btrfs_log_changed_extents(trans, root, inode, dst_path, &logged_list, ctx, start, end); if (ret) { @@ -4937,7 +4914,7 @@ out_unlock: * the actual unlink operation, so if we do this check before a concurrent task * sets last_unlink_trans it means we've logged a consistent version/state of * all the inode items, otherwise we are not sure and must do a transaction - * commit (the concurrent task migth have only updated last_unlink_trans before + * commit (the concurrent task might have only updated last_unlink_trans before * we logged the inode or it might have also done the unlink). */ static bool btrfs_must_commit_transaction(struct btrfs_trans_handle *trans, @@ -4996,7 +4973,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, while (1) { /* * If we are logging a directory then we start with our inode, - * not our parents inode, so we need to skipp setting the + * not our parent's inode, so we need to skip setting the * logged_trans so that further down in the log code we don't * think this inode has already been logged. */ @@ -5158,7 +5135,7 @@ process_leaf: } ctx->log_new_dentries = false; - if (type == BTRFS_FT_DIR) + if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK) log_mode = LOG_INODE_ALL; btrfs_release_path(path); ret = btrfs_log_inode(trans, root, di_inode, @@ -5278,11 +5255,16 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, if (IS_ERR(dir_inode)) continue; + if (ctx) + ctx->log_new_dentries = false; ret = btrfs_log_inode(trans, root, dir_inode, LOG_INODE_ALL, 0, LLONG_MAX, ctx); if (!ret && btrfs_must_commit_transaction(trans, dir_inode)) ret = 1; + if (!ret && ctx && ctx->log_new_dentries) + ret = log_new_dir_dentries(trans, root, + dir_inode, ctx); iput(dir_inode); if (ret) goto out; @@ -5375,7 +5357,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, log_dentries = true; /* - * On unlink we must make sure all our current and old parent directores + * On unlink we must make sure all our current and old parent directory * inodes are fully logged. This is to prevent leaving dangling * directory index entries in directories that were our parents but are * not anymore. 
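The btrfs_log_all_parents() hunk above clears ctx->log_new_dentries before logging each parent directory and, if the callee raised the flag, follows up with log_new_dir_dentries() on that directory. A standalone sketch of that reset-then-consume out-flag pattern (every name below is illustrative, not the kernel's):

#include <stdbool.h>
#include <stdio.h>

/* illustrative context carrying an "out" flag the callee may set */
struct ctx {
	bool log_new_dentries;
};

static int log_inode(int ino, struct ctx *c)
{
	if (ino == 42)			/* pretend this dir gained dentries */
		c->log_new_dentries = true;
	return 0;
}

static int log_new_dir_dentries(int ino)
{
	printf("logging new dentries of dir %d\n", ino);
	return 0;
}

int main(void)
{
	struct ctx c;
	int dirs[] = { 7, 42 };

	for (int i = 0; i < 2; i++) {
		c.log_new_dentries = false;	/* reset before each call */
		int ret = log_inode(dirs[i], &c);
		if (!ret && c.log_new_dentries)	/* consume the flag */
			ret = log_new_dir_dentries(dirs[i]);
		if (ret)
			return ret;
	}
	return 0;
}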
Not doing this results in old parent directory being @@ -5519,7 +5501,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) ret = walk_log_tree(trans, log_root_tree, &wc); if (ret) { - btrfs_std_error(fs_info, ret, "Failed to pin buffers while " + btrfs_handle_fs_error(fs_info, ret, "Failed to pin buffers while " "recovering log root tree."); goto error; } @@ -5533,7 +5515,7 @@ again: ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); if (ret < 0) { - btrfs_std_error(fs_info, ret, + btrfs_handle_fs_error(fs_info, ret, "Couldn't find tree log root."); goto error; } @@ -5551,7 +5533,7 @@ again: log = btrfs_read_fs_root(log_root_tree, &found_key); if (IS_ERR(log)) { ret = PTR_ERR(log); - btrfs_std_error(fs_info, ret, + btrfs_handle_fs_error(fs_info, ret, "Couldn't read tree log root."); goto error; } @@ -5566,7 +5548,7 @@ again: free_extent_buffer(log->node); free_extent_buffer(log->commit_root); kfree(log); - btrfs_std_error(fs_info, ret, "Couldn't read target root " + btrfs_handle_fs_error(fs_info, ret, "Couldn't read target root " "for tree log recovery."); goto error; } @@ -5652,11 +5634,9 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, * into the file. When the file is logged we check it and * don't log the parents if the file is fully on disk. */ - if (S_ISREG(inode->i_mode)) { - mutex_lock(&BTRFS_I(inode)->log_mutex); - BTRFS_I(inode)->last_unlink_trans = trans->transid; - mutex_unlock(&BTRFS_I(inode)->log_mutex); - } + mutex_lock(&BTRFS_I(inode)->log_mutex); + BTRFS_I(inode)->last_unlink_trans = trans->transid; + mutex_unlock(&BTRFS_I(inode)->log_mutex); /* * if this directory was already logged any new diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c index 91feb2bdefee..b1434bb57e36 100644 --- a/fs/btrfs/ulist.c +++ b/fs/btrfs/ulist.c @@ -28,7 +28,7 @@ * } * ulist_free(ulist); * - * This assumes the graph nodes are adressable by u64. This stems from the + * This assumes the graph nodes are addressable by u64. This stems from the * usage for tree enumeration in btrfs, where the logical addresses are * 64 bit. * diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index bd0f45fb38c4..0fb4a959012e 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -20,13 +20,13 @@ #include <linux/slab.h> #include <linux/buffer_head.h> #include <linux/blkdev.h> -#include <linux/random.h> #include <linux/iocontext.h> #include <linux/capability.h> #include <linux/ratelimit.h> #include <linux/kthread.h> #include <linux/raid/pq.h> #include <linux/semaphore.h> +#include <linux/uuid.h> #include <asm/div64.h> #include "ctree.h" #include "extent_map.h" @@ -118,6 +118,21 @@ const u64 btrfs_raid_group[BTRFS_NR_RAID_TYPES] = { [BTRFS_RAID_RAID6] = BTRFS_BLOCK_GROUP_RAID6, }; +/* + * Table to convert BTRFS_RAID_* to the error code if minimum number of devices + * condition is not met. Zero means there's no corresponding + * BTRFS_ERROR_DEV_*_NOT_MET value. 
+ */ +const int btrfs_raid_mindev_error[BTRFS_NR_RAID_TYPES] = { + [BTRFS_RAID_RAID10] = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET, + [BTRFS_RAID_RAID1] = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET, + [BTRFS_RAID_DUP] = 0, + [BTRFS_RAID_RAID0] = 0, + [BTRFS_RAID_SINGLE] = 0, + [BTRFS_RAID_RAID5] = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET, + [BTRFS_RAID_RAID6] = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET, +}; + static int init_first_rw_device(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_device *device); @@ -447,7 +462,7 @@ loop_lock: sync_pending = 0; } - btrfsic_submit_bio(cur->bi_rw, cur); + btrfsic_submit_bio(cur); num_run++; batch_run++; @@ -699,7 +714,8 @@ static noinline int device_list_add(const char *path, * if there is new btrfs on an already registered device, * then remove the stale device entry. */ - btrfs_free_stale_device(device); + if (ret > 0) + btrfs_free_stale_device(device); *fs_devices_ret = fs_devices; @@ -988,6 +1004,56 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, return ret; } +void btrfs_release_disk_super(struct page *page) +{ + kunmap(page); + put_page(page); +} + +int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr, + struct page **page, struct btrfs_super_block **disk_super) +{ + void *p; + pgoff_t index; + + /* make sure our super fits in the device */ + if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode)) + return 1; + + /* make sure our super fits in the page */ + if (sizeof(**disk_super) > PAGE_SIZE) + return 1; + + /* make sure our super doesn't straddle pages on disk */ + index = bytenr >> PAGE_SHIFT; + if ((bytenr + sizeof(**disk_super) - 1) >> PAGE_SHIFT != index) + return 1; + + /* pull in the page with our super */ + *page = read_cache_page_gfp(bdev->bd_inode->i_mapping, + index, GFP_KERNEL); + + if (IS_ERR_OR_NULL(*page)) + return 1; + + p = kmap(*page); + + /* align our pointer to the offset of the super block */ + *disk_super = p + (bytenr & ~PAGE_MASK); + + if (btrfs_super_bytenr(*disk_super) != bytenr || + btrfs_super_magic(*disk_super) != BTRFS_MAGIC) { + btrfs_release_disk_super(*page); + return 1; + } + + if ((*disk_super)->label[0] && + (*disk_super)->label[BTRFS_LABEL_SIZE - 1]) + (*disk_super)->label[BTRFS_LABEL_SIZE - 1] = '\0'; + + return 0; +} + /* * Look for a btrfs signature on a device. This may be called out of the mount path * and we are not allowed to call set_blocksize during the scan. 
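btrfs_read_disk_super() above concentrates the sanity checks that btrfs_scan_one_device() used to do inline: the super block must fit in the device, fit in a page, not straddle a page boundary, and carry a matching bytenr and magic. A rough userspace analogue built on pread(2); the 0x30/0x40 offsets and the magic value follow the btrfs on-disk format, while error handling and endianness conversion are deliberately simplified:

#include <fcntl.h>
#include <stdint.h>
#include <string.h>
#include <sys/stat.h>
#include <unistd.h>

#define PAGE_SZ		4096ULL
#define BTRFS_MAGIC	0x4D5F53665248425FULL	/* "_BHRfS_M" */

/* probe one super block copy at byte offset bytenr; returns 0 on a match */
static int read_disk_super(int fd, uint64_t bytenr)
{
	unsigned char sb[PAGE_SZ];
	uint64_t stored_bytenr, magic;
	struct stat st;

	if (fstat(fd, &st) < 0)
		return 1;
	/* the super must fit in the device... */
	if (bytenr + PAGE_SZ >= (uint64_t)st.st_size)
		return 1;
	/* ...and must not straddle a page boundary on disk */
	if (bytenr / PAGE_SZ != (bytenr + sizeof(sb) - 1) / PAGE_SZ)
		return 1;
	if (pread(fd, sb, sizeof(sb), (off_t)bytenr) != (ssize_t)sizeof(sb))
		return 1;
	/* bytenr is stored at offset 0x30 of the super, the magic at 0x40 */
	memcpy(&stored_bytenr, sb + 0x30, 8);
	memcpy(&magic, sb + 0x40, 8);
	return (stored_bytenr == bytenr && magic == BTRFS_MAGIC) ? 0 : 1;
}

int main(int argc, char **argv)
{
	int fd;

	if (argc < 2 || (fd = open(argv[1], O_RDONLY)) < 0)
		return 1;
	return read_disk_super(fd, 64 * 1024);	/* primary copy at 64KiB */
}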
The superblock @@ -999,13 +1065,11 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, struct btrfs_super_block *disk_super; struct block_device *bdev; struct page *page; - void *p; int ret = -EINVAL; u64 devid; u64 transid; u64 total_devices; u64 bytenr; - pgoff_t index; /* * we would like to check all the supers, but that would make @@ -1018,41 +1082,14 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, mutex_lock(&uuid_mutex); bdev = blkdev_get_by_path(path, flags, holder); - if (IS_ERR(bdev)) { ret = PTR_ERR(bdev); goto error; } - /* make sure our super fits in the device */ - if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode)) - goto error_bdev_put; - - /* make sure our super fits in the page */ - if (sizeof(*disk_super) > PAGE_SIZE) + if (btrfs_read_disk_super(bdev, bytenr, &page, &disk_super)) goto error_bdev_put; - /* make sure our super doesn't straddle pages on disk */ - index = bytenr >> PAGE_SHIFT; - if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_SHIFT != index) - goto error_bdev_put; - - /* pull in the page with our super */ - page = read_cache_page_gfp(bdev->bd_inode->i_mapping, - index, GFP_NOFS); - - if (IS_ERR_OR_NULL(page)) - goto error_bdev_put; - - p = kmap(page); - - /* align our pointer to the offset of the super block */ - disk_super = p + (bytenr & ~PAGE_MASK); - - if (btrfs_super_bytenr(disk_super) != bytenr || - btrfs_super_magic(disk_super) != BTRFS_MAGIC) - goto error_unmap; - devid = btrfs_stack_device_id(&disk_super->dev_item); transid = btrfs_super_generation(disk_super); total_devices = btrfs_super_num_devices(disk_super); @@ -1060,8 +1097,6 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, ret = device_list_add(path, disk_super, devid, fs_devices_ret); if (ret > 0) { if (disk_super->label[0]) { - if (disk_super->label[BTRFS_LABEL_SIZE - 1]) - disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0'; printk(KERN_INFO "BTRFS: device label %s ", disk_super->label); } else { printk(KERN_INFO "BTRFS: device fsid %pU ", disk_super->fsid); @@ -1073,9 +1108,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, if (!ret && fs_devices_ret) (*fs_devices_ret)->total_devices = total_devices; -error_unmap: - kunmap(page); - put_page(page); + btrfs_release_disk_super(page); error_bdev_put: blkdev_put(bdev, flags); @@ -1454,7 +1487,7 @@ again: extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent); } else { - btrfs_std_error(root->fs_info, ret, "Slot search failed"); + btrfs_handle_fs_error(root->fs_info, ret, "Slot search failed"); goto out; } @@ -1462,7 +1495,7 @@ again: ret = btrfs_del_item(trans, root, path); if (ret) { - btrfs_std_error(root->fs_info, ret, + btrfs_handle_fs_error(root->fs_info, ret, "Failed to remove dev extent item"); } else { set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags); @@ -1688,32 +1721,92 @@ out: return ret; } -int btrfs_rm_device(struct btrfs_root *root, char *device_path) +/* + * Verify that @num_devices satisfies the RAID profile constraints in the whole + * filesystem. It's up to the caller to adjust that number regarding eg. device + * replace. 
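The btrfs_check_raid_min_devices() helper introduced just below samples the three avail_*_alloc_bits fields under fs_info->profiles_lock using the seqlock read protocol: read without blocking, then retry if a writer changed the sequence count meanwhile. A standalone sketch of that protocol with C11 atomics; the kernel's seqlock_t adds memory barriers around the data accesses and lockdep integration that this toy version glosses over:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static atomic_uint seq;		/* even: stable, odd: write in flight */
static uint64_t data_bits, sys_bits, meta_bits;	/* protected fields */

static void write_profiles(uint64_t d, uint64_t s, uint64_t m)
{
	atomic_fetch_add_explicit(&seq, 1, memory_order_release);  /* odd */
	data_bits = d; sys_bits = s; meta_bits = m;
	atomic_fetch_add_explicit(&seq, 1, memory_order_release);  /* even */
}

static uint64_t read_all_avail(void)
{
	unsigned s;
	uint64_t all;

	do {
		s = atomic_load_explicit(&seq, memory_order_acquire);
		all = data_bits | sys_bits | meta_bits;
	} while ((s & 1) ||	/* retry while a writer runs or ran */
		 atomic_load_explicit(&seq, memory_order_acquire) != s);
	return all;
}

int main(void)
{
	write_profiles(1, 2, 4);
	printf("all_avail = 0x%llx\n", (unsigned long long)read_all_avail());
	return 0;
}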
+ */ +static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info, + u64 num_devices) +{ + u64 all_avail; + unsigned seq; + int i; + + do { + seq = read_seqbegin(&fs_info->profiles_lock); + + all_avail = fs_info->avail_data_alloc_bits | + fs_info->avail_system_alloc_bits | + fs_info->avail_metadata_alloc_bits; + } while (read_seqretry(&fs_info->profiles_lock, seq)); + + for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { + if (!(all_avail & btrfs_raid_group[i])) + continue; + + if (num_devices < btrfs_raid_array[i].devs_min) { + int ret = btrfs_raid_mindev_error[i]; + + if (ret) + return ret; + } + } + + return 0; +} + +struct btrfs_device *btrfs_find_next_active_device(struct btrfs_fs_devices *fs_devs, + struct btrfs_device *device) { - struct btrfs_device *device; struct btrfs_device *next_device; - struct block_device *bdev; - struct buffer_head *bh = NULL; - struct btrfs_super_block *disk_super; + + list_for_each_entry(next_device, &fs_devs->devices, dev_list) { + if (next_device != device && + !next_device->missing && next_device->bdev) + return next_device; + } + + return NULL; +} + +/* + * Helper function to check if the given device is part of s_bdev / latest_bdev + * and replace it with the provided or the next active device. In the context + * where this function is called, there should always be another device (or + * this_dev) which is active. + */ +void btrfs_assign_next_active_device(struct btrfs_fs_info *fs_info, + struct btrfs_device *device, struct btrfs_device *this_dev) +{ + struct btrfs_device *next_device; + + if (this_dev) + next_device = this_dev; + else + next_device = btrfs_find_next_active_device(fs_info->fs_devices, + device); + ASSERT(next_device); + + if (fs_info->sb->s_bdev && + (fs_info->sb->s_bdev == device->bdev)) + fs_info->sb->s_bdev = next_device->bdev; + + if (fs_info->fs_devices->latest_bdev == device->bdev) + fs_info->fs_devices->latest_bdev = next_device->bdev; +} + +int btrfs_rm_device(struct btrfs_root *root, char *device_path, u64 devid) +{ + struct btrfs_device *device; struct btrfs_fs_devices *cur_devices; - u64 all_avail; - u64 devid; u64 num_devices; - u8 *dev_uuid; - unsigned seq; int ret = 0; bool clear_super = false; + char *dev_name = NULL; mutex_lock(&uuid_mutex); - do { - seq = read_seqbegin(&root->fs_info->profiles_lock); - - all_avail = root->fs_info->avail_data_alloc_bits | - root->fs_info->avail_system_alloc_bits | - root->fs_info->avail_metadata_alloc_bits; - } while (read_seqretry(&root->fs_info->profiles_lock, seq)); - num_devices = root->fs_info->fs_devices->num_devices; btrfs_dev_replace_lock(&root->fs_info->dev_replace, 0); if (btrfs_dev_replace_is_ongoing(&root->fs_info->dev_replace)) { @@ -1722,78 +1815,23 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) } btrfs_dev_replace_unlock(&root->fs_info->dev_replace, 0); - if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) { - ret = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET; - goto out; - } - - if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && num_devices <= 2) { - ret = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET; + ret = btrfs_check_raid_min_devices(root->fs_info, num_devices - 1); + if (ret) goto out; - } - if ((all_avail & BTRFS_BLOCK_GROUP_RAID5) && - root->fs_info->fs_devices->rw_devices <= 2) { - ret = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET; - goto out; - } - if ((all_avail & BTRFS_BLOCK_GROUP_RAID6) && - root->fs_info->fs_devices->rw_devices <= 3) { - ret = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET; + ret = btrfs_find_device_by_devspec(root, devid, device_path, + &device); + if
(ret) goto out; - } - - if (strcmp(device_path, "missing") == 0) { - struct list_head *devices; - struct btrfs_device *tmp; - - device = NULL; - devices = &root->fs_info->fs_devices->devices; - /* - * It is safe to read the devices since the volume_mutex - * is held. - */ - list_for_each_entry(tmp, devices, dev_list) { - if (tmp->in_fs_metadata && - !tmp->is_tgtdev_for_dev_replace && - !tmp->bdev) { - device = tmp; - break; - } - } - bdev = NULL; - bh = NULL; - disk_super = NULL; - if (!device) { - ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND; - goto out; - } - } else { - ret = btrfs_get_bdev_and_sb(device_path, - FMODE_WRITE | FMODE_EXCL, - root->fs_info->bdev_holder, 0, - &bdev, &bh); - if (ret) - goto out; - disk_super = (struct btrfs_super_block *)bh->b_data; - devid = btrfs_stack_device_id(&disk_super->dev_item); - dev_uuid = disk_super->dev_item.uuid; - device = btrfs_find_device(root->fs_info, devid, dev_uuid, - disk_super->fsid); - if (!device) { - ret = -ENOENT; - goto error_brelse; - } - } if (device->is_tgtdev_for_dev_replace) { ret = BTRFS_ERROR_DEV_TGT_REPLACE; - goto error_brelse; + goto out; } if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) { ret = BTRFS_ERROR_DEV_ONLY_WRITABLE; - goto error_brelse; + goto out; } if (device->writeable) { @@ -1801,6 +1839,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) list_del_init(&device->dev_alloc_list); device->fs_devices->rw_devices--; unlock_chunks(root); + dev_name = kstrdup(device->name->str, GFP_KERNEL); + if (!dev_name) { + ret = -ENOMEM; + goto error_undo; + } clear_super = true; } @@ -1842,12 +1885,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) if (device->missing) device->fs_devices->missing_devices--; - next_device = list_entry(root->fs_info->fs_devices->devices.next, - struct btrfs_device, dev_list); - if (device->bdev == root->fs_info->sb->s_bdev) - root->fs_info->sb->s_bdev = next_device->bdev; - if (device->bdev == root->fs_info->fs_devices->latest_bdev) - root->fs_info->fs_devices->latest_bdev = next_device->bdev; + btrfs_assign_next_active_device(root->fs_info, device, NULL); if (device->bdev) { device->fs_devices->open_devices--; @@ -1883,63 +1921,23 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) * at this point, the device is zero sized. 
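The kstrdup() added in the hunk above is what makes the tail of btrfs_rm_device() safe: by the time the superblocks are scratched, the device object (and the name string it owns) may already have been torn down, so a private copy of the name is taken while it is still valid. The same lifetime idiom in plain C (free_device() here is a stand-in, not the kernel helper):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct device {
	char *name;
};

static void free_device(struct device *dev)
{
	free(dev->name);
	free(dev);
}

int main(void)
{
	struct device *dev = malloc(sizeof(*dev));
	char *dev_name;

	if (!dev)
		return 1;
	dev->name = strdup("/dev/sdb");
	dev_name = strdup(dev->name);	/* copy before teardown */
	if (!dev_name)
		return 1;		/* -ENOMEM in the kernel version */
	free_device(dev);		/* dev->name is gone from here on */
	printf("scratching supers on %s\n", dev_name);
	free(dev_name);
	return 0;
}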
We want to * remove it from the devices list and zero out the old super */ - if (clear_super && disk_super) { - u64 bytenr; - int i; - - /* make sure this device isn't detected as part of - * the FS anymore - */ - memset(&disk_super->magic, 0, sizeof(disk_super->magic)); - set_buffer_dirty(bh); - sync_dirty_buffer(bh); - - /* clear the mirror copies of super block on the disk - * being removed, 0th copy is been taken care above and - * the below would take of the rest - */ - for (i = 1; i < BTRFS_SUPER_MIRROR_MAX; i++) { - bytenr = btrfs_sb_offset(i); - if (bytenr + BTRFS_SUPER_INFO_SIZE >= - i_size_read(bdev->bd_inode)) - break; - - brelse(bh); - bh = __bread(bdev, bytenr / 4096, - BTRFS_SUPER_INFO_SIZE); - if (!bh) - continue; - - disk_super = (struct btrfs_super_block *)bh->b_data; - - if (btrfs_super_bytenr(disk_super) != bytenr || - btrfs_super_magic(disk_super) != BTRFS_MAGIC) { - continue; - } - memset(&disk_super->magic, 0, - sizeof(disk_super->magic)); - set_buffer_dirty(bh); - sync_dirty_buffer(bh); + if (clear_super) { + struct block_device *bdev; + + bdev = blkdev_get_by_path(dev_name, FMODE_READ | FMODE_EXCL, + root->fs_info->bdev_holder); + if (!IS_ERR(bdev)) { + btrfs_scratch_superblocks(bdev, dev_name); + blkdev_put(bdev, FMODE_READ | FMODE_EXCL); } } - ret = 0; - - if (bdev) { - /* Notify udev that device has changed */ - btrfs_kobject_uevent(bdev, KOBJ_CHANGE); - - /* Update ctime/mtime for device path for libblkid */ - update_dev_time(device_path); - } - -error_brelse: - brelse(bh); - if (bdev) - blkdev_put(bdev, FMODE_READ | FMODE_EXCL); out: + kfree(dev_name); + mutex_unlock(&uuid_mutex); return ret; + error_undo: if (device->writeable) { lock_chunks(root); @@ -1948,7 +1946,7 @@ error_undo: device->fs_devices->rw_devices++; unlock_chunks(root); } - goto error_brelse; + goto out; } void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info, @@ -1972,11 +1970,8 @@ void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info, if (srcdev->missing) fs_devices->missing_devices--; - if (srcdev->writeable) { + if (srcdev->writeable) fs_devices->rw_devices--; - /* zero out the old super if it is writable */ - btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str); - } if (srcdev->bdev) fs_devices->open_devices--; @@ -1987,6 +1982,10 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info, { struct btrfs_fs_devices *fs_devices = srcdev->fs_devices; + if (srcdev->writeable) { + /* zero out the old super if it is writable */ + btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str); + } call_rcu(&srcdev->rcu, free_device); /* @@ -2016,32 +2015,33 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info, void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, struct btrfs_device *tgtdev) { - struct btrfs_device *next_device; - mutex_lock(&uuid_mutex); WARN_ON(!tgtdev); mutex_lock(&fs_info->fs_devices->device_list_mutex); btrfs_sysfs_rm_device_link(fs_info->fs_devices, tgtdev); - if (tgtdev->bdev) { - btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str); + if (tgtdev->bdev) fs_info->fs_devices->open_devices--; - } + fs_info->fs_devices->num_devices--; - next_device = list_entry(fs_info->fs_devices->devices.next, - struct btrfs_device, dev_list); - if (tgtdev->bdev == fs_info->sb->s_bdev) - fs_info->sb->s_bdev = next_device->bdev; - if (tgtdev->bdev == fs_info->fs_devices->latest_bdev) - fs_info->fs_devices->latest_bdev = next_device->bdev; - list_del_rcu(&tgtdev->dev_list); + 
btrfs_assign_next_active_device(fs_info, tgtdev, NULL); - call_rcu(&tgtdev->rcu, free_device); + list_del_rcu(&tgtdev->dev_list); mutex_unlock(&fs_info->fs_devices->device_list_mutex); mutex_unlock(&uuid_mutex); + + /* + * The update_dev_time() within btrfs_scratch_superblocks() + * may lead to a call to btrfs_show_devname() which will try + * to hold device_list_mutex. But this device + * is already out of the device list, so we don't have to hold + * the device_list_mutex lock. + */ + btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str); + call_rcu(&tgtdev->rcu, free_device); } static int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path, @@ -2102,6 +2102,31 @@ int btrfs_find_device_missing_or_by_path(struct btrfs_root *root, } /* + * Lookup a device given by device id, or the path if the id is 0. + */ +int btrfs_find_device_by_devspec(struct btrfs_root *root, u64 devid, + char *devpath, + struct btrfs_device **device) +{ + int ret; + + if (devid) { + ret = 0; + *device = btrfs_find_device(root->fs_info, devid, NULL, + NULL); + if (!*device) + ret = -ENOENT; + } else { + if (!devpath || !devpath[0]) + return -EINVAL; + + ret = btrfs_find_device_missing_or_by_path(root, devpath, + device); + } + return ret; +} + +/* * does all the dirty work required for changing file system's UUID. */ static int btrfs_prepare_sprout(struct btrfs_root *root) @@ -2165,7 +2190,7 @@ static int btrfs_prepare_sprout(struct btrfs_root *root) } /* - * strore the expected generation for seed devices in device items. + * Store the expected generation for seed devices in device items. */ static int btrfs_finish_sprout(struct btrfs_trans_handle *trans, struct btrfs_root *root) @@ -2418,7 +2443,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) ret = btrfs_relocate_sys_chunks(root); if (ret < 0) - btrfs_std_error(root->fs_info, ret, + btrfs_handle_fs_error(root->fs_info, ret, "Failed to relocate sys chunks after " "device initialization. This can be fixed " "using the \"btrfs balance\" command."); @@ -2663,7 +2688,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans, if (ret < 0) goto out; else if (ret > 0) { /* Logic error or corruption */ - btrfs_std_error(root->fs_info, -ENOENT, + btrfs_handle_fs_error(root->fs_info, -ENOENT, "Failed lookup while freeing chunk."); ret = -ENOENT; goto out; @@ -2671,7 +2696,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans, ret = btrfs_del_item(trans, root, path); if (ret < 0) - btrfs_std_error(root->fs_info, ret, + btrfs_handle_fs_error(root->fs_info, ret, "Failed to delete chunk item."); out: btrfs_free_path(path); @@ -2736,6 +2761,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 dev_extent_len = 0; u64 chunk_objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; int i, ret = 0; + struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; /* Just in case */ root = root->fs_info->chunk_root; @@ -2762,12 +2788,19 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, check_system_chunk(trans, extent_root, map->type); unlock_chunks(root->fs_info->chunk_root); + /* + * Take the device list mutex to prevent races with the final phase of + * a device replace operation that replaces the device object associated + * with map stripes (dev-replace.c:btrfs_dev_replace_finishing()).
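Both mutex comments in this hunk follow the same discipline: mutate the shared list while holding device_list_mutex, then drop the lock before doing work that is slow or that can re-enter a path taking the same mutex (update_dev_time() can end up in btrfs_show_devname()). A minimal pthreads sketch of the unlink-then-work-unlocked pattern (all names illustrative):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

/* may be reached from paths that themselves take list_lock */
static void scratch_superblocks(const char *name)
{
	printf("zeroing supers on %s\n", name);
}

static void destroy_device(const char *name)
{
	pthread_mutex_lock(&list_lock);
	/* unlink the device from the shared list while protected */
	pthread_mutex_unlock(&list_lock);

	/*
	 * The heavy, possibly re-entrant work runs only after the lock
	 * is dropped; the device is off the list, so nothing else can
	 * reach it and no lock is needed.
	 */
	scratch_superblocks(name);
}

int main(void)
{
	destroy_device("/dev/sdc");
	return 0;
}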
+ */ + mutex_lock(&fs_devices->device_list_mutex); for (i = 0; i < map->num_stripes; i++) { struct btrfs_device *device = map->stripes[i].dev; ret = btrfs_free_dev_extent(trans, device, map->stripes[i].physical, &dev_extent_len); if (ret) { + mutex_unlock(&fs_devices->device_list_mutex); btrfs_abort_transaction(trans, root, ret); goto out; } @@ -2786,11 +2819,14 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, if (map->stripes[i].dev) { ret = btrfs_update_device(trans, map->stripes[i].dev); if (ret) { + mutex_unlock(&fs_devices->device_list_mutex); btrfs_abort_transaction(trans, root, ret); goto out; } } } + mutex_unlock(&fs_devices->device_list_mutex); + ret = btrfs_free_chunk(trans, root, chunk_objectid, chunk_offset); if (ret) { btrfs_abort_transaction(trans, root, ret); @@ -2857,7 +2893,7 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, u64 chunk_offset) chunk_offset); if (IS_ERR(trans)) { ret = PTR_ERR(trans); - btrfs_std_error(root->fs_info, ret, NULL); + btrfs_handle_fs_error(root->fs_info, ret, NULL); return ret; } @@ -3362,7 +3398,7 @@ static int should_balance_chunk(struct btrfs_root *root, } else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) { /* * Same logic as the 'limit' filter; the minimum cannot be - * determined here because we do not have the global informatoin + * determined here because we do not have the global information * about the count of all chunks that satisfy the filters. */ if (bargs->limit_max == 0) @@ -3402,6 +3438,7 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) u32 count_meta = 0; u32 count_sys = 0; int chunk_reserved = 0; + u64 bytes_used = 0; /* step one make some room on all the devices */ devices = &fs_info->fs_devices->devices; @@ -3540,7 +3577,13 @@ again: goto loop; } - if ((chunk_type & BTRFS_BLOCK_GROUP_DATA) && !chunk_reserved) { + ASSERT(fs_info->data_sinfo); + spin_lock(&fs_info->data_sinfo->lock); + bytes_used = fs_info->data_sinfo->bytes_used; + spin_unlock(&fs_info->data_sinfo->lock); + + if ((chunk_type & BTRFS_BLOCK_GROUP_DATA) && + !chunk_reserved && !bytes_used) { trans = btrfs_start_transaction(chunk_root, 0); if (IS_ERR(trans)) { mutex_unlock(&fs_info->delete_unused_bgs_mutex); @@ -3632,7 +3675,7 @@ static void __cancel_balance(struct btrfs_fs_info *fs_info) unset_balance_control(fs_info); ret = del_balance_item(fs_info->tree_root); if (ret) - btrfs_std_error(fs_info, ret, NULL); + btrfs_handle_fs_error(fs_info, ret, NULL); atomic_set(&fs_info->mutually_exclusive_operation_running, 0); } @@ -3693,10 +3736,8 @@ int btrfs_balance(struct btrfs_balance_control *bctl, num_devices--; } btrfs_dev_replace_unlock(&fs_info->dev_replace, 0); - allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE; - if (num_devices == 1) - allowed |= BTRFS_BLOCK_GROUP_DUP; - else if (num_devices > 1) + allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE | BTRFS_BLOCK_GROUP_DUP; + if (num_devices > 1) allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); if (num_devices > 2) allowed |= BTRFS_BLOCK_GROUP_RAID5; @@ -4200,6 +4241,7 @@ int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info) if (IS_ERR(uuid_root)) { ret = PTR_ERR(uuid_root); btrfs_abort_transaction(trans, tree_root, ret); + btrfs_end_transaction(trans, tree_root); return ret; } @@ -4652,12 +4694,12 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, if (type & BTRFS_BLOCK_GROUP_RAID5) { raid_stripe_len = find_raid56_stripe_len(ndevs - 1, - btrfs_super_stripesize(info->super_copy)); + extent_root->stripesize); data_stripes = num_stripes - 1; } if (type & 
BTRFS_BLOCK_GROUP_RAID6) { raid_stripe_len = find_raid56_stripe_len(ndevs - 2, - btrfs_super_stripesize(info->super_copy)); + extent_root->stripesize); data_stripes = num_stripes - 2; } @@ -5218,7 +5260,7 @@ void btrfs_put_bbio(struct btrfs_bio *bbio) kfree(bbio); } -static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, +static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int op, u64 logical, u64 *length, struct btrfs_bio **bbio_ret, int mirror_num, int need_raid_map) @@ -5278,7 +5320,15 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, stripe_nr = div64_u64(stripe_nr, stripe_len); stripe_offset = stripe_nr * stripe_len; - BUG_ON(offset < stripe_offset); + if (offset < stripe_offset) { + btrfs_crit(fs_info, "stripe math has gone wrong, " + "stripe_offset=%llu, offset=%llu, start=%llu, " + "logical=%llu, stripe_len=%llu", + stripe_offset, offset, em->start, logical, + stripe_len); + free_extent_map(em); + return -EINVAL; + } /* stripe_offset is the offset of this block in its stripe*/ stripe_offset = offset - stripe_offset; @@ -5296,7 +5346,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, raid56_full_stripe_start *= full_stripe_len; } - if (rw & REQ_DISCARD) { + if (op == REQ_OP_DISCARD) { /* we don't discard raid56 yet */ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { ret = -EOPNOTSUPP; @@ -5309,7 +5359,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, For other RAID types and for RAID[56] reads, just allow a single stripe (on a single disk). */ if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && - (rw & REQ_WRITE)) { + (op == REQ_OP_WRITE)) { max_len = stripe_len * nr_data_stripes(map) - (offset - raid56_full_stripe_start); } else { @@ -5334,8 +5384,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, btrfs_dev_replace_set_lock_blocking(dev_replace); if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 && - !(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) && - dev_replace->tgtdev != NULL) { + op != REQ_OP_WRITE && op != REQ_OP_DISCARD && + op != REQ_GET_READ_MIRRORS && dev_replace->tgtdev != NULL) { /* * in dev-replace case, for repair case (that's the only * case where the mirror is selected explicitly when @@ -5422,15 +5472,17 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, (offset + *length); if (map->type & BTRFS_BLOCK_GROUP_RAID0) { - if (rw & REQ_DISCARD) + if (op == REQ_OP_DISCARD) num_stripes = min_t(u64, map->num_stripes, stripe_nr_end - stripe_nr_orig); stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &stripe_index); - if (!(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS))) + if (op != REQ_OP_WRITE && op != REQ_OP_DISCARD && + op != REQ_GET_READ_MIRRORS) mirror_num = 1; } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { - if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) + if (op == REQ_OP_WRITE || op == REQ_OP_DISCARD || + op == REQ_GET_READ_MIRRORS) num_stripes = map->num_stripes; else if (mirror_num) stripe_index = mirror_num - 1; @@ -5443,7 +5495,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, } } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { - if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) { + if (op == REQ_OP_WRITE || op == REQ_OP_DISCARD || + op == REQ_GET_READ_MIRRORS) { num_stripes = map->num_stripes; } else if (mirror_num) { stripe_index = mirror_num - 1; @@ -5457,9 +5510,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, stripe_nr = 
div_u64_rem(stripe_nr, factor, &stripe_index); stripe_index *= map->sub_stripes; - if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) + if (op == REQ_OP_WRITE || op == REQ_GET_READ_MIRRORS) num_stripes = map->sub_stripes; - else if (rw & REQ_DISCARD) + else if (op == REQ_OP_DISCARD) num_stripes = min_t(u64, map->sub_stripes * (stripe_nr_end - stripe_nr_orig), map->num_stripes); @@ -5477,7 +5530,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { if (need_raid_map && - ((rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) || + (op == REQ_OP_WRITE || op == REQ_GET_READ_MIRRORS || mirror_num > 1)) { /* push stripe_nr back to the start of the full stripe */ stripe_nr = div_u64(raid56_full_stripe_start, @@ -5505,8 +5558,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, /* We distribute the parity blocks across stripes */ div_u64_rem(stripe_nr + stripe_index, map->num_stripes, &stripe_index); - if (!(rw & (REQ_WRITE | REQ_DISCARD | - REQ_GET_READ_MIRRORS)) && mirror_num <= 1) + if ((op != REQ_OP_WRITE && op != REQ_OP_DISCARD && + op != REQ_GET_READ_MIRRORS) && mirror_num <= 1) mirror_num = 1; } } else { @@ -5519,13 +5572,19 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, &stripe_index); mirror_num = stripe_index + 1; } - BUG_ON(stripe_index >= map->num_stripes); + if (stripe_index >= map->num_stripes) { + btrfs_crit(fs_info, "stripe index math went horribly wrong, " + "got stripe_index=%u, num_stripes=%u", + stripe_index, map->num_stripes); + ret = -EINVAL; + goto out; + } num_alloc_stripes = num_stripes; if (dev_replace_is_ongoing) { - if (rw & (REQ_WRITE | REQ_DISCARD)) + if (op == REQ_OP_WRITE || op == REQ_OP_DISCARD) num_alloc_stripes <<= 1; - if (rw & REQ_GET_READ_MIRRORS) + if (op == REQ_GET_READ_MIRRORS) num_alloc_stripes++; tgtdev_indexes = num_stripes; } @@ -5540,7 +5599,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, /* build raid_map */ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && - need_raid_map && ((rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) || + need_raid_map && + ((op == REQ_OP_WRITE || op == REQ_GET_READ_MIRRORS) || mirror_num > 1)) { u64 tmp; unsigned rot; @@ -5565,7 +5625,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, RAID6_Q_STRIPE; } - if (rw & REQ_DISCARD) { + if (op == REQ_OP_DISCARD) { u32 factor = 0; u32 sub_stripes = 0; u64 stripes_per_dev = 0; @@ -5645,14 +5705,15 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, } } - if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) + if (op == REQ_OP_WRITE || op == REQ_GET_READ_MIRRORS) max_errors = btrfs_chunk_max_errors(map); if (bbio->raid_map) sort_parity_stripes(bbio, num_stripes); tgtdev_indexes = 0; - if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD)) && + if (dev_replace_is_ongoing && + (op == REQ_OP_WRITE || op == REQ_OP_DISCARD) && dev_replace->tgtdev != NULL) { int index_where_to_add; u64 srcdev_devid = dev_replace->srcdev->devid; @@ -5687,7 +5748,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, } } num_stripes = index_where_to_add; - } else if (dev_replace_is_ongoing && (rw & REQ_GET_READ_MIRRORS) && + } else if (dev_replace_is_ongoing && (op == REQ_GET_READ_MIRRORS) && dev_replace->tgtdev != NULL) { u64 srcdev_devid = dev_replace->srcdev->devid; int index_srcdev = 0; @@ -5718,20 +5779,17 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, } } if (found) { - if (physical_of_found + 
map->stripe_len <= - dev_replace->cursor_left) { - struct btrfs_bio_stripe *tgtdev_stripe = - bbio->stripes + num_stripes; + struct btrfs_bio_stripe *tgtdev_stripe = + bbio->stripes + num_stripes; - tgtdev_stripe->physical = physical_of_found; - tgtdev_stripe->length = - bbio->stripes[index_srcdev].length; - tgtdev_stripe->dev = dev_replace->tgtdev; - bbio->tgtdev_map[index_srcdev] = num_stripes; + tgtdev_stripe->physical = physical_of_found; + tgtdev_stripe->length = + bbio->stripes[index_srcdev].length; + tgtdev_stripe->dev = dev_replace->tgtdev; + bbio->tgtdev_map[index_srcdev] = num_stripes; - tgtdev_indexes++; - num_stripes++; - } + tgtdev_indexes++; + num_stripes++; } } @@ -5762,21 +5820,21 @@ out: return ret; } -int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, +int btrfs_map_block(struct btrfs_fs_info *fs_info, int op, u64 logical, u64 *length, struct btrfs_bio **bbio_ret, int mirror_num) { - return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret, + return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, mirror_num, 0); } /* For Scrub/replace */ -int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int rw, +int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int op, u64 logical, u64 *length, struct btrfs_bio **bbio_ret, int mirror_num, int need_raid_map) { - return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret, + return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, mirror_num, need_raid_map); } @@ -5890,7 +5948,7 @@ static void btrfs_end_bio(struct bio *bio) BUG_ON(stripe_index >= bbio->num_stripes); dev = bbio->stripes[stripe_index].dev; if (dev->bdev) { - if (bio->bi_rw & WRITE) + if (bio_op(bio) == REQ_OP_WRITE) btrfs_dev_stat_inc(dev, BTRFS_DEV_STAT_WRITE_ERRS); else @@ -5944,7 +6002,7 @@ static void btrfs_end_bio(struct bio *bio) */ static noinline void btrfs_schedule_bio(struct btrfs_root *root, struct btrfs_device *device, - int rw, struct bio *bio) + struct bio *bio) { int should_queue = 1; struct btrfs_pending_bios *pending_bios; @@ -5955,9 +6013,9 @@ static noinline void btrfs_schedule_bio(struct btrfs_root *root, } /* don't bother with additional async steps for reads, right now */ - if (!(rw & REQ_WRITE)) { + if (bio_op(bio) == REQ_OP_READ) { bio_get(bio); - btrfsic_submit_bio(rw, bio); + btrfsic_submit_bio(bio); bio_put(bio); return; } @@ -5971,7 +6029,6 @@ static noinline void btrfs_schedule_bio(struct btrfs_root *root, atomic_inc(&root->fs_info->nr_async_bios); WARN_ON(bio->bi_next); bio->bi_next = NULL; - bio->bi_rw |= rw; spin_lock(&device->io_lock); if (bio->bi_rw & REQ_SYNC) @@ -5997,7 +6054,7 @@ static noinline void btrfs_schedule_bio(struct btrfs_root *root, static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio, struct bio *bio, u64 physical, int dev_nr, - int rw, int async) + int async) { struct btrfs_device *dev = bbio->stripes[dev_nr].dev; @@ -6011,8 +6068,8 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio, rcu_read_lock(); name = rcu_dereference(dev->name); - pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu " - "(%s id %llu), size=%u\n", rw, + pr_debug("btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu " + "(%s id %llu), size=%u\n", bio_op(bio), bio->bi_rw, (u64)bio->bi_iter.bi_sector, (u_long)dev->bdev->bd_dev, name->str, dev->devid, bio->bi_iter.bi_size); rcu_read_unlock(); @@ -6023,16 +6080,16 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio, btrfs_bio_counter_inc_noblocked(root->fs_info); if (async) - 
btrfs_schedule_bio(root, dev, rw, bio); + btrfs_schedule_bio(root, dev, bio); else - btrfsic_submit_bio(rw, bio); + btrfsic_submit_bio(bio); } static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) { atomic_inc(&bbio->error); if (atomic_dec_and_test(&bbio->stripes_pending)) { - /* Shoud be the original bio. */ + /* Should be the original bio. */ WARN_ON(bio != bbio->orig_bio); btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; @@ -6042,7 +6099,7 @@ static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) } } -int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, +int btrfs_map_bio(struct btrfs_root *root, struct bio *bio, int mirror_num, int async_submit) { struct btrfs_device *dev; @@ -6059,8 +6116,8 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, map_length = length; btrfs_bio_counter_inc_blocked(root->fs_info); - ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio, - mirror_num, 1); + ret = __btrfs_map_block(root->fs_info, bio_op(bio), logical, + &map_length, &bbio, mirror_num, 1); if (ret) { btrfs_bio_counter_dec(root->fs_info); return ret; @@ -6074,10 +6131,10 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, atomic_set(&bbio->stripes_pending, bbio->num_stripes); if ((bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) && - ((rw & WRITE) || (mirror_num > 1))) { + ((bio_op(bio) == REQ_OP_WRITE) || (mirror_num > 1))) { /* In this case, map_length has been set to the length of a single stripe; not the whole write */ - if (rw & WRITE) { + if (bio_op(bio) == REQ_OP_WRITE) { ret = raid56_parity_write(root, bio, bbio, map_length); } else { ret = raid56_parity_recover(root, bio, bbio, map_length, @@ -6096,7 +6153,8 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, for (dev_nr = 0; dev_nr < total_devs; dev_nr++) { dev = bbio->stripes[dev_nr].dev; - if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) { + if (!dev || !dev->bdev || + (bio_op(bio) == REQ_OP_WRITE && !dev->writeable)) { bbio_error(bbio, first_bio, logical); continue; } @@ -6108,7 +6166,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, bio = first_bio; submit_stripe_bio(root, bbio, bio, - bbio->stripes[dev_nr].physical, dev_nr, rw, + bbio->stripes[dev_nr].physical, dev_nr, async_submit); } btrfs_bio_counter_dec(root->fs_info); @@ -6206,27 +6264,23 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, return dev; } -static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, - struct extent_buffer *leaf, - struct btrfs_chunk *chunk) +/* Return -EIO if any error, otherwise return 0. 
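A pattern worth noting across these hunks: the catch-all int rw bitmask is replaced by an explicit operation value taken from bio_op(bio), so tests like (rw & REQ_WRITE) become op == REQ_OP_WRITE while modifier flags stay separate. A small standalone sketch of that interface change (the enum values and struct bio below are illustrative stand-ins, not the block layer's definitions):

#include <stdio.h>

/* illustrative stand-ins for the block layer's REQ_OP_* values */
enum req_op { REQ_OP_READ, REQ_OP_WRITE, REQ_OP_DISCARD };

struct bio {
	enum req_op op;		/* what to do */
	unsigned int flags;	/* how to do it (sync, ...), kept separate */
};

static enum req_op bio_op(const struct bio *bio)
{
	return bio->op;
}

static int needs_all_mirrors(enum req_op op)
{
	/* explicit comparisons replace the old (rw & REQ_WRITE) tests */
	return op == REQ_OP_WRITE || op == REQ_OP_DISCARD;
}

int main(void)
{
	struct bio bio = { .op = REQ_OP_WRITE, .flags = 0 };

	printf("all mirrors: %d\n", needs_all_mirrors(bio_op(&bio)));
	return 0;
}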
*/ +static int btrfs_check_chunk_valid(struct btrfs_root *root, + struct extent_buffer *leaf, + struct btrfs_chunk *chunk, u64 logical) { - struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; - struct map_lookup *map; - struct extent_map *em; - u64 logical; u64 length; u64 stripe_len; - u64 devid; - u8 uuid[BTRFS_UUID_SIZE]; - int num_stripes; - int ret; - int i; + u16 num_stripes; + u16 sub_stripes; + u64 type; - logical = key->offset; length = btrfs_chunk_length(leaf, chunk); stripe_len = btrfs_chunk_stripe_len(leaf, chunk); num_stripes = btrfs_chunk_num_stripes(leaf, chunk); - /* Validation check */ + sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); + type = btrfs_chunk_type(leaf, chunk); + if (!num_stripes) { btrfs_err(root->fs_info, "invalid chunk num_stripes: %u", num_stripes); @@ -6237,24 +6291,70 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, "invalid chunk logical %llu", logical); return -EIO; } + if (btrfs_chunk_sector_size(leaf, chunk) != root->sectorsize) { + btrfs_err(root->fs_info, "invalid chunk sectorsize %u", + btrfs_chunk_sector_size(leaf, chunk)); + return -EIO; + } if (!length || !IS_ALIGNED(length, root->sectorsize)) { btrfs_err(root->fs_info, "invalid chunk length %llu", length); return -EIO; } - if (!is_power_of_2(stripe_len)) { + if (!is_power_of_2(stripe_len) || stripe_len != BTRFS_STRIPE_LEN) { btrfs_err(root->fs_info, "invalid chunk stripe length: %llu", stripe_len); return -EIO; } if (~(BTRFS_BLOCK_GROUP_TYPE_MASK | BTRFS_BLOCK_GROUP_PROFILE_MASK) & - btrfs_chunk_type(leaf, chunk)) { + type) { btrfs_err(root->fs_info, "unrecognized chunk type: %llu", ~(BTRFS_BLOCK_GROUP_TYPE_MASK | BTRFS_BLOCK_GROUP_PROFILE_MASK) & btrfs_chunk_type(leaf, chunk)); return -EIO; } + if ((type & BTRFS_BLOCK_GROUP_RAID10 && sub_stripes != 2) || + (type & BTRFS_BLOCK_GROUP_RAID1 && num_stripes < 1) || + (type & BTRFS_BLOCK_GROUP_RAID5 && num_stripes < 2) || + (type & BTRFS_BLOCK_GROUP_RAID6 && num_stripes < 3) || + (type & BTRFS_BLOCK_GROUP_DUP && num_stripes > 2) || + ((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 && + num_stripes != 1)) { + btrfs_err(root->fs_info, + "invalid num_stripes:sub_stripes %u:%u for profile %llu", + num_stripes, sub_stripes, + type & BTRFS_BLOCK_GROUP_PROFILE_MASK); + return -EIO; + } + + return 0; +} + +static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, + struct extent_buffer *leaf, + struct btrfs_chunk *chunk) +{ + struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; + struct map_lookup *map; + struct extent_map *em; + u64 logical; + u64 length; + u64 stripe_len; + u64 devid; + u8 uuid[BTRFS_UUID_SIZE]; + int num_stripes; + int ret; + int i; + + logical = key->offset; + length = btrfs_chunk_length(leaf, chunk); + stripe_len = btrfs_chunk_stripe_len(leaf, chunk); + num_stripes = btrfs_chunk_num_stripes(leaf, chunk); + + ret = btrfs_check_chunk_valid(root, leaf, chunk, logical); + if (ret) + return ret; read_lock(&map_tree->map_tree.lock); em = lookup_extent_mapping(&map_tree->map_tree, logical, 1); @@ -6502,6 +6602,7 @@ int btrfs_read_sys_array(struct btrfs_root *root) u32 array_size; u32 len = 0; u32 cur_offset; + u64 type; struct btrfs_key key; ASSERT(BTRFS_SUPER_INFO_SIZE <= root->nodesize); @@ -6511,12 +6612,12 @@ int btrfs_read_sys_array(struct btrfs_root *root) * overallocate but we can keep it as-is, only the first page is used. 
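btrfs_check_chunk_valid() above folds the per-profile stripe-count rules into one early check so a corrupted chunk item is rejected before it is used. The shape of those rules, as a compact standalone version over a simplified struct (the profile bit values are illustrative, not the on-disk flags):

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

#define RAID10	(1ULL << 0)	/* illustrative profile bits */
#define RAID1	(1ULL << 1)
#define RAID5	(1ULL << 2)
#define RAID6	(1ULL << 3)

/* simplified view of the on-disk chunk item */
struct chunk {
	uint64_t type;
	uint16_t num_stripes;
	uint16_t sub_stripes;
};

static int check_chunk_valid(const struct chunk *c)
{
	if (!c->num_stripes)
		return -EIO;
	if ((c->type & RAID10) && c->sub_stripes != 2)
		return -EIO;
	if ((c->type & RAID1) && c->num_stripes < 1)
		return -EIO;
	if ((c->type & RAID5) && c->num_stripes < 2)
		return -EIO;
	if ((c->type & RAID6) && c->num_stripes < 3)
		return -EIO;
	return 0;
}

int main(void)
{
	struct chunk bad = { RAID10, 4, 1 };	/* sub_stripes must be 2 */

	printf("check: %d\n", check_chunk_valid(&bad));
	return 0;
}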
*/ sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET); - if (!sb) - return -ENOMEM; + if (IS_ERR(sb)) + return PTR_ERR(sb); set_extent_buffer_uptodate(sb); btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0); /* - * The sb extent buffer is artifical and just used to read the system array. + * The sb extent buffer is artificial and just used to read the system array. * set_extent_buffer_uptodate() call does not properly mark all it's * pages up-to-date when the page is larger: extent does not cover the * whole page and consequently check_page_uptodate does not find all @@ -6568,6 +6669,15 @@ int btrfs_read_sys_array(struct btrfs_root *root) break; } + type = btrfs_chunk_type(sb, chunk); + if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) { + btrfs_err(root->fs_info, + "invalid chunk type %llu in sys_array at offset %u", + type, cur_offset); + ret = -EIO; + break; + } + len = btrfs_chunk_item_size(num_stripes); if (cur_offset + len > array_size) goto out_short_read; @@ -6586,13 +6696,15 @@ int btrfs_read_sys_array(struct btrfs_root *root) sb_array_offset += len; cur_offset += len; } - free_extent_buffer(sb); + clear_extent_buffer_uptodate(sb); + free_extent_buffer_stale(sb); return ret; out_short_read: printk(KERN_ERR "BTRFS: sys_array too short to read %u bytes at offset %u\n", len, cur_offset); - free_extent_buffer(sb); + clear_extent_buffer_uptodate(sb); + free_extent_buffer_stale(sb); return -EIO; } @@ -6604,6 +6716,7 @@ int btrfs_read_chunk_tree(struct btrfs_root *root) struct btrfs_key found_key; int ret; int slot; + u64 total_dev = 0; root = root->fs_info->chunk_root; @@ -6645,6 +6758,7 @@ int btrfs_read_chunk_tree(struct btrfs_root *root) ret = read_one_dev(root, leaf, dev_item); if (ret) goto error; + total_dev++; } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) { struct btrfs_chunk *chunk; chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); @@ -6654,6 +6768,28 @@ int btrfs_read_chunk_tree(struct btrfs_root *root) } path->slots[0]++; } + + /* + * After loading chunk tree, we've got all device information, + * do another round of validation checks. 
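The validation described by the comment above (its checks follow in the next hunk) is a count-then-cross-check: while btrfs_read_chunk_tree() walks the tree it tallies device items into total_dev, and only afterwards compares the tally against what the super block claims, failing the mount on a mismatch. The idiom in miniature (all names and the item encoding are illustrative):

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

struct super { uint64_t num_devices; };

enum item { DEV_ITEM, CHUNK_ITEM };

/* walk the items, count the devices, then verify against the super */
static int read_chunk_tree(const struct super *sb,
			   const enum item *items, int nr)
{
	uint64_t total_dev = 0;

	for (int i = 0; i < nr; i++)
		if (items[i] == DEV_ITEM)
			total_dev++;

	if (total_dev != sb->num_devices) {
		fprintf(stderr,
			"super_num_devices %llu mismatch, found %llu\n",
			(unsigned long long)sb->num_devices,
			(unsigned long long)total_dev);
		return -EINVAL;
	}
	return 0;
}

int main(void)
{
	struct super sb = { .num_devices = 2 };
	enum item items[] = { DEV_ITEM, CHUNK_ITEM, DEV_ITEM };

	return read_chunk_tree(&sb, items, 3) ? 1 : 0;
}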
+ */ + if (total_dev != root->fs_info->fs_devices->total_devices) { + btrfs_err(root->fs_info, + "super_num_devices %llu mismatch with num_devices %llu found here", + btrfs_super_num_devices(root->fs_info->super_copy), + total_dev); + ret = -EINVAL; + goto error; + } + if (btrfs_super_total_bytes(root->fs_info->super_copy) < + root->fs_info->fs_devices->total_rw_bytes) { + btrfs_err(root->fs_info, + "super_total_bytes %llu mismatch with fs_devices total_rw_bytes %llu", + btrfs_super_total_bytes(root->fs_info->super_copy), + root->fs_info->fs_devices->total_rw_bytes); + ret = -EINVAL; + goto error; + } ret = 0; error: unlock_chunks(root); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 1939ebde63df..6613e6335ca2 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -340,14 +340,14 @@ struct btrfs_raid_attr { }; extern const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES]; - +extern const int btrfs_raid_mindev_error[BTRFS_NR_RAID_TYPES]; extern const u64 btrfs_raid_group[BTRFS_NR_RAID_TYPES]; struct map_lookup { u64 type; int io_align; int io_width; - int stripe_len; + u64 stripe_len; int sector_size; int num_stripes; int sub_stripes; @@ -357,52 +357,6 @@ struct map_lookup { #define map_lookup_size(n) (sizeof(struct map_lookup) + \ (sizeof(struct btrfs_bio_stripe) * (n))) -/* - * Restriper's general type filter - */ -#define BTRFS_BALANCE_DATA (1ULL << 0) -#define BTRFS_BALANCE_SYSTEM (1ULL << 1) -#define BTRFS_BALANCE_METADATA (1ULL << 2) - -#define BTRFS_BALANCE_TYPE_MASK (BTRFS_BALANCE_DATA | \ - BTRFS_BALANCE_SYSTEM | \ - BTRFS_BALANCE_METADATA) - -#define BTRFS_BALANCE_FORCE (1ULL << 3) -#define BTRFS_BALANCE_RESUME (1ULL << 4) - -/* - * Balance filters - */ -#define BTRFS_BALANCE_ARGS_PROFILES (1ULL << 0) -#define BTRFS_BALANCE_ARGS_USAGE (1ULL << 1) -#define BTRFS_BALANCE_ARGS_DEVID (1ULL << 2) -#define BTRFS_BALANCE_ARGS_DRANGE (1ULL << 3) -#define BTRFS_BALANCE_ARGS_VRANGE (1ULL << 4) -#define BTRFS_BALANCE_ARGS_LIMIT (1ULL << 5) -#define BTRFS_BALANCE_ARGS_LIMIT_RANGE (1ULL << 6) -#define BTRFS_BALANCE_ARGS_STRIPES_RANGE (1ULL << 7) -#define BTRFS_BALANCE_ARGS_USAGE_RANGE (1ULL << 10) - -#define BTRFS_BALANCE_ARGS_MASK \ - (BTRFS_BALANCE_ARGS_PROFILES | \ - BTRFS_BALANCE_ARGS_USAGE | \ - BTRFS_BALANCE_ARGS_DEVID | \ - BTRFS_BALANCE_ARGS_DRANGE | \ - BTRFS_BALANCE_ARGS_VRANGE | \ - BTRFS_BALANCE_ARGS_LIMIT | \ - BTRFS_BALANCE_ARGS_LIMIT_RANGE | \ - BTRFS_BALANCE_ARGS_STRIPES_RANGE | \ - BTRFS_BALANCE_ARGS_USAGE_RANGE) - -/* - * Profile changing flags. When SOFT is set we won't relocate chunk if - * it already has the target profile (even though it may be - * half-filled). 
- */ -#define BTRFS_BALANCE_ARGS_CONVERT (1ULL << 8) -#define BTRFS_BALANCE_ARGS_SOFT (1ULL << 9) - struct btrfs_balance_args; struct btrfs_balance_progress; struct btrfs_balance_control { @@ -421,10 +375,10 @@ int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, u64 end, u64 *length); void btrfs_get_bbio(struct btrfs_bio *bbio); void btrfs_put_bbio(struct btrfs_bio *bbio); -int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, +int btrfs_map_block(struct btrfs_fs_info *fs_info, int op, u64 logical, u64 *length, struct btrfs_bio **bbio_ret, int mirror_num); -int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int rw, +int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int op, u64 logical, u64 *length, struct btrfs_bio **bbio_ret, int mirror_num, int need_raid_map); @@ -437,7 +391,7 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root, u64 type); void btrfs_mapping_init(struct btrfs_mapping_tree *tree); void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree); -int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, +int btrfs_map_bio(struct btrfs_root *root, struct bio *bio, int mirror_num, int async_submit); int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, fmode_t flags, void *holder); @@ -445,13 +399,18 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, struct btrfs_fs_devices **fs_devices_ret); int btrfs_close_devices(struct btrfs_fs_devices *fs_devices); void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices, int step); +void btrfs_assign_next_active_device(struct btrfs_fs_info *fs_info, + struct btrfs_device *device, struct btrfs_device *this_dev); int btrfs_find_device_missing_or_by_path(struct btrfs_root *root, char *device_path, struct btrfs_device **device); +int btrfs_find_device_by_devspec(struct btrfs_root *root, u64 devid, + char *devpath, + struct btrfs_device **device); struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, const u64 *devid, const u8 *uuid); -int btrfs_rm_device(struct btrfs_root *root, char *device_path); +int btrfs_rm_device(struct btrfs_root *root, char *device_path, u64 devid); void btrfs_cleanup_fs_uuids(void); int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len); int btrfs_grow_device(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 3bfb252206c7..d1a177a3dbe8 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -380,23 +380,21 @@ static int btrfs_xattr_handler_get(const struct xattr_handler *handler, } static int btrfs_xattr_handler_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *buffer, size_t size, - int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *buffer, + size_t size, int flags) { - struct inode *inode = d_inode(dentry); - name = xattr_full_name(handler, name); return __btrfs_setxattr(NULL, inode, name, buffer, size, flags); } static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler, - struct dentry *dentry, + struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) { name = xattr_full_name(handler, name); - return btrfs_set_prop(d_inode(dentry), name, value, size, flags); + return btrfs_set_prop(inode, name, value, size, flags); } static const struct xattr_handler btrfs_security_xattr_handler = { diff --git a/fs/buffer.c b/fs/buffer.c index af0d9a82a8ed..9c8eb9b6db6a 100644 --- 
a/fs/buffer.c +++ b/fs/buffer.c @@ -21,6 +21,7 @@ #include <linux/kernel.h> #include <linux/syscalls.h> #include <linux/fs.h> +#include <linux/iomap.h> #include <linux/mm.h> #include <linux/percpu.h> #include <linux/slab.h> @@ -45,7 +46,7 @@ #include <trace/events/block.h> static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); -static int submit_bh_wbc(int rw, struct buffer_head *bh, +static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, unsigned long bio_flags, struct writeback_control *wbc); @@ -153,7 +154,7 @@ static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate) if (uptodate) { set_buffer_uptodate(bh); } else { - /* This happens, due to failed READA attempts. */ + /* This happens, due to failed read-ahead attempts. */ clear_buffer_uptodate(bh); } unlock_buffer(bh); @@ -255,17 +256,17 @@ out: */ static void free_more_memory(void) { - struct zone *zone; + struct zoneref *z; int nid; wakeup_flusher_threads(1024, WB_REASON_FREE_MORE_MEM); yield(); for_each_online_node(nid) { - (void)first_zones_zonelist(node_zonelist(nid, GFP_NOFS), - gfp_zone(GFP_NOFS), NULL, - &zone); - if (zone) + + z = first_zones_zonelist(node_zonelist(nid, GFP_NOFS), + gfp_zone(GFP_NOFS), NULL); + if (z->zone) try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0, GFP_NOFS, NULL); } @@ -588,7 +589,7 @@ void write_boundary_block(struct block_device *bdev, struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize); if (bh) { if (buffer_dirty(bh)) - ll_rw_block(WRITE, 1, &bh); + ll_rw_block(REQ_OP_WRITE, 0, 1, &bh); put_bh(bh); } } @@ -1225,7 +1226,7 @@ static struct buffer_head *__bread_slow(struct buffer_head *bh) } else { get_bh(bh); bh->b_end_io = end_buffer_read_sync; - submit_bh(READ, bh); + submit_bh(REQ_OP_READ, 0, bh); wait_on_buffer(bh); if (buffer_uptodate(bh)) return bh; @@ -1395,7 +1396,7 @@ void __breadahead(struct block_device *bdev, sector_t block, unsigned size) { struct buffer_head *bh = __getblk(bdev, block, size); if (likely(bh)) { - ll_rw_block(READA, 1, &bh); + ll_rw_block(REQ_OP_READ, REQ_RAHEAD, 1, &bh); brelse(bh); } } @@ -1687,7 +1688,7 @@ static struct buffer_head *create_page_buffers(struct page *page, struct inode * * WB_SYNC_ALL, the writes are posted using WRITE_SYNC; this * causes the writes to be flagged as synchronous writes. */ -static int __block_write_full_page(struct inode *inode, struct page *page, +int __block_write_full_page(struct inode *inode, struct page *page, get_block_t *get_block, struct writeback_control *wbc, bh_end_io_t *handler) { @@ -1697,7 +1698,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page, struct buffer_head *bh, *head; unsigned int blocksize, bbits; int nr_underway = 0; - int write_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE); + int write_flags = (wbc->sync_mode == WB_SYNC_ALL ? 
WRITE_SYNC : 0); head = create_page_buffers(page, inode, (1 << BH_Dirty)|(1 << BH_Uptodate)); @@ -1786,7 +1787,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page, do { struct buffer_head *next = bh->b_this_page; if (buffer_async_write(bh)) { - submit_bh_wbc(write_op, bh, 0, wbc); + submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, 0, wbc); nr_underway++; } bh = next; @@ -1840,7 +1841,7 @@ recover: struct buffer_head *next = bh->b_this_page; if (buffer_async_write(bh)) { clear_buffer_dirty(bh); - submit_bh_wbc(write_op, bh, 0, wbc); + submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, 0, wbc); nr_underway++; } bh = next; @@ -1848,6 +1849,7 @@ recover: unlock_page(page); goto done; } +EXPORT_SYMBOL(__block_write_full_page); /* * If a page has any new buffers, zero them out here, and mark them uptodate @@ -1891,8 +1893,62 @@ void page_zero_new_buffers(struct page *page, unsigned from, unsigned to) } EXPORT_SYMBOL(page_zero_new_buffers); -int __block_write_begin(struct page *page, loff_t pos, unsigned len, - get_block_t *get_block) +static void +iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh, + struct iomap *iomap) +{ + loff_t offset = block << inode->i_blkbits; + + bh->b_bdev = iomap->bdev; + + /* + * Block points to offset in file we need to map, iomap contains + * the offset at which the map starts. If the map ends before the + * current block, then do not map the buffer and let the caller + * handle it. + */ + BUG_ON(offset >= iomap->offset + iomap->length); + + switch (iomap->type) { + case IOMAP_HOLE: + /* + * If the buffer is not up to date or beyond the current EOF, + * we need to mark it as new to ensure sub-block zeroing is + * executed if necessary. + */ + if (!buffer_uptodate(bh) || + (offset >= i_size_read(inode))) + set_buffer_new(bh); + break; + case IOMAP_DELALLOC: + if (!buffer_uptodate(bh) || + (offset >= i_size_read(inode))) + set_buffer_new(bh); + set_buffer_uptodate(bh); + set_buffer_mapped(bh); + set_buffer_delay(bh); + break; + case IOMAP_UNWRITTEN: + /* + * For unwritten regions, we always need to ensure that + * sub-block writes cause the regions in the block we are not + * writing to are zeroed. Set the buffer as new to ensure this. 
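+ * A worked example of the block-number arithmetic in the
+ * IOMAP_MAPPED arm below (editorial illustration, not part of the
+ * patch; assumes 4K blocks, so i_blkbits == 12, with iomap->blkno
+ * expressed in 512-byte sectors): with iomap->offset == 0 and
+ * iomap->blkno == 80, the buffer for file offset 8192 gets
+ * b_blocknr = (80 >> (12 - 9)) + (8192 >> 12) = 10 + 2 = 12.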
+ */ + set_buffer_new(bh); + set_buffer_unwritten(bh); + /* FALLTHRU */ + case IOMAP_MAPPED: + if (offset >= i_size_read(inode)) + set_buffer_new(bh); + bh->b_blocknr = (iomap->blkno >> (inode->i_blkbits - 9)) + + ((offset - iomap->offset) >> inode->i_blkbits); + set_buffer_mapped(bh); + break; + } +} + +int __block_write_begin_int(struct page *page, loff_t pos, unsigned len, + get_block_t *get_block, struct iomap *iomap) { unsigned from = pos & (PAGE_SIZE - 1); unsigned to = from + len; @@ -1928,9 +1984,14 @@ int __block_write_begin(struct page *page, loff_t pos, unsigned len, clear_buffer_new(bh); if (!buffer_mapped(bh)) { WARN_ON(bh->b_size != blocksize); - err = get_block(inode, block, bh, 1); - if (err) - break; + if (get_block) { + err = get_block(inode, block, bh, 1); + if (err) + break; + } else { + iomap_to_bh(inode, block, bh, iomap); + } + if (buffer_new(bh)) { unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); @@ -1955,7 +2016,7 @@ int __block_write_begin(struct page *page, loff_t pos, unsigned len, if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh) && (block_start < from || block_end > to)) { - ll_rw_block(READ, 1, &bh); + ll_rw_block(REQ_OP_READ, 0, 1, &bh); *wait_bh++=bh; } } @@ -1971,6 +2032,12 @@ int __block_write_begin(struct page *page, loff_t pos, unsigned len, page_zero_new_buffers(page, from, to); return err; } + +int __block_write_begin(struct page *page, loff_t pos, unsigned len, + get_block_t *get_block) +{ + return __block_write_begin_int(page, pos, len, get_block, NULL); +} EXPORT_SYMBOL(__block_write_begin); static int __block_commit_write(struct inode *inode, struct page *page, @@ -2248,7 +2315,7 @@ int block_read_full_page(struct page *page, get_block_t *get_block) if (buffer_uptodate(bh)) end_buffer_async_read(bh, 1); else - submit_bh(READ, bh); + submit_bh(REQ_OP_READ, 0, bh); } return 0; } @@ -2582,7 +2649,7 @@ int nobh_write_begin(struct address_space *mapping, if (block_start < from || block_end > to) { lock_buffer(bh); bh->b_end_io = end_buffer_read_nobh; - submit_bh(READ, bh); + submit_bh(REQ_OP_READ, 0, bh); nr_reads++; } } @@ -2852,7 +2919,7 @@ int block_truncate_page(struct address_space *mapping, if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) { err = -EIO; - ll_rw_block(READ, 1, &bh); + ll_rw_block(REQ_OP_READ, 0, 1, &bh); wait_on_buffer(bh); /* Uhhuh. Read error. Complain and punt. */ if (!buffer_uptodate(bh)) @@ -2949,7 +3016,7 @@ static void end_bio_bh_io_sync(struct bio *bio) * errors, this only handles the "we need to be able to * do IO at the final sector" case. 
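 * A worked example with assumed numbers (editorial sketch): a 4K
 * buffer whose last two sectors lie beyond the end of the device gets
 * truncated_bytes = 2 * 512 = 1024; the final bio_vec is shrunk by
 * that amount and, for REQ_OP_READ, the truncated tail of the page is
 * zeroed instead of being left as stale data.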
*/ -void guard_bio_eod(int rw, struct bio *bio) +void guard_bio_eod(int op, struct bio *bio) { sector_t maxsector; struct bio_vec *bvec = &bio->bi_io_vec[bio->bi_vcnt - 1]; @@ -2979,13 +3046,13 @@ void guard_bio_eod(int rw, struct bio *bio) bvec->bv_len -= truncated_bytes; /* ..and clear the end of the buffer for reads */ - if ((rw & RW_MASK) == READ) { + if (op == REQ_OP_READ) { zero_user(bvec->bv_page, bvec->bv_offset + bvec->bv_len, truncated_bytes); } } -static int submit_bh_wbc(int rw, struct buffer_head *bh, +static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, unsigned long bio_flags, struct writeback_control *wbc) { struct bio *bio; @@ -2999,7 +3066,7 @@ static int submit_bh_wbc(int rw, struct buffer_head *bh, /* * Only clear out a write error when rewriting */ - if (test_set_buffer_req(bh) && (rw & WRITE)) + if (test_set_buffer_req(bh) && (op == REQ_OP_WRITE)) clear_buffer_write_io_error(bh); /* @@ -3024,39 +3091,42 @@ static int submit_bh_wbc(int rw, struct buffer_head *bh, bio->bi_flags |= bio_flags; /* Take care of bh's that straddle the end of the device */ - guard_bio_eod(rw, bio); + guard_bio_eod(op, bio); if (buffer_meta(bh)) - rw |= REQ_META; + op_flags |= REQ_META; if (buffer_prio(bh)) - rw |= REQ_PRIO; + op_flags |= REQ_PRIO; + bio_set_op_attrs(bio, op, op_flags); - submit_bio(rw, bio); + submit_bio(bio); return 0; } -int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags) +int _submit_bh(int op, int op_flags, struct buffer_head *bh, + unsigned long bio_flags) { - return submit_bh_wbc(rw, bh, bio_flags, NULL); + return submit_bh_wbc(op, op_flags, bh, bio_flags, NULL); } EXPORT_SYMBOL_GPL(_submit_bh); -int submit_bh(int rw, struct buffer_head *bh) +int submit_bh(int op, int op_flags, struct buffer_head *bh) { - return submit_bh_wbc(rw, bh, 0, NULL); + return submit_bh_wbc(op, op_flags, bh, 0, NULL); } EXPORT_SYMBOL(submit_bh); /** * ll_rw_block: low-level access to block devices (DEPRECATED) - * @rw: whether to %READ or %WRITE or maybe %READA (readahead) + * @op: whether to %READ or %WRITE + * @op_flags: rq_flag_bits * @nr: number of &struct buffer_heads in the array * @bhs: array of pointers to &struct buffer_head * * ll_rw_block() takes an array of pointers to &struct buffer_heads, and - * requests an I/O operation on them, either a %READ or a %WRITE. The third - * %READA option is described in the documentation for generic_make_request() - * which ll_rw_block() calls. + * requests an I/O operation on them, either a %REQ_OP_READ or a %REQ_OP_WRITE. + * @op_flags contains flags modifying the detailed I/O behavior, most notably + * %REQ_RAHEAD. * * This function drops any buffer that it cannot get a lock on (with the * BH_Lock state bit), any buffer that appears to be clean when doing a write @@ -3072,7 +3142,7 @@ EXPORT_SYMBOL(submit_bh); * All of the buffers must be for the same device, and must also be a * multiple of the current approved size for the device. 
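 * The call sites converted elsewhere in this patch illustrate the new
 * convention:
 *
 *	ll_rw_block(READ, 1, &bh)  -> ll_rw_block(REQ_OP_READ, 0, 1, &bh)
 *	ll_rw_block(READA, 1, &bh) -> ll_rw_block(REQ_OP_READ, REQ_RAHEAD, 1, &bh)
 *	ll_rw_block(WRITE, 1, &bh) -> ll_rw_block(REQ_OP_WRITE, 0, 1, &bh)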
*/ -void ll_rw_block(int rw, int nr, struct buffer_head *bhs[]) +void ll_rw_block(int op, int op_flags, int nr, struct buffer_head *bhs[]) { int i; @@ -3081,18 +3151,18 @@ void ll_rw_block(int rw, int nr, struct buffer_head *bhs[]) if (!trylock_buffer(bh)) continue; - if (rw == WRITE) { + if (op == WRITE) { if (test_clear_buffer_dirty(bh)) { bh->b_end_io = end_buffer_write_sync; get_bh(bh); - submit_bh(WRITE, bh); + submit_bh(op, op_flags, bh); continue; } } else { if (!buffer_uptodate(bh)) { bh->b_end_io = end_buffer_read_sync; get_bh(bh); - submit_bh(rw, bh); + submit_bh(op, op_flags, bh); continue; } } @@ -3101,7 +3171,7 @@ void ll_rw_block(int rw, int nr, struct buffer_head *bhs[]) } EXPORT_SYMBOL(ll_rw_block); -void write_dirty_buffer(struct buffer_head *bh, int rw) +void write_dirty_buffer(struct buffer_head *bh, int op_flags) { lock_buffer(bh); if (!test_clear_buffer_dirty(bh)) { @@ -3110,7 +3180,7 @@ void write_dirty_buffer(struct buffer_head *bh, int rw) } bh->b_end_io = end_buffer_write_sync; get_bh(bh); - submit_bh(rw, bh); + submit_bh(REQ_OP_WRITE, op_flags, bh); } EXPORT_SYMBOL(write_dirty_buffer); @@ -3119,7 +3189,7 @@ EXPORT_SYMBOL(write_dirty_buffer); * and then start new I/O and then wait upon it. The caller must have a ref on * the buffer_head. */ -int __sync_dirty_buffer(struct buffer_head *bh, int rw) +int __sync_dirty_buffer(struct buffer_head *bh, int op_flags) { int ret = 0; @@ -3128,7 +3198,7 @@ int __sync_dirty_buffer(struct buffer_head *bh, int rw) if (test_clear_buffer_dirty(bh)) { get_bh(bh); bh->b_end_io = end_buffer_write_sync; - ret = submit_bh(rw, bh); + ret = submit_bh(REQ_OP_WRITE, op_flags, bh); wait_on_buffer(bh); if (!ret && !buffer_uptodate(bh)) ret = -EIO; @@ -3391,7 +3461,7 @@ int bh_submit_read(struct buffer_head *bh) get_bh(bh); bh->b_end_io = end_buffer_read_sync; - submit_bh(READ, bh); + submit_bh(REQ_OP_READ, 0, bh); wait_on_buffer(bh); if (buffer_uptodate(bh)) return 0; diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c index 861d611b8c05..ce5f345d70f5 100644 --- a/fs/cachefiles/interface.c +++ b/fs/cachefiles/interface.c @@ -380,7 +380,7 @@ static void cachefiles_sync_cache(struct fscache_cache *_cache) * check if the backing cache is updated to FS-Cache * - called by FS-Cache when evaluates if need to invalidate the cache */ -static bool cachefiles_check_consistency(struct fscache_operation *op) +static int cachefiles_check_consistency(struct fscache_operation *op) { struct cachefiles_object *object; struct cachefiles_cache *cache; diff --git a/fs/cachefiles/proc.c b/fs/cachefiles/proc.c index eccd33941199..125b90f6c796 100644 --- a/fs/cachefiles/proc.c +++ b/fs/cachefiles/proc.c @@ -93,7 +93,6 @@ static int cachefiles_histogram_open(struct inode *inode, struct file *file) } static const struct file_operations cachefiles_histogram_fops = { - .owner = THIS_MODULE, .open = cachefiles_histogram_open, .read = seq_read, .llseek = seq_lseek, diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 43098cd9602b..26a9d10d75e9 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -257,12 +257,12 @@ static int ceph_readpage(struct file *filp, struct page *page) /* * Finish an async read(ahead) op. */ -static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg) +static void finish_read(struct ceph_osd_request *req) { struct inode *inode = req->r_inode; struct ceph_osd_data *osd_data; - int rc = req->r_result; - int bytes = le32_to_cpu(msg->hdr.data_len); + int rc = req->r_result <= 0 ? 
req->r_result : 0; + int bytes = req->r_result >= 0 ? req->r_result : 0; int num_pages; int i; @@ -276,8 +276,10 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg) for (i = 0; i < num_pages; i++) { struct page *page = osd_data->pages[i]; - if (rc < 0 && rc != -ENOENT) + if (rc < 0 && rc != -ENOENT) { + ceph_fscache_readpage_cancel(inode, page); goto unlock; + } if (bytes < (int)PAGE_SIZE) { /* zero (remainder of) page */ int s = bytes < 0 ? 0 : bytes; @@ -376,8 +378,6 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) req->r_callback = finish_read; req->r_inode = inode; - ceph_osdc_build_request(req, off, NULL, vino.snap, NULL); - dout("start_read %p starting %p %lld~%lld\n", inode, req, off, len); ret = ceph_osdc_start_request(osdc, req, false); if (ret < 0) @@ -537,8 +537,6 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb)) set_bdi_congested(&fsc->backing_dev_info, BLK_RW_ASYNC); - ceph_readpage_to_fscache(inode, page); - set_page_writeback(page); err = ceph_osdc_writepages(osdc, ceph_vino(inode), &ci->i_layout, snapc, @@ -546,11 +544,21 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) truncate_seq, truncate_size, &inode->i_mtime, &page, 1); if (err < 0) { - dout("writepage setting page/mapping error %d %p\n", err, page); + struct writeback_control tmp_wbc; + if (!wbc) + wbc = &tmp_wbc; + if (err == -ERESTARTSYS) { + /* killed by SIGKILL */ + dout("writepage interrupted page %p\n", page); + redirty_page_for_writepage(wbc, page); + end_page_writeback(page); + goto out; + } + dout("writepage setting page/mapping error %d %p\n", + err, page); SetPageError(page); mapping_set_error(&inode->i_data, err); - if (wbc) - wbc->pages_skipped++; + wbc->pages_skipped++; } else { dout("writepage cleaned page %p\n", page); err = 0; /* vfs expects us to return 0 */ @@ -571,12 +579,16 @@ static int ceph_writepage(struct page *page, struct writeback_control *wbc) BUG_ON(!inode); ihold(inode); err = writepage_nounlock(page, wbc); + if (err == -ERESTARTSYS) { + /* direct memory reclaimer was killed by SIGKILL. return 0 + * to prevent caller from setting mapping/page error */ + err = 0; + } unlock_page(page); iput(inode); return err; } - /* * lame release_pages helper. release_pages() isn't exported to * modules. @@ -600,8 +612,7 @@ static void ceph_release_pages(struct page **pages, int num) * If we get an error, set the mapping error bit, but not the individual * page error bits. */ -static void writepages_finish(struct ceph_osd_request *req, - struct ceph_msg *msg) +static void writepages_finish(struct ceph_osd_request *req) { struct inode *inode = req->r_inode; struct ceph_inode_info *ci = ceph_inode(inode); @@ -615,7 +626,6 @@ static void writepages_finish(struct ceph_osd_request *req, struct ceph_fs_client *fsc = ceph_inode_to_client(inode); bool remove_page; - dout("writepages_finish %p rc %d\n", inode, rc); if (rc < 0) mapping_set_error(mapping, rc); @@ -650,6 +660,9 @@ static void writepages_finish(struct ceph_osd_request *req, clear_bdi_congested(&fsc->backing_dev_info, BLK_RW_ASYNC); + if (rc < 0) + SetPageError(page); + ceph_put_snap_context(page_snap_context(page)); page->private = 0; ClearPagePrivate(page); @@ -718,8 +731,11 @@ static int ceph_writepages_start(struct address_space *mapping, (wbc->sync_mode == WB_SYNC_ALL ? 
"ALL" : "HOLD")); if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { - pr_warn("writepage_start %p on forced umount\n", inode); - truncate_pagecache(inode, 0); + if (ci->i_wrbuffer_ref > 0) { + pr_warn_ratelimited( + "writepage_start %p %lld forced umount\n", + inode, ceph_ino(inode)); + } mapping_set_error(mapping, -EIO); return -EIO; /* we're in a forced umount, don't write! */ } @@ -1063,10 +1079,7 @@ new_request: pages = NULL; } - vino = ceph_vino(inode); - ceph_osdc_build_request(req, offset, snapc, vino.snap, - &inode->i_mtime); - + req->r_mtime = inode->i_mtime; rc = ceph_osdc_start_request(&fsc->client->osdc, req, true); BUG_ON(rc); req = NULL; @@ -1099,8 +1112,7 @@ release_pvec_pages: mapping->writeback_index = index; out: - if (req) - ceph_osdc_put_request(req); + ceph_osdc_put_request(req); ceph_put_snap_context(snapc); dout("writepages done, rc = %d\n", rc); return rc; @@ -1134,6 +1146,7 @@ static int ceph_update_writeable_page(struct file *file, struct page *page) { struct inode *inode = file_inode(file); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); loff_t page_off = pos & PAGE_MASK; int pos_in_page = pos & ~PAGE_MASK; @@ -1142,6 +1155,12 @@ static int ceph_update_writeable_page(struct file *file, int r; struct ceph_snap_context *snapc, *oldest; + if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { + dout(" page %p forced umount\n", page); + unlock_page(page); + return -EIO; + } + retry_locked: /* writepages currently holds page lock, but if we change that later, */ wait_on_page_writeback(page); @@ -1165,7 +1184,7 @@ retry_locked: snapc = ceph_get_snap_context(snapc); unlock_page(page); ceph_queue_writeback(inode); - r = wait_event_interruptible(ci->i_cap_wq, + r = wait_event_killable(ci->i_cap_wq, context_is_writeable_or_written(inode, snapc)); ceph_put_snap_context(snapc); if (r == -ERESTARTSYS) @@ -1311,6 +1330,17 @@ const struct address_space_operations ceph_aops = { .direct_IO = ceph_direct_io, }; +static void ceph_block_sigs(sigset_t *oldset) +{ + sigset_t mask; + siginitsetinv(&mask, sigmask(SIGKILL)); + sigprocmask(SIG_BLOCK, &mask, oldset); +} + +static void ceph_restore_sigs(sigset_t *oldset) +{ + sigprocmask(SIG_SETMASK, oldset, NULL); +} /* * vm ops @@ -1323,6 +1353,9 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) struct page *pinned_page = NULL; loff_t off = vmf->pgoff << PAGE_SHIFT; int want, got, ret; + sigset_t oldset; + + ceph_block_sigs(&oldset); dout("filemap_fault %p %llx.%llx %llu~%zd trying to get caps\n", inode, ceph_vinop(inode), off, (size_t)PAGE_SIZE); @@ -1330,17 +1363,12 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; else want = CEPH_CAP_FILE_CACHE; - while (1) { - got = 0; - ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, - -1, &got, &pinned_page); - if (ret == 0) - break; - if (ret != -ERESTARTSYS) { - WARN_ON(1); - return VM_FAULT_SIGBUS; - } - } + + got = 0; + ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, -1, &got, &pinned_page); + if (ret < 0) + goto out_restore; + dout("filemap_fault %p %llu~%zd got cap refs on %s\n", inode, off, (size_t)PAGE_SIZE, ceph_cap_string(got)); @@ -1357,7 +1385,7 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) ceph_put_cap_refs(ci, got); if (ret != -EAGAIN) - return ret; + goto out_restore; /* read inline data */ if (off >= PAGE_SIZE) { @@ -1371,15 +1399,18 @@ static int 
ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) ~__GFP_FS)); if (!page) { ret = VM_FAULT_OOM; - goto out; + goto out_inline; } ret1 = __ceph_do_getattr(inode, page, CEPH_STAT_CAP_INLINE_DATA, true); if (ret1 < 0 || off >= i_size_read(inode)) { unlock_page(page); put_page(page); - ret = VM_FAULT_SIGBUS; - goto out; + if (ret1 < 0) + ret = ret1; + else + ret = VM_FAULT_SIGBUS; + goto out_inline; } if (ret1 < PAGE_SIZE) zero_user_segment(page, ret1, PAGE_SIZE); @@ -1388,10 +1419,15 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) SetPageUptodate(page); vmf->page = page; ret = VM_FAULT_MAJOR | VM_FAULT_LOCKED; +out_inline: + dout("filemap_fault %p %llu~%zd read inline data ret %d\n", + inode, off, (size_t)PAGE_SIZE, ret); } -out: - dout("filemap_fault %p %llu~%zd read inline data ret %d\n", - inode, off, (size_t)PAGE_SIZE, ret); +out_restore: + ceph_restore_sigs(&oldset); + if (ret < 0) + ret = (ret == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS; + return ret; } @@ -1409,10 +1445,13 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) loff_t size = i_size_read(inode); size_t len; int want, got, ret; + sigset_t oldset; prealloc_cf = ceph_alloc_cap_flush(); if (!prealloc_cf) - return VM_FAULT_SIGBUS; + return VM_FAULT_OOM; + + ceph_block_sigs(&oldset); if (ci->i_inline_version != CEPH_INLINE_NONE) { struct page *locked_page = NULL; @@ -1423,10 +1462,8 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) ret = ceph_uninline_data(vma->vm_file, locked_page); if (locked_page) unlock_page(locked_page); - if (ret < 0) { - ret = VM_FAULT_SIGBUS; + if (ret < 0) goto out_free; - } } if (off + PAGE_SIZE <= size) @@ -1440,45 +1477,36 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; else want = CEPH_CAP_FILE_BUFFER; - while (1) { - got = 0; - ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, off + len, - &got, NULL); - if (ret == 0) - break; - if (ret != -ERESTARTSYS) { - WARN_ON(1); - ret = VM_FAULT_SIGBUS; - goto out_free; - } - } + + got = 0; + ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, off + len, + &got, NULL); + if (ret < 0) + goto out_free; + dout("page_mkwrite %p %llu~%zd got cap refs on %s\n", inode, off, len, ceph_cap_string(got)); /* Update time before taking page lock */ file_update_time(vma->vm_file); - lock_page(page); + do { + lock_page(page); - ret = VM_FAULT_NOPAGE; - if ((off > size) || - (page->mapping != inode->i_mapping)) { - unlock_page(page); - goto out; - } + if ((off > size) || (page->mapping != inode->i_mapping)) { + unlock_page(page); + ret = VM_FAULT_NOPAGE; + break; + } + + ret = ceph_update_writeable_page(vma->vm_file, off, len, page); + if (ret >= 0) { + /* success. we'll keep the page locked. */ + set_page_dirty(page); + ret = VM_FAULT_LOCKED; + } + } while (ret == -EAGAIN); - ret = ceph_update_writeable_page(vma->vm_file, off, len, page); - if (ret >= 0) { - /* success. we'll keep the page locked. */ - set_page_dirty(page); - ret = VM_FAULT_LOCKED; - } else { - if (ret == -ENOMEM) - ret = VM_FAULT_OOM; - else - ret = VM_FAULT_SIGBUS; - } -out: if (ret == VM_FAULT_LOCKED || ci->i_inline_version != CEPH_INLINE_NONE) { int dirty; @@ -1495,8 +1523,10 @@ out: inode, off, len, ceph_cap_string(got), ret); ceph_put_cap_refs(ci, got); out_free: + ceph_restore_sigs(&oldset); ceph_free_cap_flush(prealloc_cf); - + if (ret < 0) + ret = (ret == -ENOMEM) ? 
VM_FAULT_OOM : VM_FAULT_SIGBUS; return ret; } @@ -1614,7 +1644,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) goto out; } - ceph_osdc_build_request(req, 0, NULL, CEPH_NOSNAP, &inode->i_mtime); + req->r_mtime = inode->i_mtime; err = ceph_osdc_start_request(&fsc->client->osdc, req, false); if (!err) err = ceph_osdc_wait_request(&fsc->client->osdc, req); @@ -1657,7 +1687,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) goto out_put; } - ceph_osdc_build_request(req, 0, NULL, CEPH_NOSNAP, &inode->i_mtime); + req->r_mtime = inode->i_mtime; err = ceph_osdc_start_request(&fsc->client->osdc, req, false); if (!err) err = ceph_osdc_wait_request(&fsc->client->osdc, req); @@ -1758,9 +1788,11 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool) rd_req->r_flags = CEPH_OSD_FLAG_READ; osd_req_op_init(rd_req, 0, CEPH_OSD_OP_STAT, 0); rd_req->r_base_oloc.pool = pool; - snprintf(rd_req->r_base_oid.name, sizeof(rd_req->r_base_oid.name), - "%llx.00000000", ci->i_vino.ino); - rd_req->r_base_oid.name_len = strlen(rd_req->r_base_oid.name); + ceph_oid_printf(&rd_req->r_base_oid, "%llx.00000000", ci->i_vino.ino); + + err = ceph_osdc_alloc_messages(rd_req, GFP_NOFS); + if (err) + goto out_unlock; wr_req = ceph_osdc_alloc_request(&fsc->client->osdc, NULL, 1, false, GFP_NOFS); @@ -1769,11 +1801,14 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool) goto out_unlock; } - wr_req->r_flags = CEPH_OSD_FLAG_WRITE | - CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK; + wr_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ACK; osd_req_op_init(wr_req, 0, CEPH_OSD_OP_CREATE, CEPH_OSD_OP_FLAG_EXCL); - wr_req->r_base_oloc.pool = pool; - wr_req->r_base_oid = rd_req->r_base_oid; + ceph_oloc_copy(&wr_req->r_base_oloc, &rd_req->r_base_oloc); + ceph_oid_copy(&wr_req->r_base_oid, &rd_req->r_base_oid); + + err = ceph_osdc_alloc_messages(wr_req, GFP_NOFS); + if (err) + goto out_unlock; /* one page should be large enough for STAT data */ pages = ceph_alloc_page_vector(1, GFP_KERNEL); @@ -1784,12 +1819,9 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool) osd_req_op_raw_data_in_pages(rd_req, 0, pages, PAGE_SIZE, 0, false, true); - ceph_osdc_build_request(rd_req, 0, NULL, CEPH_NOSNAP, - &ci->vfs_inode.i_mtime); err = ceph_osdc_start_request(&fsc->client->osdc, rd_req, false); - ceph_osdc_build_request(wr_req, 0, NULL, CEPH_NOSNAP, - &ci->vfs_inode.i_mtime); + wr_req->r_mtime = ci->vfs_inode.i_mtime; err2 = ceph_osdc_start_request(&fsc->client->osdc, wr_req, false); if (!err) @@ -1823,10 +1855,8 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool) out_unlock: up_write(&mdsc->pool_perm_rwsem); - if (rd_req) - ceph_osdc_put_request(rd_req); - if (wr_req) - ceph_osdc_put_request(wr_req); + ceph_osdc_put_request(rd_req); + ceph_osdc_put_request(wr_req); out: if (!err) err = have; diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index a351480dbabc..238c55b01723 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c @@ -25,6 +25,7 @@ #include "cache.h" struct ceph_aux_inode { + u64 version; struct timespec mtime; loff_t size; }; @@ -69,15 +70,8 @@ int ceph_fscache_register_fs(struct ceph_fs_client* fsc) fsc->fscache = fscache_acquire_cookie(ceph_cache_netfs.primary_index, &ceph_fscache_fsid_object_def, fsc, true); - - if (fsc->fscache == NULL) { + if (!fsc->fscache) pr_err("Unable to register fsid: %p fscache cookie", fsc); - return 0; - } - - fsc->revalidate_wq = alloc_workqueue("ceph-revalidate", 0, 1); - if (fsc->revalidate_wq
== NULL) - return -ENOMEM; return 0; } @@ -105,6 +99,7 @@ static uint16_t ceph_fscache_inode_get_aux(const void *cookie_netfs_data, const struct inode* inode = &ci->vfs_inode; memset(&aux, 0, sizeof(aux)); + aux.version = ci->i_version; aux.mtime = inode->i_mtime; aux.size = i_size_read(inode); @@ -131,6 +126,7 @@ static enum fscache_checkaux ceph_fscache_inode_check_aux( return FSCACHE_CHECKAUX_OBSOLETE; memset(&aux, 0, sizeof(aux)); + aux.version = ci->i_version; aux.mtime = inode->i_mtime; aux.size = i_size_read(inode); @@ -181,32 +177,26 @@ static const struct fscache_cookie_def ceph_fscache_inode_object_def = { .now_uncached = ceph_fscache_inode_now_uncached, }; -void ceph_fscache_register_inode_cookie(struct ceph_fs_client* fsc, - struct ceph_inode_info* ci) +void ceph_fscache_register_inode_cookie(struct inode *inode) { - struct inode* inode = &ci->vfs_inode; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); /* No caching for filesystem */ if (fsc->fscache == NULL) return; /* Only cache for regular files that are read only */ - if ((ci->vfs_inode.i_mode & S_IFREG) == 0) + if (!S_ISREG(inode->i_mode)) return; - /* Avoid multiple racing open requests */ - inode_lock(inode); - - if (ci->fscache) - goto done; - - ci->fscache = fscache_acquire_cookie(fsc->fscache, - &ceph_fscache_inode_object_def, - ci, true); - fscache_check_consistency(ci->fscache); -done: + inode_lock_nested(inode, I_MUTEX_CHILD); + if (!ci->fscache) { + ci->fscache = fscache_acquire_cookie(fsc->fscache, + &ceph_fscache_inode_object_def, + ci, false); + } inode_unlock(inode); - } void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci) @@ -222,6 +212,34 @@ void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci) fscache_relinquish_cookie(cookie, 0); } +static bool ceph_fscache_can_enable(void *data) +{ + struct inode *inode = data; + return !inode_is_open_for_write(inode); +} + +void ceph_fscache_file_set_cookie(struct inode *inode, struct file *filp) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + + if (!fscache_cookie_valid(ci->fscache)) + return; + + if (inode_is_open_for_write(inode)) { + dout("fscache_file_set_cookie %p %p disabling cache\n", + inode, filp); + fscache_disable_cookie(ci->fscache, false); + fscache_uncache_all_inode_pages(ci->fscache, inode); + } else { + fscache_enable_cookie(ci->fscache, ceph_fscache_can_enable, + inode); + if (fscache_cookie_enabled(ci->fscache)) { + dout("fscache_file_set_cookie %p %p enabling cache\n", + inode, filp); + } + } +} + static void ceph_vfs_readpage_complete(struct page *page, void *data, int error) { if (!error) @@ -236,10 +254,9 @@ static void ceph_vfs_readpage_complete_unlock(struct page *page, void *data, int unlock_page(page); } -static inline int cache_valid(struct ceph_inode_info *ci) +static inline bool cache_valid(struct ceph_inode_info *ci) { - return ((ceph_caps_issued(ci) & CEPH_CAP_FILE_CACHE) && - (ci->i_fscache_gen == ci->i_rdcache_gen)); + return ci->i_fscache_gen == ci->i_rdcache_gen; } @@ -332,69 +349,27 @@ void ceph_invalidate_fscache_page(struct inode* inode, struct page *page) void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc) { - if (fsc->revalidate_wq) - destroy_workqueue(fsc->revalidate_wq); - fscache_relinquish_cookie(fsc->fscache, 0); fsc->fscache = NULL; } -static void ceph_revalidate_work(struct work_struct *work) -{ - int issued; - u32 orig_gen; - struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info, -
i_revalidate_work); - struct inode *inode = &ci->vfs_inode; - - spin_lock(&ci->i_ceph_lock); - issued = __ceph_caps_issued(ci, NULL); - orig_gen = ci->i_rdcache_gen; - spin_unlock(&ci->i_ceph_lock); - - if (!(issued & CEPH_CAP_FILE_CACHE)) { - dout("revalidate_work lost cache before validation %p\n", - inode); - goto out; - } - - if (!fscache_check_consistency(ci->fscache)) - fscache_invalidate(ci->fscache); - - spin_lock(&ci->i_ceph_lock); - /* Update the new valid generation (backwards sanity check too) */ - if (orig_gen > ci->i_fscache_gen) { - ci->i_fscache_gen = orig_gen; - } - spin_unlock(&ci->i_ceph_lock); - -out: - iput(&ci->vfs_inode); -} - -void ceph_queue_revalidate(struct inode *inode) +/* + * caller should hold CEPH_CAP_FILE_{RD,CACHE} + */ +void ceph_fscache_revalidate_cookie(struct ceph_inode_info *ci) { - struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb); - struct ceph_inode_info *ci = ceph_inode(inode); - - if (fsc->revalidate_wq == NULL || ci->fscache == NULL) + if (cache_valid(ci)) return; - ihold(inode); - - if (queue_work(ceph_sb_to_client(inode->i_sb)->revalidate_wq, - &ci->i_revalidate_work)) { - dout("ceph_queue_revalidate %p\n", inode); - } else { - dout("ceph_queue_revalidate %p failed\n)", inode); - iput(inode); + /* reuse i_truncate_mutex. There should be no pending + * truncate while the caller holds CEPH_CAP_FILE_RD */ + mutex_lock(&ci->i_truncate_mutex); + if (!cache_valid(ci)) { + if (fscache_check_consistency(ci->fscache)) + fscache_invalidate(ci->fscache); + spin_lock(&ci->i_ceph_lock); + ci->i_fscache_gen = ci->i_rdcache_gen; + spin_unlock(&ci->i_ceph_lock); } -} - -void ceph_fscache_inode_init(struct ceph_inode_info *ci) -{ - ci->fscache = NULL; - /* The first load is verifed cookie open time */ - ci->i_fscache_gen = 1; - INIT_WORK(&ci->i_revalidate_work, ceph_revalidate_work); + mutex_unlock(&ci->i_truncate_mutex); } diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h index 5ac591bd012b..7e72c7594f0c 100644 --- a/fs/ceph/cache.h +++ b/fs/ceph/cache.h @@ -34,10 +34,10 @@ void ceph_fscache_unregister(void); int ceph_fscache_register_fs(struct ceph_fs_client* fsc); void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc); -void ceph_fscache_inode_init(struct ceph_inode_info *ci); -void ceph_fscache_register_inode_cookie(struct ceph_fs_client* fsc, - struct ceph_inode_info* ci); +void ceph_fscache_register_inode_cookie(struct inode *inode); void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci); +void ceph_fscache_file_set_cookie(struct inode *inode, struct file *filp); +void ceph_fscache_revalidate_cookie(struct ceph_inode_info *ci); int ceph_readpage_from_fscache(struct inode *inode, struct page *page); int ceph_readpages_from_fscache(struct inode *inode, @@ -46,12 +46,11 @@ int ceph_readpages_from_fscache(struct inode *inode, unsigned *nr_pages); void ceph_readpage_to_fscache(struct inode *inode, struct page *page); void ceph_invalidate_fscache_page(struct inode* inode, struct page *page); -void ceph_queue_revalidate(struct inode *inode); -static inline void ceph_fscache_update_objectsize(struct inode *inode) +static inline void ceph_fscache_inode_init(struct ceph_inode_info *ci) { - struct ceph_inode_info *ci = ceph_inode(inode); - fscache_attr_changed(ci->fscache); + ci->fscache = NULL; + ci->i_fscache_gen = 0; } static inline void ceph_fscache_invalidate(struct inode *inode) @@ -88,6 +87,11 @@ static inline void ceph_fscache_readpages_cancel(struct inode *inode, return fscache_readpages_cancel(ci->fscache, pages); } +static
inline void ceph_disable_fscache_readpage(struct ceph_inode_info *ci) +{ + ci->i_fscache_gen = ci->i_rdcache_gen - 1; +} + #else static inline int ceph_fscache_register(void) @@ -112,8 +116,20 @@ static inline void ceph_fscache_inode_init(struct ceph_inode_info *ci) { } -static inline void ceph_fscache_register_inode_cookie(struct ceph_fs_client* parent_fsc, - struct ceph_inode_info* ci) +static inline void ceph_fscache_register_inode_cookie(struct inode *inode) +{ +} + +static inline void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci) +{ +} + +static inline void ceph_fscache_file_set_cookie(struct inode *inode, + struct file *filp) +{ +} + +static inline void ceph_fscache_revalidate_cookie(struct ceph_inode_info *ci) { } @@ -141,10 +157,6 @@ static inline void ceph_readpage_to_fscache(struct inode *inode, { } -static inline void ceph_fscache_update_objectsize(struct inode *inode) -{ -} - static inline void ceph_fscache_invalidate(struct inode *inode) { } @@ -154,10 +166,6 @@ static inline void ceph_invalidate_fscache_page(struct inode *inode, { } -static inline void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci) -{ -} - static inline int ceph_release_fscache_page(struct page *page, gfp_t gfp) { return 1; @@ -173,7 +181,7 @@ static inline void ceph_fscache_readpages_cancel(struct inode *inode, { } -static inline void ceph_queue_revalidate(struct inode *inode) +static inline void ceph_disable_fscache_readpage(struct ceph_inode_info *ci) { } diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index cfaeef18cbca..6f60d0a3d0f9 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1656,7 +1656,7 @@ retry_locked: */ if ((!is_delayed || mdsc->stopping) && !S_ISDIR(inode->i_mode) && /* ignore readdir cache */ - ci->i_wrbuffer_ref == 0 && /* no dirty pages... */ + !(ci->i_wb_ref || ci->i_wrbuffer_ref) && /* no dirty pages... 
*/ inode->i_data.nrpages && /* have cached pages */ (revoking & (CEPH_CAP_FILE_CACHE| CEPH_CAP_FILE_LAZYIO)) && /* or revoking cache */ @@ -1698,8 +1698,8 @@ retry_locked: revoking = cap->implemented & ~cap->issued; dout(" mds%d cap %p used %s issued %s implemented %s revoking %s\n", - cap->mds, cap, ceph_cap_string(cap->issued), - ceph_cap_string(cap_used), + cap->mds, cap, ceph_cap_string(cap_used), + ceph_cap_string(cap->issued), ceph_cap_string(cap->implemented), ceph_cap_string(revoking)); @@ -2317,7 +2317,7 @@ again: /* make sure file is actually open */ file_wanted = __ceph_caps_file_wanted(ci); - if ((file_wanted & need) == 0) { + if ((file_wanted & need) != need) { dout("try_get_cap_refs need %s file_wanted %s, EBADF\n", ceph_cap_string(need), ceph_cap_string(file_wanted)); *err = -EBADF; @@ -2393,6 +2393,9 @@ again: snap_rwsem_locked = true; } *got = need | (have & want); + if ((need & CEPH_CAP_FILE_RD) && + !(*got & CEPH_CAP_FILE_CACHE)) + ceph_disable_fscache_readpage(ci); __take_cap_refs(ci, *got, true); ret = 1; } @@ -2412,12 +2415,26 @@ again: goto out_unlock; } - if (!__ceph_is_any_caps(ci) && - ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { - dout("get_cap_refs %p forced umount\n", inode); - *err = -EIO; - ret = 1; - goto out_unlock; + if (ci->i_ceph_flags & CEPH_I_CAP_DROPPED) { + int mds_wanted; + if (ACCESS_ONCE(mdsc->fsc->mount_state) == + CEPH_MOUNT_SHUTDOWN) { + dout("get_cap_refs %p forced umount\n", inode); + *err = -EIO; + ret = 1; + goto out_unlock; + } + mds_wanted = __ceph_caps_mds_wanted(ci); + if ((mds_wanted & need) != need) { + dout("get_cap_refs %p caps were dropped" + " (session killed?)\n", inode); + *err = -ESTALE; + ret = 1; + goto out_unlock; + } + if ((mds_wanted & file_wanted) == + (file_wanted & (CEPH_CAP_FILE_RD|CEPH_CAP_FILE_WR))) + ci->i_ceph_flags &= ~CEPH_I_CAP_DROPPED; } dout("get_cap_refs %p have %s needed %s\n", inode, @@ -2487,7 +2504,7 @@ int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, if (err == -EAGAIN) continue; if (err < 0) - return err; + ret = err; } else { ret = wait_event_interruptible(ci->i_cap_wq, try_get_cap_refs(ci, need, want, endoff, @@ -2496,8 +2513,15 @@ int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, continue; if (err < 0) ret = err; - if (ret < 0) - return ret; + } + if (ret < 0) { + if (err == -ESTALE) { + /* session was killed, try renew caps */ + ret = ceph_renew_caps(&ci->vfs_inode); + if (ret == 0) + continue; + } + return ret; } if (ci->i_inline_version != CEPH_INLINE_NONE && @@ -2533,6 +2557,9 @@ int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, break; } + if ((_got & CEPH_CAP_FILE_RD) && (_got & CEPH_CAP_FILE_CACHE)) + ceph_fscache_revalidate_cookie(ci); + *got = _got; return 0; } @@ -2774,7 +2801,6 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc, bool writeback = false; bool queue_trunc = false; bool queue_invalidate = false; - bool queue_revalidate = false; bool deleted_inode = false; bool fill_inline = false; @@ -2807,7 +2833,7 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc, if (!S_ISDIR(inode->i_mode) && /* don't invalidate readdir cache */ ((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) && (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 && - !ci->i_wrbuffer_ref) { + !(ci->i_wrbuffer_ref || ci->i_wb_ref)) { if (try_nonblocking_invalidate(inode)) { /* there were locked pages.. invalidate later in a separate thread. 
*/ @@ -2816,8 +2842,6 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc, ci->i_rdcache_revoking = ci->i_rdcache_gen; } } - - ceph_fscache_invalidate(inode); } /* side effects now are allowed */ @@ -2859,11 +2883,6 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc, } } - /* Do we need to revalidate our fscache cookie. Don't bother on the - * first cache cap as we already validate at cookie creation time. */ - if ((issued & CEPH_CAP_FILE_CACHE) && ci->i_rdcache_gen > 1) - queue_revalidate = true; - if (newcaps & CEPH_CAP_ANY_RD) { /* ctime/mtime/atime? */ ceph_decode_timespec(&mtime, &grant->mtime); @@ -2972,11 +2991,8 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc, if (fill_inline) ceph_fill_inline_data(inode, NULL, inline_data, inline_len); - if (queue_trunc) { + if (queue_trunc) ceph_queue_vmtruncate(inode); - ceph_queue_revalidate(inode); - } else if (queue_revalidate) - ceph_queue_revalidate(inode); if (writeback) /* @@ -3178,10 +3194,8 @@ static void handle_cap_trunc(struct inode *inode, truncate_seq, truncate_size, size); spin_unlock(&ci->i_ceph_lock); - if (queue_trunc) { + if (queue_trunc) ceph_queue_vmtruncate(inode); - ceph_fscache_invalidate(inode); - } } /* @@ -3226,6 +3240,8 @@ retry: if (target < 0) { __ceph_remove_cap(cap, false); + if (!ci->i_auth_cap) + ci->i_ceph_flags |= CEPH_I_CAP_DROPPED; goto out_unlock; } diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 31f831471ed2..39ff678e567f 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -109,7 +109,7 @@ static int mdsc_show(struct seq_file *s, void *p) path ? path : ""); spin_unlock(&req->r_old_dentry->d_lock); kfree(path); - } else if (req->r_path2) { + } else if (req->r_path2 && req->r_op != CEPH_MDS_OP_SYMLINK) { if (req->r_ino2.ino) seq_printf(s, " #%llx/%s", req->r_ino2.ino, req->r_path2); diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 3ab1192d2029..6e0fedf6713b 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -70,16 +70,42 @@ out_unlock: } /* - * for readdir, we encode the directory frag and offset within that - * frag into f_pos. 
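+ * A worked example of the encodings documented below (editorial
+ * illustration): in frag+name order, the 3rd entry of frag 0x01000000
+ * sits at fpos ((loff_t)0x01000000 << 28) | 3; a hash-order position
+ * additionally ORs in HASH_ORDER (0xff << 52), and since 0xff can
+ * never occur as a real frag's split-bits byte, the two encodings
+ * cannot collide.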
+ * for f_pos for readdir: + * - hash order: + * (0xff << 52) | ((24 bits hash) << 28) | + * (the nth entry has hash collision); + * - frag+name order; + * ((frag value) << 28) | (the nth entry in frag); */ +#define OFFSET_BITS 28 +#define OFFSET_MASK ((1 << OFFSET_BITS) - 1) +#define HASH_ORDER (0xffull << (OFFSET_BITS + 24)) +loff_t ceph_make_fpos(unsigned high, unsigned off, bool hash_order) +{ + loff_t fpos = ((loff_t)high << 28) | (loff_t)off; + if (hash_order) + fpos |= HASH_ORDER; + return fpos; +} + +static bool is_hash_order(loff_t p) +{ + return (p & HASH_ORDER) == HASH_ORDER; +} + static unsigned fpos_frag(loff_t p) { - return p >> 32; + return p >> OFFSET_BITS; } + +static unsigned fpos_hash(loff_t p) +{ + return ceph_frag_value(fpos_frag(p)); +} + static unsigned fpos_off(loff_t p) { - return p & 0xffffffff; + return p & OFFSET_MASK; } static int fpos_cmp(loff_t l, loff_t r) @@ -111,6 +137,50 @@ static int note_last_dentry(struct ceph_file_info *fi, const char *name, return 0; } + +static struct dentry * +__dcache_find_get_entry(struct dentry *parent, u64 idx, + struct ceph_readdir_cache_control *cache_ctl) +{ + struct inode *dir = d_inode(parent); + struct dentry *dentry; + unsigned idx_mask = (PAGE_SIZE / sizeof(struct dentry *)) - 1; + loff_t ptr_pos = idx * sizeof(struct dentry *); + pgoff_t ptr_pgoff = ptr_pos >> PAGE_SHIFT; + + if (ptr_pos >= i_size_read(dir)) + return NULL; + + if (!cache_ctl->page || ptr_pgoff != page_index(cache_ctl->page)) { + ceph_readdir_cache_release(cache_ctl); + cache_ctl->page = find_lock_page(&dir->i_data, ptr_pgoff); + if (!cache_ctl->page) { + dout(" page %lu not found\n", ptr_pgoff); + return ERR_PTR(-EAGAIN); + } + /* reading/filling the cache are serialized by + i_mutex, no need to use page lock */ + unlock_page(cache_ctl->page); + cache_ctl->dentries = kmap(cache_ctl->page); + } + + cache_ctl->index = idx & idx_mask; + + rcu_read_lock(); + spin_lock(&parent->d_lock); + /* check i_size again here, because empty directory can be + * marked as complete while not holding the i_mutex. */ + if (ceph_dir_is_complete_ordered(dir) && ptr_pos < i_size_read(dir)) + dentry = cache_ctl->dentries[cache_ctl->index]; + else + dentry = NULL; + spin_unlock(&parent->d_lock); + if (dentry && !lockref_get_not_dead(&dentry->d_lockref)) + dentry = NULL; + rcu_read_unlock(); + return dentry ? : ERR_PTR(-EAGAIN); +} + /* * When possible, we try to satisfy a readdir by peeking at the * dcache. 
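 * The readdir cache consulted here is a flat array of dentry pointers
 * kept in the directory's own page cache, so (editorial worked
 * example, assuming 4K pages and 8-byte pointers) entry idx == 513
 * lives in page (513 * 8) >> PAGE_SHIFT == 1 at slot
 * idx & idx_mask == 1; see __dcache_find_get_entry() above.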
We make this work by carefully ordering dentries on @@ -130,75 +200,68 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx, struct inode *dir = d_inode(parent); struct dentry *dentry, *last = NULL; struct ceph_dentry_info *di; - unsigned nsize = PAGE_SIZE / sizeof(struct dentry *); - int err = 0; - loff_t ptr_pos = 0; struct ceph_readdir_cache_control cache_ctl = {}; + u64 idx = 0; + int err = 0; - dout("__dcache_readdir %p v%u at %llu\n", dir, shared_gen, ctx->pos); + dout("__dcache_readdir %p v%u at %llx\n", dir, shared_gen, ctx->pos); + + /* search start position */ + if (ctx->pos > 2) { + u64 count = div_u64(i_size_read(dir), sizeof(struct dentry *)); + while (count > 0) { + u64 step = count >> 1; + dentry = __dcache_find_get_entry(parent, idx + step, + &cache_ctl); + if (!dentry) { + /* use linear search */ + idx = 0; + break; + } + if (IS_ERR(dentry)) { + err = PTR_ERR(dentry); + goto out; + } + di = ceph_dentry(dentry); + spin_lock(&dentry->d_lock); + if (fpos_cmp(di->offset, ctx->pos) < 0) { + idx += step + 1; + count -= step + 1; + } else { + count = step; + } + spin_unlock(&dentry->d_lock); + dput(dentry); + } - /* we can calculate cache index for the first dirfrag */ - if (ceph_frag_is_leftmost(fpos_frag(ctx->pos))) { - cache_ctl.index = fpos_off(ctx->pos) - 2; - BUG_ON(cache_ctl.index < 0); - ptr_pos = cache_ctl.index * sizeof(struct dentry *); + dout("__dcache_readdir %p cache idx %llu\n", dir, idx); } - while (true) { - pgoff_t pgoff; - bool emit_dentry; - if (ptr_pos >= i_size_read(dir)) { + for (;;) { + bool emit_dentry = false; + dentry = __dcache_find_get_entry(parent, idx++, &cache_ctl); + if (!dentry) { fi->flags |= CEPH_F_ATEND; err = 0; break; } - - err = -EAGAIN; - pgoff = ptr_pos >> PAGE_SHIFT; - if (!cache_ctl.page || pgoff != page_index(cache_ctl.page)) { - ceph_readdir_cache_release(&cache_ctl); - cache_ctl.page = find_lock_page(&dir->i_data, pgoff); - if (!cache_ctl.page) { - dout(" page %lu not found\n", pgoff); - break; - } - /* reading/filling the cache are serialized by - * i_mutex, no need to use page lock */ - unlock_page(cache_ctl.page); - cache_ctl.dentries = kmap(cache_ctl.page); + if (IS_ERR(dentry)) { + err = PTR_ERR(dentry); + goto out; } - rcu_read_lock(); - spin_lock(&parent->d_lock); - /* check i_size again here, because empty directory can be - * marked as complete while not holding the i_mutex.
*/ - if (ceph_dir_is_complete_ordered(dir) && - ptr_pos < i_size_read(dir)) - dentry = cache_ctl.dentries[cache_ctl.index % nsize]; - else - dentry = NULL; - spin_unlock(&parent->d_lock); - if (dentry && !lockref_get_not_dead(&dentry->d_lockref)) - dentry = NULL; - rcu_read_unlock(); - if (!dentry) - break; - - emit_dentry = false; di = ceph_dentry(dentry); spin_lock(&dentry->d_lock); if (di->lease_shared_gen == shared_gen && d_really_is_positive(dentry) && - ceph_snap(d_inode(dentry)) != CEPH_SNAPDIR && - ceph_ino(d_inode(dentry)) != CEPH_INO_CEPH && fpos_cmp(ctx->pos, di->offset) <= 0) { emit_dentry = true; } spin_unlock(&dentry->d_lock); if (emit_dentry) { - dout(" %llu (%llu) dentry %p %pd %p\n", di->offset, ctx->pos, + dout(" %llx dentry %p %pd %p\n", di->offset, dentry, dentry, d_inode(dentry)); ctx->pos = di->offset; if (!dir_emit(ctx, dentry->d_name.name, @@ -218,10 +281,8 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx, } else { dput(dentry); } - - cache_ctl.index++; - ptr_pos += sizeof(struct dentry *); } +out: ceph_readdir_cache_release(&cache_ctl); if (last) { int ret; @@ -235,6 +296,16 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx, return err; } +static bool need_send_readdir(struct ceph_file_info *fi, loff_t pos) +{ + if (!fi->last_readdir) + return true; + if (is_hash_order(pos)) + return !ceph_frag_contains_value(fi->frag, fpos_hash(pos)); + else + return fi->frag != fpos_frag(pos); +} + static int ceph_readdir(struct file *file, struct dir_context *ctx) { struct ceph_file_info *fi = file->private_data; @@ -242,13 +313,12 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_mds_client *mdsc = fsc->mdsc; - unsigned frag = fpos_frag(ctx->pos); - int off = fpos_off(ctx->pos); + int i; int err; u32 ftype; struct ceph_mds_reply_info_parsed *rinfo; - dout("readdir %p file %p frag %u off %u\n", inode, file, frag, off); + dout("readdir %p file %p pos %llx\n", inode, file, ctx->pos); if (fi->flags & CEPH_F_ATEND) return 0; @@ -260,7 +330,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) inode->i_mode >> 12)) return 0; ctx->pos = 1; - off = 1; } if (ctx->pos == 1) { ino_t ino = parent_ino(file->f_path.dentry); @@ -270,7 +339,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) inode->i_mode >> 12)) return 0; ctx->pos = 2; - off = 2; } /* can we use the dcache? */ @@ -285,8 +353,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) err = __dcache_readdir(file, ctx, shared_gen); if (err != -EAGAIN) return err; - frag = fpos_frag(ctx->pos); - off = fpos_off(ctx->pos); } else { spin_unlock(&ci->i_ceph_lock); } @@ -294,8 +360,9 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) /* proceed with a normal readdir */ more: /* do we have the correct frag content buffered? */ - if (fi->frag != frag || fi->last_readdir == NULL) { + if (need_send_readdir(fi, ctx->pos)) { struct ceph_mds_request *req; + unsigned frag; int op = ceph_snap(inode) == CEPH_SNAPDIR ? 
CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR; @@ -305,6 +372,13 @@ more: fi->last_readdir = NULL; } + if (is_hash_order(ctx->pos)) { + frag = ceph_choose_frag(ci, fpos_hash(ctx->pos), + NULL, NULL); + } else { + frag = fpos_frag(ctx->pos); + } + dout("readdir fetching %llx.%llx frag %x offset '%s'\n", ceph_vinop(inode), frag, fi->last_name); req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); @@ -331,6 +405,8 @@ more: req->r_readdir_cache_idx = fi->readdir_cache_idx; req->r_readdir_offset = fi->next_offset; req->r_args.readdir.frag = cpu_to_le32(frag); + req->r_args.readdir.flags = + cpu_to_le16(CEPH_READDIR_REPLY_BITFLAGS); req->r_inode = inode; ihold(inode); @@ -340,22 +416,26 @@ more: ceph_mdsc_put_request(req); return err; } - dout("readdir got and parsed readdir result=%d" - " on frag %x, end=%d, complete=%d\n", err, frag, + dout("readdir got and parsed readdir result=%d on " + "frag %x, end=%d, complete=%d, hash_order=%d\n", + err, frag, (int)req->r_reply_info.dir_end, - (int)req->r_reply_info.dir_complete); - + (int)req->r_reply_info.dir_complete, + (int)req->r_reply_info.hash_order); - /* note next offset and last dentry name */ rinfo = &req->r_reply_info; if (le32_to_cpu(rinfo->dir_dir->frag) != frag) { frag = le32_to_cpu(rinfo->dir_dir->frag); - off = req->r_readdir_offset; - fi->next_offset = off; + if (!rinfo->hash_order) { + fi->next_offset = req->r_readdir_offset; + /* adjust ctx->pos to beginning of frag */ + ctx->pos = ceph_make_fpos(frag, + fi->next_offset, + false); + } } fi->frag = frag; - fi->offset = fi->next_offset; fi->last_readdir = req; if (req->r_did_prepopulate) { @@ -363,7 +443,8 @@ more: if (fi->readdir_cache_idx < 0) { /* preclude from marking dir ordered */ fi->dir_ordered_count = 0; - } else if (ceph_frag_is_leftmost(frag) && off == 2) { + } else if (ceph_frag_is_leftmost(frag) && + fi->next_offset == 2) { /* note dir version at start of readdir so * we can tell if any dentries get dropped */ fi->dir_release_count = req->r_dir_release_cnt; @@ -377,65 +458,87 @@ more: fi->dir_release_count = 0; } - if (req->r_reply_info.dir_end) { - kfree(fi->last_name); - fi->last_name = NULL; - if (ceph_frag_is_rightmost(frag)) - fi->next_offset = 2; - else - fi->next_offset = 0; - } else { - err = note_last_dentry(fi, - rinfo->dir_dname[rinfo->dir_nr-1], - rinfo->dir_dname_len[rinfo->dir_nr-1], - fi->next_offset + rinfo->dir_nr); + /* note next offset and last dentry name */ + if (rinfo->dir_nr > 0) { + struct ceph_mds_reply_dir_entry *rde = + rinfo->dir_entries + (rinfo->dir_nr-1); + unsigned next_offset = req->r_reply_info.dir_end ? + 2 : (fpos_off(rde->offset) + 1); + err = note_last_dentry(fi, rde->name, rde->name_len, + next_offset); if (err) return err; + } else if (req->r_reply_info.dir_end) { + fi->next_offset = 2; + /* keep last name */ } } rinfo = &fi->last_readdir->r_reply_info; - dout("readdir frag %x num %d off %d chunkoff %d\n", frag, - rinfo->dir_nr, off, fi->offset); - - ctx->pos = ceph_make_fpos(frag, off); - while (off >= fi->offset && off - fi->offset < rinfo->dir_nr) { - struct ceph_mds_reply_inode *in = - rinfo->dir_in[off - fi->offset].in; + dout("readdir frag %x num %d pos %llx chunk first %llx\n", + fi->frag, rinfo->dir_nr, ctx->pos, + rinfo->dir_nr ? 
rinfo->dir_entries[0].offset : 0LL); + + i = 0; + /* search start position */ + if (rinfo->dir_nr > 0) { + int step, nr = rinfo->dir_nr; + while (nr > 0) { + step = nr >> 1; + if (rinfo->dir_entries[i + step].offset < ctx->pos) { + i += step + 1; + nr -= step + 1; + } else { + nr = step; + } + } + } + for (; i < rinfo->dir_nr; i++) { + struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i; struct ceph_vino vino; ino_t ino; - dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n", - off, off - fi->offset, rinfo->dir_nr, ctx->pos, - rinfo->dir_dname_len[off - fi->offset], - rinfo->dir_dname[off - fi->offset], in); - BUG_ON(!in); - ftype = le32_to_cpu(in->mode) >> 12; - vino.ino = le64_to_cpu(in->ino); - vino.snap = le64_to_cpu(in->snapid); + BUG_ON(rde->offset < ctx->pos); + + ctx->pos = rde->offset; + dout("readdir (%d/%d) -> %llx '%.*s' %p\n", + i, rinfo->dir_nr, ctx->pos, + rde->name_len, rde->name, &rde->inode.in); + + BUG_ON(!rde->inode.in); + ftype = le32_to_cpu(rde->inode.in->mode) >> 12; + vino.ino = le64_to_cpu(rde->inode.in->ino); + vino.snap = le64_to_cpu(rde->inode.in->snapid); ino = ceph_vino_to_ino(vino); - if (!dir_emit(ctx, - rinfo->dir_dname[off - fi->offset], - rinfo->dir_dname_len[off - fi->offset], - ceph_translate_ino(inode->i_sb, ino), ftype)) { + + if (!dir_emit(ctx, rde->name, rde->name_len, + ceph_translate_ino(inode->i_sb, ino), ftype)) { dout("filldir stopping us...\n"); return 0; } - off++; ctx->pos++; } - if (fi->last_name) { + if (fi->next_offset > 2) { ceph_mdsc_put_request(fi->last_readdir); fi->last_readdir = NULL; goto more; } /* more frags? */ - if (!ceph_frag_is_rightmost(frag)) { - frag = ceph_frag_next(frag); - off = 0; - ctx->pos = ceph_make_fpos(frag, off); + if (!ceph_frag_is_rightmost(fi->frag)) { + unsigned frag = ceph_frag_next(fi->frag); + if (is_hash_order(ctx->pos)) { + loff_t new_pos = ceph_make_fpos(ceph_frag_value(frag), + fi->next_offset, true); + if (new_pos > ctx->pos) + ctx->pos = new_pos; + /* keep last_name */ + } else { + ctx->pos = ceph_make_fpos(frag, fi->next_offset, false); + kfree(fi->last_name); + fi->last_name = NULL; + } dout("readdir next frag is %x\n", frag); goto more; } @@ -467,7 +570,7 @@ more: return 0; } -static void reset_readdir(struct ceph_file_info *fi, unsigned frag) +static void reset_readdir(struct ceph_file_info *fi) { if (fi->last_readdir) { ceph_mdsc_put_request(fi->last_readdir); @@ -477,18 +580,38 @@ static void reset_readdir(struct ceph_file_info *fi, unsigned frag) fi->last_name = NULL; fi->dir_release_count = 0; fi->readdir_cache_idx = -1; - if (ceph_frag_is_leftmost(frag)) - fi->next_offset = 2; /* compensate for . and .. */ - else - fi->next_offset = 0; + fi->next_offset = 2; /* compensate for . and .. */ fi->flags &= ~CEPH_F_ATEND; } +/* + * discard buffered readdir content on seekdir(0), or seek to new frag, + * or seek prior to current chunk + */ +static bool need_reset_readdir(struct ceph_file_info *fi, loff_t new_pos) +{ + struct ceph_mds_reply_info_parsed *rinfo; + loff_t chunk_offset; + if (new_pos == 0) + return true; + if (is_hash_order(new_pos)) { + /* no need to reset last_name for a forward seek when + * dentries are sorted in hash order */ + } else if (fi->frag != fpos_frag(new_pos)) { + return true; + } + rinfo = fi->last_readdir ?
&fi->last_readdir->r_reply_info : NULL; + if (!rinfo || !rinfo->dir_nr) + return true; + chunk_offset = rinfo->dir_entries[0].offset; + return new_pos < chunk_offset || + is_hash_order(new_pos) != is_hash_order(chunk_offset); +} + static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) { struct ceph_file_info *fi = file->private_data; struct inode *inode = file->f_mapping->host; - loff_t old_offset = ceph_make_fpos(fi->frag, fi->next_offset); loff_t retval; inode_lock(inode); @@ -505,25 +628,22 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) } if (offset >= 0) { + if (need_reset_readdir(fi, offset)) { + dout("dir_llseek dropping %p content\n", file); + reset_readdir(fi); + } else if (is_hash_order(offset) && offset > file->f_pos) { + /* for hash offset, we don't know if a forward seek + * is within same frag */ + fi->dir_release_count = 0; + fi->readdir_cache_idx = -1; + } + if (offset != file->f_pos) { file->f_pos = offset; file->f_version = 0; fi->flags &= ~CEPH_F_ATEND; } retval = offset; - - if (offset == 0 || - fpos_frag(offset) != fi->frag || - fpos_off(offset) < fi->offset) { - /* discard buffered readdir content on seekdir(0), or - * seek to new frag, or seek prior to current chunk */ - dout("dir_llseek dropping %p content\n", file); - reset_readdir(fi, fpos_frag(offset)); - } else if (fpos_cmp(offset, old_offset) > 0) { - /* reset dir_release_count if we did a forward seek */ - fi->dir_release_count = 0; - fi->readdir_cache_idx = -1; - } } out: inode_unlock(inode); @@ -591,7 +711,7 @@ struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, return dentry; } -static int is_root_ceph_dentry(struct inode *inode, struct dentry *dentry) +static bool is_root_ceph_dentry(struct inode *inode, struct dentry *dentry) { return ceph_ino(inode) == CEPH_INO_ROOT && strncmp(dentry->d_name.name, ".ceph", 5) == 0; diff --git a/fs/ceph/export.c b/fs/ceph/export.c index 6e72c98162d5..1780218a48f0 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -95,10 +95,8 @@ static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino) } dentry = d_obtain_alias(inode); - if (IS_ERR(dentry)) { - iput(inode); + if (IS_ERR(dentry)) return dentry; - } err = ceph_init_dentry(dentry); if (err < 0) { dput(dentry); @@ -167,10 +165,8 @@ static struct dentry *__get_parent(struct super_block *sb, return ERR_PTR(-ENOENT); dentry = d_obtain_alias(inode); - if (IS_ERR(dentry)) { - iput(inode); + if (IS_ERR(dentry)) return dentry; - } err = ceph_init_dentry(dentry); if (err < 0) { dput(dentry); @@ -210,7 +206,7 @@ static struct dentry *ceph_fh_to_parent(struct super_block *sb, dout("fh_to_parent %llx\n", cfh->parent_ino); dentry = __get_parent(sb, NULL, cfh->ino); - if (IS_ERR(dentry) && PTR_ERR(dentry) == -ENOENT) + if (unlikely(dentry == ERR_PTR(-ENOENT))) dentry = __fh_to_dentry(sb, cfh->parent_ino); return dentry; } diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 4f1dc7120916..0daaf7ceedc5 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -137,23 +137,11 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode) { struct ceph_file_info *cf; int ret = 0; - struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb); - struct ceph_mds_client *mdsc = fsc->mdsc; switch (inode->i_mode & S_IFMT) { case S_IFREG: - /* First file open request creates the cookie, we want to keep - * this cookie around for the filetime of the inode as not to - * have to worry about fscache register / 
revoke / operation - * races. - * - * Also, if we know the operation is going to invalidate data - * (non readonly) just nuke the cache right away. - */ - ceph_fscache_register_inode_cookie(mdsc->fsc, ci); - if ((fmode & CEPH_FILE_MODE_WR)) - ceph_fscache_invalidate(inode); + ceph_fscache_register_inode_cookie(inode); + ceph_fscache_file_set_cookie(inode, file); case S_IFDIR: dout("init_file %p %p 0%o (regular)\n", inode, file, inode->i_mode); @@ -192,6 +180,59 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode) } /* + * try renew caps after session gets killed. + */ +int ceph_renew_caps(struct inode *inode) +{ + struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_mds_request *req; + int err, flags, wanted; + + spin_lock(&ci->i_ceph_lock); + wanted = __ceph_caps_file_wanted(ci); + if (__ceph_is_any_real_caps(ci) && + (!(wanted & CEPH_CAP_ANY_WR) || ci->i_auth_cap)) { + int issued = __ceph_caps_issued(ci, NULL); + spin_unlock(&ci->i_ceph_lock); + dout("renew caps %p want %s issued %s updating mds_wanted\n", + inode, ceph_cap_string(wanted), ceph_cap_string(issued)); + ceph_check_caps(ci, 0, NULL); + return 0; + } + spin_unlock(&ci->i_ceph_lock); + + flags = 0; + if ((wanted & CEPH_CAP_FILE_RD) && (wanted & CEPH_CAP_FILE_WR)) + flags = O_RDWR; + else if (wanted & CEPH_CAP_FILE_RD) + flags = O_RDONLY; + else if (wanted & CEPH_CAP_FILE_WR) + flags = O_WRONLY; +#ifdef O_LAZY + if (wanted & CEPH_CAP_FILE_LAZYIO) + flags |= O_LAZY; +#endif + + req = prepare_open_request(inode->i_sb, flags, 0); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto out; + } + + req->r_inode = inode; + ihold(inode); + req->r_num_caps = 1; + req->r_fmode = -1; + + err = ceph_mdsc_do_request(mdsc, NULL, req); + ceph_mdsc_put_request(req); +out: + dout("renew caps %p open result=%d\n", inode, err); + return err < 0 ? err : 0; +} + +/* * If we already have the requisite capabilities, we can satisfy * the open request locally (no need to request new caps from the * MDS).
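The ceph_renew_caps() path above rebuilds open flags from the wanted-caps bitmask before issuing a fresh open request to the MDS. A minimal userspace sketch of that mapping, assuming illustrative CAP_FILE_* bit values rather than the kernel's real cap definitions:

#include <fcntl.h>
#include <stdio.h>

#define CAP_FILE_RD 0x1 /* illustrative; the kernel's cap bits differ */
#define CAP_FILE_WR 0x2

static int wanted_caps_to_open_flags(int wanted)
{
	if ((wanted & CAP_FILE_RD) && (wanted & CAP_FILE_WR))
		return O_RDWR;
	if (wanted & CAP_FILE_RD)
		return O_RDONLY;
	if (wanted & CAP_FILE_WR)
		return O_WRONLY;
	return 0; /* nothing wanted: no reopen needed */
}

int main(void)
{
	/* a client that was both reading and writing reopens with O_RDWR */
	printf("%d\n", wanted_caps_to_open_flags(CAP_FILE_RD | CAP_FILE_WR) == O_RDWR);
	return 0;
}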
We do, however, need to inform the MDS (asynchronously) @@ -353,7 +394,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, if ((flags & O_CREAT) && !req->r_reply_info.head->is_dentry) err = ceph_handle_notrace_create(dir, dentry); - if (d_unhashed(dentry)) { + if (d_in_lookup(dentry)) { dn = ceph_finish_lookup(req, dentry, err); if (IS_ERR(dn)) err = PTR_ERR(dn); @@ -616,8 +657,7 @@ static void ceph_aio_complete(struct inode *inode, kfree(aio_req); } -static void ceph_aio_complete_req(struct ceph_osd_request *req, - struct ceph_msg *msg) +static void ceph_aio_complete_req(struct ceph_osd_request *req) { int rc = req->r_result; struct inode *inode = req->r_inode; @@ -714,14 +754,21 @@ static void ceph_aio_retry_work(struct work_struct *work) req->r_flags = CEPH_OSD_FLAG_ORDERSNAP | CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE; - req->r_base_oloc = orig_req->r_base_oloc; - req->r_base_oid = orig_req->r_base_oid; + ceph_oloc_copy(&req->r_base_oloc, &orig_req->r_base_oloc); + ceph_oid_copy(&req->r_base_oid, &orig_req->r_base_oid); + + ret = ceph_osdc_alloc_messages(req, GFP_NOFS); + if (ret) { + ceph_osdc_put_request(req); + req = orig_req; + goto out; + } req->r_ops[0] = orig_req->r_ops[0]; osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0); - ceph_osdc_build_request(req, req->r_ops[0].extent.offset, - snapc, CEPH_NOSNAP, &aio_req->mtime); + req->r_mtime = aio_req->mtime; + req->r_data_offset = req->r_ops[0].extent.offset; ceph_osdc_put_request(orig_req); @@ -733,7 +780,7 @@ static void ceph_aio_retry_work(struct work_struct *work) out: if (ret < 0) { req->r_result = ret; - ceph_aio_complete_req(req, NULL); + ceph_aio_complete_req(req); } ceph_put_snap_context(snapc); @@ -764,6 +811,8 @@ static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe) list_add_tail(&req->r_unsafe_item, &ci->i_unsafe_writes); spin_unlock(&ci->i_unsafe_lock); + + complete_all(&req->r_completion); } else { spin_lock(&ci->i_unsafe_lock); list_del_init(&req->r_unsafe_item); @@ -875,14 +924,12 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, (pos+len) | (PAGE_SIZE - 1)); osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0); + req->r_mtime = mtime; } - osd_req_op_extent_osd_data_pages(req, 0, pages, len, start, false, false); - ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime); - if (aio_req) { aio_req->total_len += len; aio_req->num_reqs++; @@ -956,7 +1003,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, req, false); if (ret < 0) { req->r_result = ret; - ceph_aio_complete_req(req, NULL); + ceph_aio_complete_req(req); } } return -EIOCBQUEUED; @@ -1067,9 +1114,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false, true); - /* BUG_ON(vino.snap != CEPH_NOSNAP); */ - ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime); - + req->r_mtime = mtime; ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); if (!ret) ret = ceph_osdc_wait_request(&fsc->client->osdc, req); @@ -1292,7 +1337,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) } retry_snap: - if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL)) { + if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL)) { err = -ENOSPC; goto out; } @@ -1350,7 +1395,6 @@ retry_snap: iov_iter_advance(from, written); ceph_put_snap_context(snapc); } else { - loff_t old_size = i_size_read(inode); /* * No need to acquire the i_truncate_mutex. 
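Several write paths above now consult osdmap flags through the new ceph_osdmap_flag(osdc, ...) form. The policy those checks implement is: refuse new writes with ENOSPC while the cluster reports FULL, and force synchronous writeback (IOCB_DSYNC) while it reports NEARFULL. A standalone sketch of that policy, with made-up flag values standing in for CEPH_OSDMAP_FULL/NEARFULL:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

#define MAP_FULL     0x1 /* illustrative values only */
#define MAP_NEARFULL 0x2

/* returns 0 or -ENOSPC; *force_sync mirrors setting IOCB_DSYNC */
static int gate_write_on_map_flags(unsigned map_flags, bool *force_sync)
{
	if (map_flags & MAP_FULL)
		return -ENOSPC;
	*force_sync = (map_flags & MAP_NEARFULL) != 0;
	return 0;
}

int main(void)
{
	bool sync_required = false;
	printf("full -> %d\n", gate_write_on_map_flags(MAP_FULL, &sync_required));
	printf("nearfull -> %d, sync=%d\n",
	       gate_write_on_map_flags(MAP_NEARFULL, &sync_required),
	       (int)sync_required);
	return 0;
}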
Because * the MDS revokes Fwb caps before sending truncate @@ -1361,8 +1405,6 @@ retry_snap: written = generic_perform_write(file, from, pos); if (likely(written >= 0)) iocb->ki_pos = pos + written; - if (i_size_read(inode) > old_size) - ceph_fscache_update_objectsize(inode); inode_unlock(inode); } @@ -1383,7 +1425,7 @@ retry_snap: ceph_put_cap_refs(ci, got); if (written >= 0) { - if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL)) + if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_NEARFULL)) iocb->ki_flags |= IOCB_DSYNC; written = generic_write_sync(iocb, written); @@ -1524,9 +1566,7 @@ static int ceph_zero_partial_object(struct inode *inode, goto out; } - ceph_osdc_build_request(req, offset, NULL, ceph_vino(inode).snap, - &inode->i_mtime); - + req->r_mtime = inode->i_mtime; ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); if (!ret) { ret = ceph_osdc_wait_request(&fsc->client->osdc, req); @@ -1617,8 +1657,8 @@ static long ceph_fallocate(struct file *file, int mode, goto unlock; } - if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) && - !(mode & FALLOC_FL_PUNCH_HOLE)) { + if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) && + !(mode & FALLOC_FL_PUNCH_HOLE)) { ret = -ENOSPC; goto unlock; } diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index e669cfa9d793..99bdef66213a 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -11,6 +11,7 @@ #include <linux/xattr.h> #include <linux/posix_acl.h> #include <linux/random.h> +#include <linux/sort.h> #include "super.h" #include "mds_client.h" @@ -254,6 +255,9 @@ static int ceph_fill_dirfrag(struct inode *inode, diri_auth = ci->i_auth_cap->mds; spin_unlock(&ci->i_ceph_lock); + if (mds == -1) /* CDIR_AUTH_PARENT */ + mds = diri_auth; + mutex_lock(&ci->i_fragtree_mutex); if (ndist == 0 && mds == diri_auth) { /* no delegation info needed. 
*/ @@ -300,20 +304,38 @@ out: return err; } +static int frag_tree_split_cmp(const void *l, const void *r) +{ + struct ceph_frag_tree_split *ls = (struct ceph_frag_tree_split*)l; + struct ceph_frag_tree_split *rs = (struct ceph_frag_tree_split*)r; + return ceph_frag_compare(ls->frag, rs->frag); +} + +static bool is_frag_child(u32 f, struct ceph_inode_frag *frag) +{ + if (!frag) + return f == ceph_frag_make(0, 0); + if (ceph_frag_bits(f) != ceph_frag_bits(frag->frag) + frag->split_by) + return false; + return ceph_frag_contains_value(frag->frag, ceph_frag_value(f)); +} + static int ceph_fill_fragtree(struct inode *inode, struct ceph_frag_tree_head *fragtree, struct ceph_mds_reply_dirfrag *dirinfo) { struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_inode_frag *frag; + struct ceph_inode_frag *frag, *prev_frag = NULL; struct rb_node *rb_node; - int i; - u32 id, nsplits; + unsigned i, split_by, nsplits; + u32 id; bool update = false; mutex_lock(&ci->i_fragtree_mutex); nsplits = le32_to_cpu(fragtree->nsplits); - if (nsplits) { + if (nsplits != ci->i_fragtree_nsplits) { + update = true; + } else if (nsplits) { i = prandom_u32() % nsplits; id = le32_to_cpu(fragtree->splits[i].frag); if (!__ceph_find_frag(ci, id)) @@ -332,10 +354,22 @@ static int ceph_fill_fragtree(struct inode *inode, if (!update) goto out_unlock; + if (nsplits > 1) { + sort(fragtree->splits, nsplits, sizeof(fragtree->splits[0]), + frag_tree_split_cmp, NULL); + } + dout("fill_fragtree %llx.%llx\n", ceph_vinop(inode)); rb_node = rb_first(&ci->i_fragtree); for (i = 0; i < nsplits; i++) { id = le32_to_cpu(fragtree->splits[i].frag); + split_by = le32_to_cpu(fragtree->splits[i].by); + if (split_by == 0 || ceph_frag_bits(id) + split_by > 24) { + pr_err("fill_fragtree %llx.%llx invalid split %d/%u, " + "frag %x split by %d\n", ceph_vinop(inode), + i, nsplits, id, split_by); + continue; + } frag = NULL; while (rb_node) { frag = rb_entry(rb_node, struct ceph_inode_frag, node); @@ -347,8 +381,14 @@ static int ceph_fill_fragtree(struct inode *inode, break; } rb_node = rb_next(rb_node); - rb_erase(&frag->node, &ci->i_fragtree); - kfree(frag); + /* delete stale split/leaf node */ + if (frag->split_by > 0 || + !is_frag_child(frag->frag, prev_frag)) { + rb_erase(&frag->node, &ci->i_fragtree); + if (frag->split_by > 0) + ci->i_fragtree_nsplits--; + kfree(frag); + } frag = NULL; } if (!frag) { @@ -356,14 +396,23 @@ static int ceph_fill_fragtree(struct inode *inode, if (IS_ERR(frag)) continue; } - frag->split_by = le32_to_cpu(fragtree->splits[i].by); + if (frag->split_by == 0) + ci->i_fragtree_nsplits++; + frag->split_by = split_by; dout(" frag %x split by %d\n", frag->frag, frag->split_by); + prev_frag = frag; } while (rb_node) { frag = rb_entry(rb_node, struct ceph_inode_frag, node); rb_node = rb_next(rb_node); - rb_erase(&frag->node, &ci->i_fragtree); - kfree(frag); + /* delete stale split/leaf node */ + if (frag->split_by > 0 || + !is_frag_child(frag->frag, prev_frag)) { + rb_erase(&frag->node, &ci->i_fragtree); + if (frag->split_by > 0) + ci->i_fragtree_nsplits--; + kfree(frag); + } } out_unlock: mutex_unlock(&ci->i_fragtree_mutex); @@ -513,6 +562,7 @@ void ceph_destroy_inode(struct inode *inode) rb_erase(n, &ci->i_fragtree); kfree(frag); } + ci->i_fragtree_nsplits = 0; __ceph_destroy_xattrs(ci); if (ci->i_xattrs.blob) @@ -533,6 +583,11 @@ int ceph_drop_inode(struct inode *inode) return 1; } +static inline blkcnt_t calc_inode_blocks(u64 size) +{ + return (size + (1<<9) - 1) >> 9; +} + /* * Helpers to fill in size, ctime, mtime, and 
atime. We have to be * careful because either the client or MDS may have more up to date @@ -555,7 +610,7 @@ int ceph_fill_file_size(struct inode *inode, int issued, size = 0; } i_size_write(inode, size); - inode->i_blocks = (size + (1<<9) - 1) >> 9; + inode->i_blocks = calc_inode_blocks(size); ci->i_reported_size = size; if (truncate_seq != ci->i_truncate_seq) { dout("truncate_seq %u -> %u\n", @@ -814,9 +869,13 @@ static int fill_inode(struct inode *inode, struct page *locked_page, spin_unlock(&ci->i_ceph_lock); - err = -EINVAL; - if (WARN_ON(symlen != i_size_read(inode))) - goto out; + if (symlen != i_size_read(inode)) { + pr_err("fill_inode %llx.%llx BAD symlink " + "size %lld\n", ceph_vinop(inode), + i_size_read(inode)); + i_size_write(inode, symlen); + inode->i_blocks = calc_inode_blocks(symlen); + } err = -ENOMEM; sym = kstrndup(iinfo->symlink, symlen, GFP_NOFS); @@ -1105,7 +1164,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, dname.name = rinfo->dname; dname.len = rinfo->dname_len; - dname.hash = full_name_hash(dname.name, dname.len); + dname.hash = full_name_hash(parent, dname.name, dname.len); vino.ino = le64_to_cpu(rinfo->targeti.in->ino); vino.snap = le64_to_cpu(rinfo->targeti.in->snapid); retry_lookup: @@ -1309,12 +1368,13 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req, int i, err = 0; for (i = 0; i < rinfo->dir_nr; i++) { + struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i; struct ceph_vino vino; struct inode *in; int rc; - vino.ino = le64_to_cpu(rinfo->dir_in[i].in->ino); - vino.snap = le64_to_cpu(rinfo->dir_in[i].in->snapid); + vino.ino = le64_to_cpu(rde->inode.in->ino); + vino.snap = le64_to_cpu(rde->inode.in->snapid); in = ceph_get_inode(req->r_dentry->d_sb, vino); if (IS_ERR(in)) { @@ -1322,14 +1382,14 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req, dout("new_inode badness got %d\n", err); continue; } - rc = fill_inode(in, NULL, &rinfo->dir_in[i], NULL, session, + rc = fill_inode(in, NULL, &rde->inode, NULL, session, req->r_request_started, -1, &req->r_caps_reservation); if (rc < 0) { pr_err("fill_inode badness on %p got %d\n", in, rc); err = rc; - continue; } + iput(in); } return err; @@ -1387,6 +1447,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, struct ceph_mds_session *session) { struct dentry *parent = req->r_dentry; + struct ceph_inode_info *ci = ceph_inode(d_inode(parent)); struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; struct qstr dname; struct dentry *dn; @@ -1394,22 +1455,27 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, int err = 0, skipped = 0, ret, i; struct inode *snapdir = NULL; struct ceph_mds_request_head *rhead = req->r_request->front.iov_base; - struct ceph_dentry_info *di; u32 frag = le32_to_cpu(rhead->args.readdir.frag); + u32 last_hash = 0; + u32 fpos_offset; struct ceph_readdir_cache_control cache_ctl = {}; if (req->r_aborted) return readdir_prepopulate_inodes_only(req, session); + if (rinfo->hash_order && req->r_path2) { + last_hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash, + req->r_path2, strlen(req->r_path2)); + last_hash = ceph_frag_value(last_hash); + } + if (rinfo->dir_dir && le32_to_cpu(rinfo->dir_dir->frag) != frag) { dout("readdir_prepopulate got new frag %x -> %x\n", frag, le32_to_cpu(rinfo->dir_dir->frag)); frag = le32_to_cpu(rinfo->dir_dir->frag); - if (ceph_frag_is_leftmost(frag)) + if (!rinfo->hash_order) req->r_readdir_offset = 2; - else - req->r_readdir_offset = 0; } if 
(le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) { @@ -1427,24 +1493,37 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, if (ceph_frag_is_leftmost(frag) && req->r_readdir_offset == 2) { /* note dir version at start of readdir so we can tell * if any dentries get dropped */ - struct ceph_inode_info *ci = ceph_inode(d_inode(parent)); req->r_dir_release_cnt = atomic64_read(&ci->i_release_count); req->r_dir_ordered_cnt = atomic64_read(&ci->i_ordered_count); req->r_readdir_cache_idx = 0; } cache_ctl.index = req->r_readdir_cache_idx; + fpos_offset = req->r_readdir_offset; /* FIXME: release caps/leases if error occurs */ for (i = 0; i < rinfo->dir_nr; i++) { + struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i; struct ceph_vino vino; - dname.name = rinfo->dir_dname[i]; - dname.len = rinfo->dir_dname_len[i]; - dname.hash = full_name_hash(dname.name, dname.len); - - vino.ino = le64_to_cpu(rinfo->dir_in[i].in->ino); - vino.snap = le64_to_cpu(rinfo->dir_in[i].in->snapid); + dname.name = rde->name; + dname.len = rde->name_len; + dname.hash = full_name_hash(parent, dname.name, dname.len); + + vino.ino = le64_to_cpu(rde->inode.in->ino); + vino.snap = le64_to_cpu(rde->inode.in->snapid); + + if (rinfo->hash_order) { + u32 hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash, + rde->name, rde->name_len); + hash = ceph_frag_value(hash); + if (hash != last_hash) + fpos_offset = 2; + last_hash = hash; + rde->offset = ceph_make_fpos(hash, fpos_offset++, true); + } else { + rde->offset = ceph_make_fpos(frag, fpos_offset++, false); + } retry_lookup: dn = d_lookup(parent, &dname); @@ -1490,7 +1569,7 @@ retry_lookup: } } - ret = fill_inode(in, NULL, &rinfo->dir_in[i], NULL, session, + ret = fill_inode(in, NULL, &rde->inode, NULL, session, req->r_request_started, -1, &req->r_caps_reservation); if (ret < 0) { @@ -1523,11 +1602,9 @@ retry_lookup: dn = realdn; } - di = dn->d_fsdata; - di->offset = ceph_make_fpos(frag, i + req->r_readdir_offset); + ceph_dentry(dn)->offset = rde->offset; - update_dentry_lease(dn, rinfo->dir_dlease[i], - req->r_session, + update_dentry_lease(dn, rde->lease, req->r_session, req->r_request_started); if (err == 0 && skipped == 0 && cache_ctl.index >= 0) { @@ -1562,7 +1639,7 @@ int ceph_inode_set_size(struct inode *inode, loff_t size) spin_lock(&ci->i_ceph_lock); dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size); i_size_write(inode, size); - inode->i_blocks = (size + (1 << 9) - 1) >> 9; + inode->i_blocks = calc_inode_blocks(size); /* tell the MDS if we are approaching max_size */ if ((size << 1) >= ci->i_max_size && @@ -1624,10 +1701,21 @@ static void ceph_invalidate_work(struct work_struct *work) struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info, i_pg_inv_work); struct inode *inode = &ci->vfs_inode; + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); u32 orig_gen; int check = 0; mutex_lock(&ci->i_truncate_mutex); + + if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { + pr_warn_ratelimited("invalidate_pages %p %lld forced umount\n", + inode, ceph_ino(inode)); + mapping_set_error(inode->i_mapping, -EIO); + truncate_pagecache(inode, 0); + mutex_unlock(&ci->i_truncate_mutex); + goto out; + } + spin_lock(&ci->i_ceph_lock); dout("invalidate_pages %p gen %d revoking %d\n", inode, ci->i_rdcache_gen, ci->i_rdcache_revoking); @@ -1641,7 +1729,9 @@ static void ceph_invalidate_work(struct work_struct *work) orig_gen = ci->i_rdcache_gen; spin_unlock(&ci->i_ceph_lock); - truncate_pagecache(inode, 0); + if 
(invalidate_inode_pages2(inode->i_mapping) < 0) { + pr_err("invalidate_pages %p fails\n", inode); + } spin_lock(&ci->i_ceph_lock); if (orig_gen == ci->i_rdcache_gen && @@ -1920,8 +2010,7 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) if ((issued & CEPH_CAP_FILE_EXCL) && attr->ia_size > inode->i_size) { i_size_write(inode, attr->ia_size); - inode->i_blocks = - (attr->ia_size + (1 << 9) - 1) >> 9; + inode->i_blocks = calc_inode_blocks(attr->ia_size); inode->i_ctime = attr->ia_ctime; ci->i_reported_size = attr->ia_size; dirtied |= CEPH_CAP_FILE_EXCL; diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index f851d8d70158..be6b1657b1af 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -193,12 +193,12 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) if (copy_from_user(&dl, arg, sizeof(dl))) return -EFAULT; - down_read(&osdc->map_sem); + down_read(&osdc->lock); r = ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, len, &dl.object_no, &dl.object_offset, &olen); if (r < 0) { - up_read(&osdc->map_sem); + up_read(&osdc->lock); return -EIO; } dl.file_offset -= dl.object_offset; @@ -213,15 +213,15 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) ceph_ino(inode), dl.object_no); oloc.pool = ceph_file_layout_pg_pool(ci->i_layout); - ceph_oid_set_name(&oid, dl.object_name); + ceph_oid_printf(&oid, "%s", dl.object_name); - r = ceph_oloc_oid_to_pg(osdc->osdmap, &oloc, &oid, &pgid); + r = ceph_object_locator_to_pg(osdc->osdmap, &oid, &oloc, &pgid); if (r < 0) { - up_read(&osdc->map_sem); + up_read(&osdc->lock); return r; } - dl.osd = ceph_calc_pg_primary(osdc->osdmap, pgid); + dl.osd = ceph_pg_to_acting_primary(osdc->osdmap, &pgid); if (dl.osd >= 0) { struct ceph_entity_addr *a = ceph_osd_addr(osdc->osdmap, dl.osd); @@ -230,7 +230,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) } else { memset(&dl.osd_addr, 0, sizeof(dl.osd_addr)); } - up_read(&osdc->map_sem); + up_read(&osdc->lock); /* send result back to user */ if (copy_to_user(arg, &dl, sizeof(dl))) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 85b8517f17a0..4e8678a612b6 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -181,17 +181,18 @@ static int parse_reply_info_dir(void **p, void *end, ceph_decode_need(p, end, sizeof(num) + 2, bad); num = ceph_decode_32(p); - info->dir_end = ceph_decode_8(p); - info->dir_complete = ceph_decode_8(p); + { + u16 flags = ceph_decode_16(p); + info->dir_end = !!(flags & CEPH_READDIR_FRAG_END); + info->dir_complete = !!(flags & CEPH_READDIR_FRAG_COMPLETE); + info->hash_order = !!(flags & CEPH_READDIR_HASH_ORDER); + } if (num == 0) goto done; - BUG_ON(!info->dir_in); - info->dir_dname = (void *)(info->dir_in + num); - info->dir_dname_len = (void *)(info->dir_dname + num); - info->dir_dlease = (void *)(info->dir_dname_len + num); - if ((unsigned long)(info->dir_dlease + num) > - (unsigned long)info->dir_in + info->dir_buf_size) { + BUG_ON(!info->dir_entries); + if ((unsigned long)(info->dir_entries + num) > + (unsigned long)info->dir_entries + info->dir_buf_size) { pr_err("dir contents are larger than expected\n"); WARN_ON(1); goto bad; @@ -199,21 +200,23 @@ static int parse_reply_info_dir(void **p, void *end, info->dir_nr = num; while (num) { + struct ceph_mds_reply_dir_entry *rde = info->dir_entries + i; /* dentry */ ceph_decode_need(p, end, sizeof(u32)*2, bad); - info->dir_dname_len[i] = ceph_decode_32(p); - ceph_decode_need(p, end, info->dir_dname_len[i], bad); - info->dir_dname[i] = 
*p; - *p += info->dir_dname_len[i]; - dout("parsed dir dname '%.*s'\n", info->dir_dname_len[i], - info->dir_dname[i]); - info->dir_dlease[i] = *p; + rde->name_len = ceph_decode_32(p); + ceph_decode_need(p, end, rde->name_len, bad); + rde->name = *p; + *p += rde->name_len; + dout("parsed dir dname '%.*s'\n", rde->name_len, rde->name); + rde->lease = *p; *p += sizeof(struct ceph_mds_reply_lease); /* inode */ - err = parse_reply_info_in(p, end, &info->dir_in[i], features); + err = parse_reply_info_in(p, end, &rde->inode, features); if (err < 0) goto out_bad; + /* ceph_readdir_prepopulate() will update it */ + rde->offset = 0; i++; num--; } @@ -345,9 +348,9 @@ out_bad: static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info) { - if (!info->dir_in) + if (!info->dir_entries) return; - free_pages((unsigned long)info->dir_in, get_order(info->dir_buf_size)); + free_pages((unsigned long)info->dir_entries, get_order(info->dir_buf_size)); } @@ -567,51 +570,23 @@ void ceph_mdsc_release_request(struct kref *kref) kfree(req); } +DEFINE_RB_FUNCS(request, struct ceph_mds_request, r_tid, r_node) + /* * lookup session, bump ref if found. * * called under mdsc->mutex. */ -static struct ceph_mds_request *__lookup_request(struct ceph_mds_client *mdsc, - u64 tid) +static struct ceph_mds_request * +lookup_get_request(struct ceph_mds_client *mdsc, u64 tid) { struct ceph_mds_request *req; - struct rb_node *n = mdsc->request_tree.rb_node; - - while (n) { - req = rb_entry(n, struct ceph_mds_request, r_node); - if (tid < req->r_tid) - n = n->rb_left; - else if (tid > req->r_tid) - n = n->rb_right; - else { - ceph_mdsc_get_request(req); - return req; - } - } - return NULL; -} -static void __insert_request(struct ceph_mds_client *mdsc, - struct ceph_mds_request *new) -{ - struct rb_node **p = &mdsc->request_tree.rb_node; - struct rb_node *parent = NULL; - struct ceph_mds_request *req = NULL; + req = lookup_request(&mdsc->request_tree, tid); + if (req) + ceph_mdsc_get_request(req); - while (*p) { - parent = *p; - req = rb_entry(parent, struct ceph_mds_request, r_node); - if (new->r_tid < req->r_tid) - p = &(*p)->rb_left; - else if (new->r_tid > req->r_tid) - p = &(*p)->rb_right; - else - BUG(); - } - - rb_link_node(&new->r_node, parent, p); - rb_insert_color(&new->r_node, &mdsc->request_tree); + return req; } /* @@ -630,7 +605,7 @@ static void __register_request(struct ceph_mds_client *mdsc, req->r_num_caps); dout("__register_request %p tid %lld\n", req, req->r_tid); ceph_mdsc_get_request(req); - __insert_request(mdsc, req); + insert_request(&mdsc->request_tree, req); req->r_uid = current_fsuid(); req->r_gid = current_fsgid(); @@ -663,8 +638,7 @@ static void __unregister_request(struct ceph_mds_client *mdsc, } } - rb_erase(&req->r_node, &mdsc->request_tree); - RB_CLEAR_NODE(&req->r_node); + erase_request(&mdsc->request_tree, req); if (req->r_unsafe_dir && req->r_got_unsafe) { struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir); @@ -868,12 +842,14 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6 int metadata_bytes = 0; int metadata_key_count = 0; struct ceph_options *opt = mdsc->fsc->client->options; + struct ceph_mount_options *fsopt = mdsc->fsc->mount_options; void *p; const char* metadata[][2] = { {"hostname", utsname()->nodename}, {"kernel_version", utsname()->release}, - {"entity_id", opt->name ? opt->name : ""}, + {"entity_id", opt->name ? : ""}, + {"root", fsopt->server_path ? 
: "/"}, {NULL, NULL} }; @@ -1149,9 +1125,11 @@ out: static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) { + struct ceph_fs_client *fsc = (struct ceph_fs_client *)arg; struct ceph_inode_info *ci = ceph_inode(inode); LIST_HEAD(to_remove); - int drop = 0; + bool drop = false; + bool invalidate = false; dout("removing cap %p, ci is %p, inode is %p\n", cap, ci, &ci->vfs_inode); @@ -1159,8 +1137,13 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, __ceph_remove_cap(cap, false); if (!ci->i_auth_cap) { struct ceph_cap_flush *cf; - struct ceph_mds_client *mdsc = - ceph_sb_to_client(inode->i_sb)->mdsc; + struct ceph_mds_client *mdsc = fsc->mdsc; + + ci->i_ceph_flags |= CEPH_I_CAP_DROPPED; + + if (ci->i_wrbuffer_ref > 0 && + ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) + invalidate = true; while (true) { struct rb_node *n = rb_first(&ci->i_cap_flush_tree); @@ -1183,7 +1166,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, inode, ceph_ino(inode)); ci->i_dirty_caps = 0; list_del_init(&ci->i_dirty_item); - drop = 1; + drop = true; } if (!list_empty(&ci->i_flushing_item)) { pr_warn_ratelimited( @@ -1193,7 +1176,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, ci->i_flushing_caps = 0; list_del_init(&ci->i_flushing_item); mdsc->num_cap_flushing--; - drop = 1; + drop = true; } spin_unlock(&mdsc->cap_dirty_lock); @@ -1210,7 +1193,11 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, list_del(&cf->list); ceph_free_cap_flush(cf); } - while (drop--) + + wake_up_all(&ci->i_cap_wq); + if (invalidate) + ceph_queue_invalidate(inode); + if (drop) iput(inode); return 0; } @@ -1220,12 +1207,13 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, */ static void remove_session_caps(struct ceph_mds_session *session) { + struct ceph_fs_client *fsc = session->s_mdsc->fsc; + struct super_block *sb = fsc->sb; dout("remove_session_caps on %p\n", session); - iterate_session_caps(session, remove_session_caps_cb, NULL); + iterate_session_caps(session, remove_session_caps_cb, fsc); spin_lock(&session->s_cap_lock); if (session->s_nr_caps > 0) { - struct super_block *sb = session->s_mdsc->fsc->sb; struct inode *inode; struct ceph_cap *cap, *prev = NULL; struct ceph_vino vino; @@ -1270,13 +1258,13 @@ static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap, { struct ceph_inode_info *ci = ceph_inode(inode); - wake_up_all(&ci->i_cap_wq); if (arg) { spin_lock(&ci->i_ceph_lock); ci->i_wanted_max_size = 0; ci->i_requested_max_size = 0; spin_unlock(&ci->i_ceph_lock); } + wake_up_all(&ci->i_cap_wq); return 0; } @@ -1671,8 +1659,7 @@ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req, struct ceph_inode_info *ci = ceph_inode(dir); struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; struct ceph_mount_options *opt = req->r_mdsc->fsc->mount_options; - size_t size = sizeof(*rinfo->dir_in) + sizeof(*rinfo->dir_dname_len) + - sizeof(*rinfo->dir_dname) + sizeof(*rinfo->dir_dlease); + size_t size = sizeof(struct ceph_mds_reply_dir_entry); int order, num_entries; spin_lock(&ci->i_ceph_lock); @@ -1683,14 +1670,14 @@ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req, order = get_order(size * num_entries); while (order >= 0) { - rinfo->dir_in = (void*)__get_free_pages(GFP_KERNEL | - __GFP_NOWARN, - order); - if (rinfo->dir_in) + rinfo->dir_entries = (void*)__get_free_pages(GFP_KERNEL | + __GFP_NOWARN, + 
order); + if (rinfo->dir_entries) break; order--; } - if (!rinfo->dir_in) + if (!rinfo->dir_entries) return -ENOMEM; num_entries = (PAGE_SIZE << order) / size; @@ -1722,6 +1709,7 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode) INIT_LIST_HEAD(&req->r_unsafe_target_item); req->r_fmode = -1; kref_init(&req->r_kref); + RB_CLEAR_NODE(&req->r_node); INIT_LIST_HEAD(&req->r_wait); init_completion(&req->r_completion); init_completion(&req->r_safe_completion); @@ -2414,7 +2402,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) /* get request, session */ tid = le64_to_cpu(msg->hdr.tid); mutex_lock(&mdsc->mutex); - req = __lookup_request(mdsc, tid); + req = lookup_get_request(mdsc, tid); if (!req) { dout("handle_reply on unknown tid %llu\n", tid); mutex_unlock(&mdsc->mutex); @@ -2604,7 +2592,7 @@ static void handle_forward(struct ceph_mds_client *mdsc, fwd_seq = ceph_decode_32(&p); mutex_lock(&mdsc->mutex); - req = __lookup_request(mdsc, tid); + req = lookup_get_request(mdsc, tid); if (!req) { dout("forward tid %llu to mds%d - req dne\n", tid, next_mds); goto out; /* dup reply? */ @@ -3216,7 +3204,7 @@ static void handle_lease(struct ceph_mds_client *mdsc, WARN_ON(1); goto release; /* hrm... */ } - dname.hash = full_name_hash(dname.name, dname.len); + dname.hash = full_name_hash(parent, dname.name, dname.len); dentry = d_lookup(parent, &dname); dput(parent); if (!dentry) diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index ee69a537dba5..e7d38aac7109 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -47,6 +47,14 @@ struct ceph_mds_reply_info_in { u32 pool_ns_len; }; +struct ceph_mds_reply_dir_entry { + char *name; + u32 name_len; + struct ceph_mds_reply_lease *lease; + struct ceph_mds_reply_info_in inode; + loff_t offset; +}; + /* * parsed info about an mds reply, including information about * either: 1) the target inode and/or its parent directory and dentry, @@ -73,11 +81,10 @@ struct ceph_mds_reply_info_parsed { struct ceph_mds_reply_dirfrag *dir_dir; size_t dir_buf_size; int dir_nr; - char **dir_dname; - u32 *dir_dname_len; - struct ceph_mds_reply_lease **dir_dlease; - struct ceph_mds_reply_info_in *dir_in; - u8 dir_complete, dir_end; + bool dir_complete; + bool dir_end; + bool hash_order; + struct ceph_mds_reply_dir_entry *dir_entries; }; /* for create results */ diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c index 261531e55e9d..8c3591a7fbae 100644 --- a/fs/ceph/mdsmap.c +++ b/fs/ceph/mdsmap.c @@ -54,16 +54,21 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) const void *start = *p; int i, j, n; int err = -EINVAL; - u16 version; + u8 mdsmap_v, mdsmap_cv; m = kzalloc(sizeof(*m), GFP_NOFS); if (m == NULL) return ERR_PTR(-ENOMEM); - ceph_decode_16_safe(p, end, version, bad); - if (version > 3) { - pr_warn("got mdsmap version %d > 3, failing", version); - goto bad; + ceph_decode_need(p, end, 1 + 1, bad); + mdsmap_v = ceph_decode_8(p); + mdsmap_cv = ceph_decode_8(p); + if (mdsmap_v >= 4) { + u32 mdsmap_len; + ceph_decode_32_safe(p, end, mdsmap_len, bad); + if (end < *p + mdsmap_len) + goto bad; + end = *p + mdsmap_len; } ceph_decode_need(p, end, 8*sizeof(u32) + sizeof(u64), bad); @@ -87,16 +92,29 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) u32 namelen; s32 mds, inc, state; u64 state_seq; - u8 infoversion; + u8 info_v; + void *info_end = NULL; struct ceph_entity_addr addr; u32 num_export_targets; void *pexport_targets = NULL; struct ceph_timespec laggy_since; struct ceph_mds_info 
*info; - ceph_decode_need(p, end, sizeof(u64)*2 + 1 + sizeof(u32), bad); + ceph_decode_need(p, end, sizeof(u64) + 1, bad); global_id = ceph_decode_64(p); - infoversion = ceph_decode_8(p); + info_v = ceph_decode_8(p); + if (info_v >= 4) { + u32 info_len; + u8 info_cv; + ceph_decode_need(p, end, 1 + sizeof(u32), bad); + info_cv = ceph_decode_8(p); + info_len = ceph_decode_32(p); + info_end = *p + info_len; + if (info_end > end) + goto bad; + } + + ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad); *p += sizeof(u64); namelen = ceph_decode_32(p); /* skip mds name */ *p += namelen; @@ -115,7 +133,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) *p += sizeof(u32); ceph_decode_32_safe(p, end, namelen, bad); *p += namelen; - if (infoversion >= 2) { + if (info_v >= 2) { ceph_decode_32_safe(p, end, num_export_targets, bad); pexport_targets = *p; *p += num_export_targets * sizeof(u32); @@ -123,6 +141,12 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) num_export_targets = 0; } + if (info_end && *p != info_end) { + if (*p > info_end) + goto bad; + *p = info_end; + } + dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n", i+1, n, global_id, mds, inc, ceph_pr_addr(&addr.in_addr), @@ -163,6 +187,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) m->m_cas_pg_pool = ceph_decode_64(p); /* ok, we don't care about the rest. */ + *p = end; dout("mdsmap_decode success epoch %u\n", m->m_epoch); return m; diff --git a/fs/ceph/super.c b/fs/ceph/super.c index f12d5e2955c2..91e02481ce06 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -108,6 +108,7 @@ static int ceph_sync_fs(struct super_block *sb, int wait) * mount options */ enum { + Opt_mds_namespace, Opt_wsize, Opt_rsize, Opt_rasize, @@ -143,6 +144,7 @@ enum { }; static match_table_t fsopt_tokens = { + {Opt_mds_namespace, "mds_namespace=%d"}, {Opt_wsize, "wsize=%d"}, {Opt_rsize, "rsize=%d"}, {Opt_rasize, "rasize=%d"}, @@ -212,6 +214,9 @@ static int parse_fsopt_token(char *c, void *private) break; /* misc */ + case Opt_mds_namespace: + fsopt->mds_namespace = intval; + break; case Opt_wsize: fsopt->wsize = intval; break; @@ -297,6 +302,7 @@ static void destroy_mount_options(struct ceph_mount_options *args) { dout("destroy_mount_options %p\n", args); kfree(args->snapdir_name); + kfree(args->server_path); kfree(args); } @@ -328,14 +334,17 @@ static int compare_mount_options(struct ceph_mount_options *new_fsopt, if (ret) return ret; + ret = strcmp_null(fsopt1->server_path, fsopt2->server_path); + if (ret) + return ret; + return ceph_compare_options(new_opt, fsc->client); } static int parse_mount_options(struct ceph_mount_options **pfsopt, struct ceph_options **popt, int flags, char *options, - const char *dev_name, - const char **path) + const char *dev_name) { struct ceph_mount_options *fsopt; const char *dev_name_end; @@ -367,6 +376,7 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt, fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT; fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT; fsopt->congestion_kb = default_congestion_kb(); + fsopt->mds_namespace = CEPH_FS_CLUSTER_ID_NONE; /* * Distinguish the server list from the path in "dev_name".
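With this change the path half of "dev_name" is kept in fsopt->server_path (leading '/' included) instead of being handed back through an out parameter. A userspace sketch of the same split, assuming the usual mon1,mon2:/sub/dir device-name shape:

#define _POSIX_C_SOURCE 200809L
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* split "mon1,mon2:/sub/dir" into the monitor list and the server path;
 * like parse_mount_options() above, the stored path keeps its leading '/' */
static int split_dev_name(const char *dev_name, char **mons, char **path)
{
	const char *slash = strchr(dev_name, '/');
	const char *end = slash ? slash : dev_name + strlen(dev_name);

	if (end == dev_name || end[-1] != ':')
		return -1; /* missing the ':' separator before the path */
	*mons = strndup(dev_name, end - 1 - dev_name);
	*path = slash ? strdup(slash) : NULL; /* NULL means "mount the root" */
	return 0;
}

int main(void)
{
	char *mons = NULL, *path = NULL;
	if (split_dev_name("mon1,mon2:/backups", &mons, &path) == 0)
		printf("mons='%s' path='%s'\n", mons, path ? path : "/");
	free(mons);
	free(path);
	return 0;
}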
@@ -380,12 +390,13 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt, */ dev_name_end = strchr(dev_name, '/'); if (dev_name_end) { - /* skip over leading '/' for path */ - *path = dev_name_end + 1; + fsopt->server_path = kstrdup(dev_name_end, GFP_KERNEL); + if (!fsopt->server_path) { + err = -ENOMEM; + goto out; + } } else { - /* path is empty */ dev_name_end = dev_name + strlen(dev_name); - *path = dev_name_end; } err = -EINVAL; dev_name_end--; /* back up to ':' separator */ @@ -395,7 +406,8 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt, goto out; } dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name); - dout("server path '%s'\n", *path); + if (fsopt->server_path) + dout("server path '%s'\n", fsopt->server_path); *popt = ceph_parse_options(options, dev_name, dev_name_end, parse_fsopt_token, (void *)fsopt); @@ -457,6 +469,8 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) seq_puts(m, ",noacl"); #endif + if (fsopt->mds_namespace != CEPH_FS_CLUSTER_ID_NONE) + seq_printf(m, ",mds_namespace=%d", fsopt->mds_namespace); if (fsopt->wsize) seq_printf(m, ",wsize=%d", fsopt->wsize); if (fsopt->rsize != CEPH_RSIZE_DEFAULT) @@ -511,9 +525,8 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, { struct ceph_fs_client *fsc; const u64 supported_features = - CEPH_FEATURE_FLOCK | - CEPH_FEATURE_DIRLAYOUTHASH | - CEPH_FEATURE_MDS_INLINE_DATA; + CEPH_FEATURE_FLOCK | CEPH_FEATURE_DIRLAYOUTHASH | + CEPH_FEATURE_MDSENC | CEPH_FEATURE_MDS_INLINE_DATA; const u64 required_features = 0; int page_count; size_t size; @@ -530,6 +543,7 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, goto fail; } fsc->client->extra_mon_dispatch = extra_mon_dispatch; + fsc->client->monc.fs_cluster_id = fsopt->mds_namespace; ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP, 0, true); fsc->mount_options = fsopt; @@ -785,8 +799,7 @@ out: /* * mount: join the ceph cluster, and open root directory. 
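The ceph_real_mount() rewrite that follows drops the threaded path argument: a NULL fsopt->server_path selects the superblock root, and otherwise the stored path is opened with its leading '/' skipped. A tiny model of that convention (open_root_dentry() itself is kernel-internal):

#include <stdio.h>

/* NULL server_path: use sb->s_root; otherwise skip the leading '/' */
static const char *path_to_open(const char *server_path)
{
	return server_path ? server_path + 1 : NULL;
}

int main(void)
{
	printf("open_root_dentry gets '%s'\n", path_to_open("/backups")); /* "backups" */
	printf("root mount: %s\n", path_to_open(NULL) ? "path" : "sb->s_root");
	return 0;
}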
*/ -static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc, - const char *path) +static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc) { int err; unsigned long started = jiffies; /* note the start time */ @@ -815,11 +828,12 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc, goto fail; } - if (path[0] == 0) { + if (!fsc->mount_options->server_path) { root = fsc->sb->s_root; dget(root); } else { - dout("mount opening base mountpoint\n"); + const char *path = fsc->mount_options->server_path + 1; + dout("mount opening path %s\n", path); root = open_root_dentry(fsc, path, started); if (IS_ERR(root)) { err = PTR_ERR(root); @@ -935,7 +949,6 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type, struct dentry *res; int err; int (*compare_super)(struct super_block *, void *) = ceph_compare_super; - const char *path = NULL; struct ceph_mount_options *fsopt = NULL; struct ceph_options *opt = NULL; @@ -944,7 +957,7 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type, #ifdef CONFIG_CEPH_FS_POSIX_ACL flags |= MS_POSIXACL; #endif - err = parse_mount_options(&fsopt, &opt, flags, data, dev_name, &path); + err = parse_mount_options(&fsopt, &opt, flags, data, dev_name); if (err < 0) { res = ERR_PTR(err); goto out_final; @@ -987,7 +1000,7 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type, } } - res = ceph_real_mount(fsc, path); + res = ceph_real_mount(fsc); if (IS_ERR(res)) goto out_splat; dout("root %p inode %p ino %llx.%llx\n", res, diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 7b99eb756477..0168b49fb6ad 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -62,6 +62,7 @@ struct ceph_mount_options { int cap_release_safety; int max_readdir; /* max readdir result (entries) */ int max_readdir_bytes; /* max readdir result (bytes) */ + int mds_namespace; /* * everything above this point can be memcmp'd; everything below @@ -69,6 +70,7 @@ struct ceph_mount_options { */ char *snapdir_name; /* default ".snap" */ + char *server_path; /* default "/" */ }; struct ceph_fs_client { @@ -101,7 +103,6 @@ struct ceph_fs_client { #ifdef CONFIG_CEPH_FSCACHE struct fscache_cookie *fscache; - struct workqueue_struct *revalidate_wq; #endif }; @@ -295,6 +296,7 @@ struct ceph_inode_info { u64 i_files, i_subdirs; struct rb_root i_fragtree; + int i_fragtree_nsplits; struct mutex i_fragtree_mutex; struct ceph_inode_xattrs_info i_xattrs; @@ -357,8 +359,7 @@ struct ceph_inode_info { #ifdef CONFIG_CEPH_FSCACHE struct fscache_cookie *fscache; - u32 i_fscache_gen; /* sequence, for delayed fscache validate */ - struct work_struct i_revalidate_work; + u32 i_fscache_gen; #endif struct inode vfs_inode; /* at end */ }; @@ -469,6 +470,7 @@ static inline struct inode *ceph_find_inode(struct super_block *sb, #define CEPH_I_POOL_RD (1 << 5) /* can read from pool */ #define CEPH_I_POOL_WR (1 << 6) /* can write to pool */ #define CEPH_I_SEC_INITED (1 << 7) /* security initialized */ +#define CEPH_I_CAP_DROPPED (1 << 8) /* caps were forcibly dropped */ static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci, long long release_count, @@ -537,11 +539,6 @@ static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry) return (struct ceph_dentry_info *)dentry->d_fsdata; } -static inline loff_t ceph_make_fpos(unsigned frag, unsigned off) -{ - return ((loff_t)frag << 32) | (loff_t)off; -} - /* * caps helpers */ @@ -632,7 +629,6 @@ struct ceph_file_info { struct ceph_mds_request *last_readdir; /* readdir: position within a frag */ -
unsigned offset; /* offset of last chunk, adjusted for . and .. */ unsigned next_offset; /* offset of next chunk (last_name's + 1) */ char *last_name; /* last entry in previous chunk */ long long dir_release_count; @@ -927,6 +923,7 @@ extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc); /* file.c */ extern const struct file_operations ceph_file_fops; +extern int ceph_renew_caps(struct inode *inode); extern int ceph_open(struct inode *inode, struct file *file); extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry, struct file *file, unsigned flags, umode_t mode, @@ -942,6 +939,7 @@ extern const struct inode_operations ceph_snapdir_iops; extern const struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops, ceph_snapdir_dentry_ops; +extern loff_t ceph_make_fpos(unsigned high, unsigned off, bool hash_order); extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry); extern int ceph_handle_snapdir(struct ceph_mds_request *req, struct dentry *dentry, int err); diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 0d66722c6a52..4870b29df224 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -77,7 +77,7 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val, char buf[128]; dout("ceph_vxattrcb_layout %p\n", &ci->vfs_inode); - down_read(&osdc->map_sem); + down_read(&osdc->lock); pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool); if (pool_name) { size_t len = strlen(pool_name); @@ -109,7 +109,7 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val, ret = -ERANGE; } } - up_read(&osdc->map_sem); + up_read(&osdc->lock); return ret; } @@ -143,13 +143,13 @@ static size_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci, s64 pool = ceph_file_layout_pg_pool(ci->i_layout); const char *pool_name; - down_read(&osdc->map_sem); + down_read(&osdc->lock); pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool); if (pool_name) ret = snprintf(val, size, "%s", pool_name); else ret = snprintf(val, size, "%lld", (unsigned long long)pool); - up_read(&osdc->map_sem); + up_read(&osdc->lock); return ret; } @@ -862,6 +862,7 @@ static int ceph_sync_setxattr(struct inode *inode, const char *name, struct ceph_mds_request *req; struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_pagelist *pagelist = NULL; + int op = CEPH_MDS_OP_SETXATTR; int err; if (size > 0) { @@ -875,20 +876,21 @@ static int ceph_sync_setxattr(struct inode *inode, const char *name, if (err) goto out; } else if (!value) { - flags |= CEPH_XATTR_REMOVE; + if (flags & CEPH_XATTR_REPLACE) + op = CEPH_MDS_OP_RMXATTR; + else + flags |= CEPH_XATTR_REMOVE; } dout("setxattr value=%.*s\n", (int)size, value); /* do request */ - req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETXATTR, - USE_AUTH_MDS); + req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); if (IS_ERR(req)) { err = PTR_ERR(req); goto out; } - req->r_args.setxattr.flags = cpu_to_le32(flags); req->r_path2 = kstrdup(name, GFP_NOFS); if (!req->r_path2) { ceph_mdsc_put_request(req); @@ -896,8 +898,11 @@ static int ceph_sync_setxattr(struct inode *inode, const char *name, goto out; } - req->r_pagelist = pagelist; - pagelist = NULL; + if (op == CEPH_MDS_OP_SETXATTR) { + req->r_args.setxattr.flags = cpu_to_le32(flags); + req->r_pagelist = pagelist; + pagelist = NULL; + } req->r_inode = inode; ihold(inode); @@ -1051,12 +1056,13 @@ static int ceph_get_xattr_handler(const struct xattr_handler *handler, } static int ceph_set_xattr_handler(const struct xattr_handler *handler, - struct dentry *dentry, 
const char *name, - const void *value, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { if (!ceph_is_valid_xattr(name)) return -EOPNOTSUPP; - return __ceph_setxattr(d_inode(dentry), name, value, size, flags); + return __ceph_setxattr(inode, name, value, size, flags); } const struct xattr_handler ceph_other_xattr_handler = { diff --git a/fs/char_dev.c b/fs/char_dev.c index 24b142569ca9..6edd825231c5 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -91,6 +91,10 @@ __register_chrdev_region(unsigned int major, unsigned int baseminor, break; } + if (i < CHRDEV_MAJOR_DYN_END) + pr_warn("CHRDEV \"%s\" major number %d goes below the dynamic allocation range\n", + name, i); + if (i == 0) { ret = -EBUSY; goto out; diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index 788e19195991..6c58e13fed2f 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -244,7 +244,6 @@ static int cifs_debug_data_proc_open(struct inode *inode, struct file *file) } static const struct file_operations cifs_debug_data_proc_fops = { - .owner = THIS_MODULE, .open = cifs_debug_data_proc_open, .read = seq_read, .llseek = seq_lseek, @@ -361,7 +360,6 @@ static int cifs_stats_proc_open(struct inode *inode, struct file *file) } static const struct file_operations cifs_stats_proc_fops = { - .owner = THIS_MODULE, .open = cifs_stats_proc_open, .read = seq_read, .llseek = seq_lseek, @@ -447,7 +445,6 @@ static ssize_t cifsFYI_proc_write(struct file *file, const char __user *buffer, } static const struct file_operations cifsFYI_proc_fops = { - .owner = THIS_MODULE, .open = cifsFYI_proc_open, .read = seq_read, .llseek = seq_lseek, @@ -479,7 +476,6 @@ static ssize_t cifs_linux_ext_proc_write(struct file *file, } static const struct file_operations cifs_linux_ext_proc_fops = { - .owner = THIS_MODULE, .open = cifs_linux_ext_proc_open, .read = seq_read, .llseek = seq_lseek, @@ -511,7 +507,6 @@ static ssize_t cifs_lookup_cache_proc_write(struct file *file, } static const struct file_operations cifs_lookup_cache_proc_fops = { - .owner = THIS_MODULE, .open = cifs_lookup_cache_proc_open, .read = seq_read, .llseek = seq_lseek, @@ -543,7 +538,6 @@ static ssize_t traceSMB_proc_write(struct file *file, const char __user *buffer, } static const struct file_operations traceSMB_proc_fops = { - .owner = THIS_MODULE, .open = traceSMB_proc_open, .read = seq_read, .llseek = seq_lseek, @@ -655,7 +649,6 @@ static ssize_t cifs_security_flags_proc_write(struct file *file, } static const struct file_operations cifs_security_flags_proc_fops = { - .owner = THIS_MODULE, .open = cifs_security_flags_proc_open, .read = seq_read, .llseek = seq_lseek, diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c index 6908080e9b6d..b611fc2e8984 100644 --- a/fs/cifs/cifs_spnego.c +++ b/fs/cifs/cifs_spnego.c @@ -24,10 +24,13 @@ #include <linux/string.h> #include <keys/user-type.h> #include <linux/key-type.h> +#include <linux/keyctl.h> #include <linux/inet.h> #include "cifsglob.h" #include "cifs_spnego.h" #include "cifs_debug.h" +#include "cifsproto.h" +static const struct cred *spnego_cred; /* create a new cifs key */ static int @@ -102,6 +105,7 @@ cifs_get_spnego_key(struct cifs_ses *sesInfo) size_t desc_len; struct key *spnego_key; const char *hostname = server->hostname; + const struct cred *saved_cred; /* length of fields (with semicolons): ver=0xyz ip4=ipaddress host=hostname sec=mechanism uid=0xFF user=username */ @@ -163,7 +167,9 @@ cifs_get_spnego_key(struct 
cifs_ses *sesInfo) sprintf(dp, ";pid=0x%x", current->pid); cifs_dbg(FYI, "key description = %s\n", description); + saved_cred = override_creds(spnego_cred); spnego_key = request_key(&cifs_spnego_key_type, description, ""); + revert_creds(saved_cred); #ifdef CONFIG_CIFS_DEBUG2 if (cifsFYI && !IS_ERR(spnego_key)) { @@ -177,3 +183,64 @@ out: kfree(description); return spnego_key; } + +int +init_cifs_spnego(void) +{ + struct cred *cred; + struct key *keyring; + int ret; + + cifs_dbg(FYI, "Registering the %s key type\n", + cifs_spnego_key_type.name); + + /* + * Create an override credential set with special thread keyring for + * spnego upcalls. + */ + + cred = prepare_kernel_cred(NULL); + if (!cred) + return -ENOMEM; + + keyring = keyring_alloc(".cifs_spnego", + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred, + (KEY_POS_ALL & ~KEY_POS_SETATTR) | + KEY_USR_VIEW | KEY_USR_READ, + KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL); + if (IS_ERR(keyring)) { + ret = PTR_ERR(keyring); + goto failed_put_cred; + } + + ret = register_key_type(&cifs_spnego_key_type); + if (ret < 0) + goto failed_put_key; + + /* + * instruct request_key() to use this special keyring as a cache for + * the results it looks up + */ + set_bit(KEY_FLAG_ROOT_CAN_CLEAR, &keyring->flags); + cred->thread_keyring = keyring; + cred->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING; + spnego_cred = cred; + + cifs_dbg(FYI, "cifs spnego keyring: %d\n", key_serial(keyring)); + return 0; + +failed_put_key: + key_put(keyring); +failed_put_cred: + put_cred(cred); + return ret; +} + +void +exit_cifs_spnego(void) +{ + key_revoke(spnego_cred->thread_keyring); + unregister_key_type(&cifs_spnego_key_type); + put_cred(spnego_cred); + cifs_dbg(FYI, "Unregistered %s key type\n", cifs_spnego_key_type.name); +} diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c index 5a53ac6b1e02..02b071bf3732 100644 --- a/fs/cifs/cifs_unicode.c +++ b/fs/cifs/cifs_unicode.c @@ -101,6 +101,12 @@ convert_sfm_char(const __u16 src_char, char *target) case SFM_SLASH: *target = '\\'; break; + case SFM_SPACE: + *target = ' '; + break; + case SFM_PERIOD: + *target = '.'; + break; default: return false; } @@ -404,7 +410,7 @@ static __le16 convert_to_sfu_char(char src_char) return dest_char; } -static __le16 convert_to_sfm_char(char src_char) +static __le16 convert_to_sfm_char(char src_char, bool end_of_string) { __le16 dest_char; @@ -427,6 +433,18 @@ static __le16 convert_to_sfm_char(char src_char) case '|': dest_char = cpu_to_le16(SFM_PIPE); break; + case '.': + if (end_of_string) + dest_char = cpu_to_le16(SFM_PERIOD); + else + dest_char = 0; + break; + case ' ': + if (end_of_string) + dest_char = cpu_to_le16(SFM_SPACE); + else + dest_char = 0; + break; default: dest_char = 0; } @@ -469,9 +487,16 @@ cifsConvertToUTF16(__le16 *target, const char *source, int srclen, /* see if we must remap this char */ if (map_chars == SFU_MAP_UNI_RSVD) dst_char = convert_to_sfu_char(src_char); - else if (map_chars == SFM_MAP_UNI_RSVD) - dst_char = convert_to_sfm_char(src_char); - else + else if (map_chars == SFM_MAP_UNI_RSVD) { + bool end_of_string; + + if (i == srclen - 1) + end_of_string = true; + else + end_of_string = false; + + dst_char = convert_to_sfm_char(src_char, end_of_string); + } else dst_char = 0; /* * FIXME: We can not handle remapping backslash (UNI_SLASH) diff --git a/fs/cifs/cifs_unicode.h b/fs/cifs/cifs_unicode.h index bdc52cb9a676..479bc0a941f3 100644 --- a/fs/cifs/cifs_unicode.h +++ b/fs/cifs/cifs_unicode.h @@ -64,6 +64,8 @@ #define SFM_LESSTHAN ((__u16) 0xF023) #define SFM_PIPE 
((__u16) 0xF027) #define SFM_SLASH ((__u16) 0xF026) +#define SFM_PERIOD ((__u16) 0xF028) +#define SFM_SPACE ((__u16) 0xF029) /* * Mapping mechanism to use when one of the seven reserved characters is diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 3f93125916bf..71e8a56e9479 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -360,7 +360,7 @@ init_cifs_idmap(void) GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred, (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_VIEW | KEY_USR_READ, - KEY_ALLOC_NOT_IN_QUOTA, NULL); + KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL); if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); goto failed_put_cred; diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 08fa36e5b2bc..5d841f39c4b7 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -87,6 +87,7 @@ extern mempool_t *cifs_req_poolp; extern mempool_t *cifs_mid_poolp; struct workqueue_struct *cifsiod_wq; +__u32 cifs_lock_secret; /* * Bumps refcount for cifs super block. @@ -890,7 +891,6 @@ const struct inode_operations cifs_dir_inode_ops = { .rmdir = cifs_rmdir, .rename2 = cifs_rename2, .permission = cifs_permission, -/* revalidate:cifs_revalidate, */ .setattr = cifs_setattr, .symlink = cifs_symlink, .mknod = cifs_mknod, @@ -901,9 +901,8 @@ const struct inode_operations cifs_dir_inode_ops = { }; const struct inode_operations cifs_file_inode_ops = { -/* revalidate:cifs_revalidate, */ .setattr = cifs_setattr, - .getattr = cifs_getattr, /* do we need this anymore? */ + .getattr = cifs_getattr, .permission = cifs_permission, .setxattr = generic_setxattr, .getxattr = generic_getxattr, @@ -915,9 +914,6 @@ const struct inode_operations cifs_symlink_inode_ops = { .readlink = generic_readlink, .get_link = cifs_get_link, .permission = cifs_permission, - /* BB add the following two eventually */ - /* revalidate: cifs_revalidate, - setattr: cifs_notify_change, *//* BB do we need notify change */ .setxattr = generic_setxattr, .getxattr = generic_getxattr, .listxattr = cifs_listxattr, @@ -1271,6 +1267,8 @@ init_cifs(void) spin_lock_init(&cifs_file_list_lock); spin_lock_init(&GlobalMid_Lock); + get_random_bytes(&cifs_lock_secret, sizeof(cifs_lock_secret)); + if (cifs_max_pending < 2) { cifs_max_pending = 2; cifs_dbg(FYI, "cifs_max_pending set to min of 2\n"); @@ -1303,7 +1301,7 @@ init_cifs(void) goto out_destroy_mids; #ifdef CONFIG_CIFS_UPCALL - rc = register_key_type(&cifs_spnego_key_type); + rc = init_cifs_spnego(); if (rc) goto out_destroy_request_bufs; #endif /* CONFIG_CIFS_UPCALL */ @@ -1326,7 +1324,7 @@ out_init_cifs_idmap: out_register_key_type: #endif #ifdef CONFIG_CIFS_UPCALL - unregister_key_type(&cifs_spnego_key_type); + exit_cifs_spnego(); out_destroy_request_bufs: #endif cifs_destroy_request_bufs(); diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index bba106cdc43c..8f1d8c1e72be 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -1619,6 +1619,7 @@ void cifs_oplock_break(struct work_struct *work); extern const struct slow_work_ops cifs_oplock_break_ops; extern struct workqueue_struct *cifsiod_wq; +extern __u32 cifs_lock_secret; extern mempool_t *cifs_mid_poolp; diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 0f9a6bc4ba43..1243bd326591 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -58,6 +58,8 @@ do { \ } while (0) extern int init_cifs_idmap(void); extern void exit_cifs_idmap(void); +extern int init_cifs_spnego(void); +extern void exit_cifs_spnego(void); extern char *build_path_from_dentry(struct dentry *); extern char *cifs_build_path_to_root(struct smb_vol *vol, struct cifs_sb_info 
*cifs_sb, diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 66736f57b5ab..7d2b15c06090 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -428,7 +428,9 @@ cifs_echo_request(struct work_struct *work) * server->ops->need_neg() == true. Also, no need to ping if * we got a response recently. */ - if (!server->ops->need_neg || server->ops->need_neg(server) || + + if (server->tcpStatus == CifsNeedReconnect || + server->tcpStatus == CifsExiting || server->tcpStatus == CifsNew || (server->ops->can_echo && !server->ops->can_echo(server)) || time_before(jiffies, server->lstrp + echo_interval - HZ)) goto requeue_echo; diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index c3eb998a99bd..5efd6f1fb8a5 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -445,7 +445,7 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry, * Check for hashed negative dentry. We have already revalidated * the dentry and it is fine. No need to perform another lookup. */ - if (!d_unhashed(direntry)) + if (!d_in_lookup(direntry)) return -ENOENT; res = cifs_lookup(inode, direntry, 0); @@ -856,7 +856,7 @@ static int cifs_ci_hash(const struct dentry *dentry, struct qstr *q) wchar_t c; int i, charlen; - hash = init_name_hash(); + hash = init_name_hash(dentry); for (i = 0; i < q->len; i += charlen) { charlen = codepage->char2uni(&q->name[i], q->len - i, &c); /* error out if we can't convert the character */ diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 9793ae0bcaa2..579e41b350a2 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1112,6 +1112,12 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile) return rc; } +static __u32 +hash_lockowner(fl_owner_t owner) +{ + return cifs_lock_secret ^ hash32_ptr((const void *)owner); +} + struct lock_to_push { struct list_head llist; __u64 offset; @@ -1178,7 +1184,7 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile) else type = CIFS_WRLCK; lck = list_entry(el, struct lock_to_push, llist); - lck->pid = flock->fl_pid; + lck->pid = hash_lockowner(flock->fl_owner); lck->netfid = cfile->fid.netfid; lck->length = length; lck->type = type; @@ -1305,7 +1311,8 @@ cifs_getlk(struct file *file, struct file_lock *flock, __u32 type, posix_lock_type = CIFS_RDLCK; else posix_lock_type = CIFS_WRLCK; - rc = CIFSSMBPosixLock(xid, tcon, netfid, current->tgid, + rc = CIFSSMBPosixLock(xid, tcon, netfid, + hash_lockowner(flock->fl_owner), flock->fl_start, length, flock, posix_lock_type, wait_flag); return rc; @@ -1505,7 +1512,8 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type, posix_lock_type = CIFS_UNLCK; rc = CIFSSMBPosixLock(xid, tcon, cfile->fid.netfid, - current->tgid, flock->fl_start, length, + hash_lockowner(flock->fl_owner), + flock->fl_start, length, NULL, posix_lock_type, wait_flag); goto out; } @@ -3358,7 +3366,7 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list, struct page *page, *tpage; unsigned int expected_index; int rc; - gfp_t gfp = mapping_gfp_constraint(mapping, GFP_KERNEL); + gfp_t gfp = readahead_gfp_mask(mapping); INIT_LIST_HEAD(tmplist); diff --git a/fs/cifs/ntlmssp.h b/fs/cifs/ntlmssp.h index 848249fa120f..3079b38f0afb 100644 --- a/fs/cifs/ntlmssp.h +++ b/fs/cifs/ntlmssp.h @@ -133,6 +133,6 @@ typedef struct _AUTHENTICATE_MESSAGE { int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len, struct cifs_ses *ses); void build_ntlmssp_negotiate_blob(unsigned char *pbuffer, struct cifs_ses *ses); -int build_ntlmssp_auth_blob(unsigned char *pbuffer, u16 *buflen, +int build_ntlmssp_auth_blob(unsigned char **pbuffer, u16 
*buflen, struct cifs_ses *ses, const struct nls_table *nls_cp); diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index af0ec2d5ad0e..538d9b55699a 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -364,19 +364,43 @@ void build_ntlmssp_negotiate_blob(unsigned char *pbuffer, sec_blob->DomainName.MaximumLength = 0; } -/* We do not malloc the blob, it is passed in pbuffer, because its - maximum possible size is fixed and small, making this approach cleaner. - This function returns the length of the data in the blob */ -int build_ntlmssp_auth_blob(unsigned char *pbuffer, +static int size_of_ntlmssp_blob(struct cifs_ses *ses) +{ + int sz = sizeof(AUTHENTICATE_MESSAGE) + ses->auth_key.len + - CIFS_SESS_KEY_SIZE + CIFS_CPHTXT_SIZE + 2; + + if (ses->domainName) + sz += 2 * strnlen(ses->domainName, CIFS_MAX_DOMAINNAME_LEN); + else + sz += 2; + + if (ses->user_name) + sz += 2 * strnlen(ses->user_name, CIFS_MAX_USERNAME_LEN); + else + sz += 2; + + return sz; +} + +int build_ntlmssp_auth_blob(unsigned char **pbuffer, u16 *buflen, struct cifs_ses *ses, const struct nls_table *nls_cp) { int rc; - AUTHENTICATE_MESSAGE *sec_blob = (AUTHENTICATE_MESSAGE *)pbuffer; + AUTHENTICATE_MESSAGE *sec_blob; __u32 flags; unsigned char *tmp; + rc = setup_ntlmv2_rsp(ses, nls_cp); + if (rc) { + cifs_dbg(VFS, "Error %d during NTLMSSP authentication\n", rc); + *buflen = 0; + goto setup_ntlmv2_ret; + } + *pbuffer = kmalloc(size_of_ntlmssp_blob(ses), GFP_KERNEL); + sec_blob = (AUTHENTICATE_MESSAGE *)*pbuffer; + memcpy(sec_blob->Signature, NTLMSSP_SIGNATURE, 8); sec_blob->MessageType = NtLmAuthenticate; @@ -391,7 +415,7 @@ int build_ntlmssp_auth_blob(unsigned char *pbuffer, flags |= NTLMSSP_NEGOTIATE_KEY_XCH; } - tmp = pbuffer + sizeof(AUTHENTICATE_MESSAGE); + tmp = *pbuffer + sizeof(AUTHENTICATE_MESSAGE); sec_blob->NegotiateFlags = cpu_to_le32(flags); sec_blob->LmChallengeResponse.BufferOffset = @@ -399,13 +423,9 @@ int build_ntlmssp_auth_blob(unsigned char *pbuffer, sec_blob->LmChallengeResponse.Length = 0; sec_blob->LmChallengeResponse.MaximumLength = 0; - sec_blob->NtChallengeResponse.BufferOffset = cpu_to_le32(tmp - pbuffer); + sec_blob->NtChallengeResponse.BufferOffset = + cpu_to_le32(tmp - *pbuffer); if (ses->user_name != NULL) { - rc = setup_ntlmv2_rsp(ses, nls_cp); - if (rc) { - cifs_dbg(VFS, "Error %d during NTLMSSP authentication\n", rc); - goto setup_ntlmv2_ret; - } memcpy(tmp, ses->auth_key.response + CIFS_SESS_KEY_SIZE, ses->auth_key.len - CIFS_SESS_KEY_SIZE); tmp += ses->auth_key.len - CIFS_SESS_KEY_SIZE; @@ -423,23 +443,23 @@ int build_ntlmssp_auth_blob(unsigned char *pbuffer, } if (ses->domainName == NULL) { - sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - pbuffer); + sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - *pbuffer); sec_blob->DomainName.Length = 0; sec_blob->DomainName.MaximumLength = 0; tmp += 2; } else { int len; len = cifs_strtoUTF16((__le16 *)tmp, ses->domainName, - CIFS_MAX_USERNAME_LEN, nls_cp); + CIFS_MAX_DOMAINNAME_LEN, nls_cp); len *= 2; /* unicode is 2 bytes each */ - sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - pbuffer); + sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - *pbuffer); sec_blob->DomainName.Length = cpu_to_le16(len); sec_blob->DomainName.MaximumLength = cpu_to_le16(len); tmp += len; } if (ses->user_name == NULL) { - sec_blob->UserName.BufferOffset = cpu_to_le32(tmp - pbuffer); + sec_blob->UserName.BufferOffset = cpu_to_le32(tmp - *pbuffer); sec_blob->UserName.Length = 0; sec_blob->UserName.MaximumLength = 0; tmp += 2; @@ -448,13 +468,13 @@ int 
build_ntlmssp_auth_blob(unsigned char *pbuffer, len = cifs_strtoUTF16((__le16 *)tmp, ses->user_name, CIFS_MAX_USERNAME_LEN, nls_cp); len *= 2; /* unicode is 2 bytes each */ - sec_blob->UserName.BufferOffset = cpu_to_le32(tmp - pbuffer); + sec_blob->UserName.BufferOffset = cpu_to_le32(tmp - *pbuffer); sec_blob->UserName.Length = cpu_to_le16(len); sec_blob->UserName.MaximumLength = cpu_to_le16(len); tmp += len; } - sec_blob->WorkstationName.BufferOffset = cpu_to_le32(tmp - pbuffer); + sec_blob->WorkstationName.BufferOffset = cpu_to_le32(tmp - *pbuffer); sec_blob->WorkstationName.Length = 0; sec_blob->WorkstationName.MaximumLength = 0; tmp += 2; @@ -463,19 +483,19 @@ int build_ntlmssp_auth_blob(unsigned char *pbuffer, (ses->ntlmssp->server_flags & NTLMSSP_NEGOTIATE_EXTENDED_SEC)) && !calc_seckey(ses)) { memcpy(tmp, ses->ntlmssp->ciphertext, CIFS_CPHTXT_SIZE); - sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - pbuffer); + sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - *pbuffer); sec_blob->SessionKey.Length = cpu_to_le16(CIFS_CPHTXT_SIZE); sec_blob->SessionKey.MaximumLength = cpu_to_le16(CIFS_CPHTXT_SIZE); tmp += CIFS_CPHTXT_SIZE; } else { - sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - pbuffer); + sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - *pbuffer); sec_blob->SessionKey.Length = 0; sec_blob->SessionKey.MaximumLength = 0; } + *buflen = tmp - *pbuffer; setup_ntlmv2_ret: - *buflen = tmp - pbuffer; return rc; } @@ -690,6 +710,8 @@ sess_auth_lanman(struct sess_data *sess_data) rc = calc_lanman_hash(ses->password, ses->server->cryptkey, ses->server->sec_mode & SECMODE_PW_ENCRYPT ? true : false, lnm_session_key); + if (rc) + goto out; memcpy(bcc_ptr, (char *)lnm_session_key, CIFS_AUTH_RESP_SIZE); bcc_ptr += CIFS_AUTH_RESP_SIZE; @@ -1266,7 +1288,7 @@ sess_auth_rawntlmssp_authenticate(struct sess_data *sess_data) struct cifs_ses *ses = sess_data->ses; __u16 bytes_remaining; char *bcc_ptr; - char *ntlmsspblob = NULL; + unsigned char *ntlmsspblob = NULL; u16 blob_len; cifs_dbg(FYI, "rawntlmssp session setup authenticate phase\n"); @@ -1279,19 +1301,7 @@ sess_auth_rawntlmssp_authenticate(struct sess_data *sess_data) /* Build security blob before we assemble the request */ pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; smb_buf = (struct smb_hdr *)pSMB; - /* - * 5 is an empirical value, large enough to hold - * authenticate message plus max 10 of av paris, - * domain, user, workstation names, flags, etc. 
- */ - ntlmsspblob = kzalloc(5*sizeof(struct _AUTHENTICATE_MESSAGE), - GFP_KERNEL); - if (!ntlmsspblob) { - rc = -ENOMEM; - goto out; - } - - rc = build_ntlmssp_auth_blob(ntlmsspblob, + rc = build_ntlmssp_auth_blob(&ntlmsspblob, &blob_len, ses, sess_data->nls_cp); if (rc) goto out_free_ntlmsspblob; diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 8f38e33d365b..29e06db5f187 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -588,7 +588,7 @@ SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses, u16 blob_length = 0; struct key *spnego_key = NULL; char *security_blob = NULL; - char *ntlmssp_blob = NULL; + unsigned char *ntlmssp_blob = NULL; bool use_spnego = false; /* else use raw ntlmssp */ cifs_dbg(FYI, "Session Setup\n"); @@ -713,13 +713,7 @@ ssetup_ntlmssp_authenticate: iov[1].iov_len = blob_length; } else if (phase == NtLmAuthenticate) { req->hdr.SessionId = ses->Suid; - ntlmssp_blob = kzalloc(sizeof(struct _NEGOTIATE_MESSAGE) + 500, - GFP_KERNEL); - if (ntlmssp_blob == NULL) { - rc = -ENOMEM; - goto ssetup_exit; - } - rc = build_ntlmssp_auth_blob(ntlmssp_blob, &blob_length, ses, + rc = build_ntlmssp_auth_blob(&ntlmssp_blob, &blob_length, ses, nls_cp); if (rc) { cifs_dbg(FYI, "build_ntlmssp_auth_blob failed %d\n", @@ -1818,6 +1812,33 @@ SMB2_echo(struct TCP_Server_Info *server) cifs_dbg(FYI, "In echo request\n"); + if (server->tcpStatus == CifsNeedNegotiate) { + struct list_head *tmp, *tmp2; + struct cifs_ses *ses; + struct cifs_tcon *tcon; + + cifs_dbg(FYI, "Need negotiate, reconnecting tcons\n"); + spin_lock(&cifs_tcp_ses_lock); + list_for_each(tmp, &server->smb_ses_list) { + ses = list_entry(tmp, struct cifs_ses, smb_ses_list); + list_for_each(tmp2, &ses->tcon_list) { + tcon = list_entry(tmp2, struct cifs_tcon, + tcon_list); + /* add check for persistent handle reconnect */ + if (tcon && tcon->need_reconnect) { + spin_unlock(&cifs_tcp_ses_lock); + rc = smb2_reconnect(SMB2_ECHO, tcon); + spin_lock(&cifs_tcp_ses_lock); + } + } + } + spin_unlock(&cifs_tcp_ses_lock); + } + + /* if no session, renegotiate failed above */ + if (server->tcpStatus == CifsNeedNegotiate) + return -EIO; + rc = small_smb2_init(SMB2_ECHO, NULL, (void **)&req); if (rc) return rc; diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c index c8b77aa24a1d..5e23f64c0804 100644 --- a/fs/cifs/xattr.c +++ b/fs/cifs/xattr.c @@ -39,8 +39,9 @@ enum { XATTR_USER, XATTR_CIFS_ACL, XATTR_ACL_ACCESS, XATTR_ACL_DEFAULT }; static int cifs_xattr_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) + struct dentry *dentry, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { int rc = -EOPNOTSUPP; unsigned int xid; @@ -99,12 +100,12 @@ static int cifs_xattr_set(const struct xattr_handler *handler, if (value && pTcon->ses->server->ops->set_acl) rc = pTcon->ses->server->ops->set_acl(pacl, - size, d_inode(dentry), + size, inode, full_path, CIFS_ACL_DACL); else rc = -EOPNOTSUPP; if (rc == 0) /* force revalidate of the inode */ - CIFS_I(d_inode(dentry))->time = 0; + CIFS_I(inode)->time = 0; kfree(pacl); } #endif /* CONFIG_CIFS_ACL */ diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c index f36a4040afb8..b0b9cda41928 100644 --- a/fs/coda/pioctl.c +++ b/fs/coda/pioctl.c @@ -35,7 +35,6 @@ const struct inode_operations coda_ioctl_inode_operations = { }; const struct file_operations coda_ioctl_operations = { - .owner = THIS_MODULE, .unlocked_ioctl = coda_pioctl, .llseek = noop_llseek, }; diff --git a/fs/compat.c 
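
Instead of the old empirical kzalloc(5 * sizeof(AUTHENTICATE_MESSAGE)) upper bound removed above, build_ntlmssp_auth_blob() now computes the exact blob size from the optional domain and user fields and allocates the buffer itself, which is why both sess.c and smb2pdu.c drop their own allocations. A rough userspace sketch of that size-then-fill pattern; the blob_hdr struct and magic value are hypothetical stand-ins for AUTHENTICATE_MESSAGE:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* hypothetical fixed header, standing in for AUTHENTICATE_MESSAGE */
struct blob_hdr { uint32_t magic; uint16_t user_len, domain_len; };

/* pass 1: compute the exact size, counting optional UTF-16 fields */
static size_t blob_size(const char *user, const char *domain)
{
        size_t sz = sizeof(struct blob_hdr);

        sz += user   ? 2 * strlen(user)   : 2;  /* 2 bytes/char, or empty slot */
        sz += domain ? 2 * strlen(domain) : 2;
        return sz;
}

/* pass 2: allocate exactly that much and fill it */
static unsigned char *build_blob(const char *user, const char *domain,
                                 size_t *len)
{
        unsigned char *buf;
        struct blob_hdr *hdr;

        *len = blob_size(user, domain);
        buf = calloc(1, *len);
        if (!buf)
                return NULL;
        hdr = (struct blob_hdr *)buf;
        hdr->magic = 0x4e544c4d;        /* illustrative only */
        hdr->user_len = user ? (uint16_t)(2 * strlen(user)) : 0;
        hdr->domain_len = domain ? (uint16_t)(2 * strlen(domain)) : 0;
        /* real code would append the UTF-16 payloads here */
        return buf;
}

int main(void)
{
        size_t len;
        unsigned char *b = build_blob("alice", "EXAMPLE", &len);

        printf("blob of %zu bytes\n", len);
        free(b);
        return 0;
}
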
b/fs/compat.c index 8754e9aa14ad..be6e48b0a46c 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -936,6 +936,8 @@ static int compat_filldir(struct dir_context *ctx, const char *name, int namlen, } dirent = buf->previous; if (dirent) { + if (signal_pending(current)) + return -EINTR; if (__put_user(offset, &dirent->d_off)) goto efault; } @@ -1020,6 +1022,8 @@ static int compat_filldir64(struct dir_context *ctx, const char *name, dirent = buf->previous; if (dirent) { + if (signal_pending(current)) + return -EINTR; if (__put_user_unaligned(offset, &dirent->d_off)) goto efault; } diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index bd01b92aad98..c1e9f29c924c 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -57,6 +57,7 @@ #include <linux/i2c-dev.h> #include <linux/atalk.h> #include <linux/gfp.h> +#include <linux/cec.h> #include "internal.h" @@ -1377,6 +1378,17 @@ COMPATIBLE_IOCTL(VIDEO_GET_NAVI) COMPATIBLE_IOCTL(VIDEO_SET_ATTRIBUTES) COMPATIBLE_IOCTL(VIDEO_GET_SIZE) COMPATIBLE_IOCTL(VIDEO_GET_FRAME_RATE) +/* cec */ +COMPATIBLE_IOCTL(CEC_ADAP_G_CAPS) +COMPATIBLE_IOCTL(CEC_ADAP_G_LOG_ADDRS) +COMPATIBLE_IOCTL(CEC_ADAP_S_LOG_ADDRS) +COMPATIBLE_IOCTL(CEC_ADAP_G_PHYS_ADDR) +COMPATIBLE_IOCTL(CEC_ADAP_S_PHYS_ADDR) +COMPATIBLE_IOCTL(CEC_G_MODE) +COMPATIBLE_IOCTL(CEC_S_MODE) +COMPATIBLE_IOCTL(CEC_TRANSMIT) +COMPATIBLE_IOCTL(CEC_RECEIVE) +COMPATIBLE_IOCTL(CEC_DQEVENT) /* joystick */ COMPATIBLE_IOCTL(JSIOCGVERSION) diff --git a/fs/configfs/file.c b/fs/configfs/file.c index 33b7ee34eda5..bbc1252a59f5 100644 --- a/fs/configfs/file.c +++ b/fs/configfs/file.c @@ -357,8 +357,6 @@ configfs_write_bin_file(struct file *file, const char __user *buf, len = simple_write_to_buffer(buffer->bin_buffer, buffer->bin_buffer_size, ppos, buf, count); - if (len > 0) - *ppos += len; out: mutex_unlock(&buffer->mutex); return len; diff --git a/fs/coredump.c b/fs/coredump.c index 492c2db25dc9..281b768000e6 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -413,7 +413,9 @@ static int coredump_wait(int exit_code, struct core_state *core_state) core_state->dumper.task = tsk; core_state->dumper.next = NULL; - down_write(&mm->mmap_sem); + if (down_write_killable(&mm->mmap_sem)) + return -EINTR; + if (!mm->core_state) core_waiters = zap_threads(tsk, mm, core_state, exit_code); up_write(&mm->mmap_sem); @@ -792,6 +794,7 @@ int dump_emit(struct coredump_params *cprm, const void *addr, int nr) return 0; file->f_pos = pos; cprm->written += n; + cprm->pos += n; nr -= n; } return 1; @@ -806,6 +809,7 @@ int dump_skip(struct coredump_params *cprm, size_t nr) if (dump_interrupted() || file->f_op->llseek(file, nr, SEEK_CUR) < 0) return 0; + cprm->pos += nr; return 1; } else { while (nr > PAGE_SIZE) { @@ -820,7 +824,7 @@ EXPORT_SYMBOL(dump_skip); int dump_align(struct coredump_params *cprm, int align) { - unsigned mod = cprm->file->f_pos & (align - 1); + unsigned mod = cprm->pos & (align - 1); if (align & (align - 1)) return 0; return mod ? 
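
The compat_filldir()/compat_filldir64() hunks above let a pending signal interrupt a long getdents walk by returning -EINTR between entries. A simplified userspace analogue of that per-iteration cancellation check, using a SIGINT flag in place of signal_pending(current):

#include <errno.h>
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

static volatile sig_atomic_t got_signal;

static void on_sigint(int sig) { (void)sig; got_signal = 1; }

/* stand-in for one filldir callback: abort the walk once a signal arrived */
static int fill_one(int i)
{
        if (got_signal)
                return -EINTR;  /* caller unwinds and returns to userspace */
        /* ... emit directory entry i ... */
        usleep(1000);
        return 0;
}

int main(void)
{
        int i, rc;

        signal(SIGINT, on_sigint);
        for (i = 0; i < 100000; i++) {
                rc = fill_one(i);
                if (rc) {
                        fprintf(stderr, "aborted at %d: %d\n", i, rc);
                        return 1;
                }
        }
        return 0;
}
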
dump_skip(cprm, align - mod) : 1; diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index 2fc8c43ce531..c502c116924c 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -318,6 +318,7 @@ int fscrypt_zeroout_range(struct inode *inode, pgoff_t lblk, bio->bi_bdev = inode->i_sb->s_bdev; bio->bi_iter.bi_sector = pblk << (inode->i_sb->s_blocksize_bits - 9); + bio_set_op_attrs(bio, REQ_OP_WRITE, 0); ret = bio_add_page(bio, ciphertext_page, inode->i_sb->s_blocksize, 0); if (ret != inode->i_sb->s_blocksize) { @@ -327,7 +328,7 @@ int fscrypt_zeroout_range(struct inode *inode, pgoff_t lblk, err = -EIO; goto errout; } - err = submit_bio_wait(WRITE, bio); + err = submit_bio_wait(bio); if ((err == 0) && bio->bi_error) err = -EIO; bio_put(bio); diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c index 06f5aa478bf2..1ac263eddc4e 100644 --- a/fs/crypto/keyinfo.c +++ b/fs/crypto/keyinfo.c @@ -78,6 +78,67 @@ out: return res; } +static int validate_user_key(struct fscrypt_info *crypt_info, + struct fscrypt_context *ctx, u8 *raw_key, + u8 *prefix, int prefix_size) +{ + u8 *full_key_descriptor; + struct key *keyring_key; + struct fscrypt_key *master_key; + const struct user_key_payload *ukp; + int full_key_len = prefix_size + (FS_KEY_DESCRIPTOR_SIZE * 2) + 1; + int res; + + full_key_descriptor = kmalloc(full_key_len, GFP_NOFS); + if (!full_key_descriptor) + return -ENOMEM; + + memcpy(full_key_descriptor, prefix, prefix_size); + sprintf(full_key_descriptor + prefix_size, + "%*phN", FS_KEY_DESCRIPTOR_SIZE, + ctx->master_key_descriptor); + full_key_descriptor[full_key_len - 1] = '\0'; + keyring_key = request_key(&key_type_logon, full_key_descriptor, NULL); + kfree(full_key_descriptor); + if (IS_ERR(keyring_key)) + return PTR_ERR(keyring_key); + + if (keyring_key->type != &key_type_logon) { + printk_once(KERN_WARNING + "%s: key type must be logon\n", __func__); + res = -ENOKEY; + goto out; + } + down_read(&keyring_key->sem); + ukp = user_key_payload(keyring_key); + if (ukp->datalen != sizeof(struct fscrypt_key)) { + res = -EINVAL; + up_read(&keyring_key->sem); + goto out; + } + master_key = (struct fscrypt_key *)ukp->data; + BUILD_BUG_ON(FS_AES_128_ECB_KEY_SIZE != FS_KEY_DERIVATION_NONCE_SIZE); + + if (master_key->size != FS_AES_256_XTS_KEY_SIZE) { + printk_once(KERN_WARNING + "%s: key size incorrect: %d\n", + __func__, master_key->size); + res = -ENOKEY; + up_read(&keyring_key->sem); + goto out; + } + res = derive_key_aes(ctx->nonce, master_key->raw, raw_key); + up_read(&keyring_key->sem); + if (res) + goto out; + + crypt_info->ci_keyring_key = keyring_key; + return 0; +out: + key_put(keyring_key); + return res; +} + static void put_crypt_info(struct fscrypt_info *ci) { if (!ci) @@ -91,12 +152,7 @@ static void put_crypt_info(struct fscrypt_info *ci) int get_crypt_info(struct inode *inode) { struct fscrypt_info *crypt_info; - u8 full_key_descriptor[FS_KEY_DESC_PREFIX_SIZE + - (FS_KEY_DESCRIPTOR_SIZE * 2) + 1]; - struct key *keyring_key = NULL; - struct fscrypt_key *master_key; struct fscrypt_context ctx; - const struct user_key_payload *ukp; struct crypto_skcipher *ctfm; const char *cipher_str; u8 raw_key[FS_MAX_KEY_SIZE]; @@ -167,48 +223,24 @@ retry: memset(raw_key, 0x42, FS_AES_256_XTS_KEY_SIZE); goto got_key; } - memcpy(full_key_descriptor, FS_KEY_DESC_PREFIX, - FS_KEY_DESC_PREFIX_SIZE); - sprintf(full_key_descriptor + FS_KEY_DESC_PREFIX_SIZE, - "%*phN", FS_KEY_DESCRIPTOR_SIZE, - ctx.master_key_descriptor); - full_key_descriptor[FS_KEY_DESC_PREFIX_SIZE + - (2 * FS_KEY_DESCRIPTOR_SIZE)] = '\0'; - 
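
Returning to the fs/coredump.c change above: dump_emit() and dump_skip() now maintain cprm->pos, so dump_align() can pad from a position the dumper tracks itself rather than file->f_pos, which is meaningless when the core goes to a pipe. A small sketch of the same bookkeeping, assuming a hypothetical dump_params struct:

#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

struct dump_params { int fd; uint64_t pos; };   /* pos mirrors cprm->pos */

static int dump_emit(struct dump_params *p, const void *buf, size_t n)
{
        if (write(p->fd, buf, n) != (ssize_t)n)
                return 0;
        p->pos += n;            /* track the logical position ourselves */
        return 1;
}

static int dump_skip(struct dump_params *p, size_t n)
{
        static const char zeros[64];

        while (n) {             /* pipes cannot seek: pad with zeros */
                size_t chunk = n < sizeof(zeros) ? n : sizeof(zeros);

                if (!dump_emit(p, zeros, chunk))
                        return 0;
                n -= chunk;
        }
        return 1;
}

/* pad to a power-of-two boundary using the tracked position */
static int dump_align(struct dump_params *p, unsigned align)
{
        unsigned mod = p->pos & (align - 1);

        return mod ? dump_skip(p, align - mod) : 1;
}

int main(void)
{
        struct dump_params p = { STDOUT_FILENO, 0 };

        dump_emit(&p, "hdr", 3);
        dump_align(&p, 8);
        dump_emit(&p, "body", 4);
        return 0;
}
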
keyring_key = request_key(&key_type_logon, full_key_descriptor, NULL); - if (IS_ERR(keyring_key)) { - res = PTR_ERR(keyring_key); - keyring_key = NULL; - goto out; - } - crypt_info->ci_keyring_key = keyring_key; - if (keyring_key->type != &key_type_logon) { - printk_once(KERN_WARNING - "%s: key type must be logon\n", __func__); - res = -ENOKEY; - goto out; - } - down_read(&keyring_key->sem); - ukp = user_key_payload(keyring_key); - if (ukp->datalen != sizeof(struct fscrypt_key)) { - res = -EINVAL; - up_read(&keyring_key->sem); - goto out; - } - master_key = (struct fscrypt_key *)ukp->data; - BUILD_BUG_ON(FS_AES_128_ECB_KEY_SIZE != FS_KEY_DERIVATION_NONCE_SIZE); - if (master_key->size != FS_AES_256_XTS_KEY_SIZE) { - printk_once(KERN_WARNING - "%s: key size incorrect: %d\n", - __func__, master_key->size); - res = -ENOKEY; - up_read(&keyring_key->sem); + res = validate_user_key(crypt_info, &ctx, raw_key, + FS_KEY_DESC_PREFIX, FS_KEY_DESC_PREFIX_SIZE); + if (res && inode->i_sb->s_cop->key_prefix) { + u8 *prefix = NULL; + int prefix_size, res2; + + prefix_size = inode->i_sb->s_cop->key_prefix(inode, &prefix); + res2 = validate_user_key(crypt_info, &ctx, raw_key, + prefix, prefix_size); + if (res2) { + if (res2 == -ENOKEY) + res = -ENOKEY; + goto out; + } + } else if (res) { goto out; } - res = derive_key_aes(ctx.nonce, master_key->raw, raw_key); - up_read(&keyring_key->sem); - if (res) - goto out; got_key: ctfm = crypto_alloc_skcipher(cipher_str, 0, 0); if (!ctfm || IS_ERR(ctfm)) { @@ -32,6 +32,44 @@ #include <linux/pfn_t.h> #include <linux/sizes.h> +/* + * We use lowest available bit in exceptional entry for locking, other two + * bits to determine entry type. In total 3 special bits. + */ +#define RADIX_DAX_SHIFT (RADIX_TREE_EXCEPTIONAL_SHIFT + 3) +#define RADIX_DAX_PTE (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1)) +#define RADIX_DAX_PMD (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2)) +#define RADIX_DAX_TYPE_MASK (RADIX_DAX_PTE | RADIX_DAX_PMD) +#define RADIX_DAX_TYPE(entry) ((unsigned long)entry & RADIX_DAX_TYPE_MASK) +#define RADIX_DAX_SECTOR(entry) (((unsigned long)entry >> RADIX_DAX_SHIFT)) +#define RADIX_DAX_ENTRY(sector, pmd) ((void *)((unsigned long)sector << \ + RADIX_DAX_SHIFT | (pmd ? RADIX_DAX_PMD : RADIX_DAX_PTE) | \ + RADIX_TREE_EXCEPTIONAL_ENTRY)) + +/* We choose 4096 entries - same as per-zone page wait tables */ +#define DAX_WAIT_TABLE_BITS 12 +#define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS) + +wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES]; + +static int __init init_dax_wait_table(void) +{ + int i; + + for (i = 0; i < DAX_WAIT_TABLE_ENTRIES; i++) + init_waitqueue_head(wait_table + i); + return 0; +} +fs_initcall(init_dax_wait_table); + +static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping, + pgoff_t index) +{ + unsigned long hash = hash_long((unsigned long)mapping ^ index, + DAX_WAIT_TABLE_BITS); + return wait_table + hash; +} + static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax) { struct request_queue *q = bdev->bd_queue; @@ -78,50 +116,6 @@ struct page *read_dax_sector(struct block_device *bdev, sector_t n) return page; } -/* - * dax_clear_sectors() is called from within transaction context from XFS, - * and hence this means the stack from this point must follow GFP_NOFS - * semantics for all operations. 
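
The new fs/dax.c wait table above parks waiters on a fixed array of 4096 wait queues indexed by a hash of (mapping, index), the same trick the comment credits to the per-zone page wait tables. A sketch of just the bucket selection; the multiplicative hash_long() here is a stand-in for the kernel's and assumes 64-bit arithmetic:

#include <stdint.h>
#include <stdio.h>

#define WAIT_TABLE_BITS 12
#define WAIT_TABLE_ENTRIES (1u << WAIT_TABLE_BITS)

struct waitq { int dummy; };    /* stand-in for wait_queue_head_t */
static struct waitq wait_table[WAIT_TABLE_ENTRIES];

/* stand-in for the kernel's hash_long(): multiply, keep the top bits */
static unsigned hash_long(uint64_t val, unsigned bits)
{
        return (unsigned)((val * 0x9e3779b97f4a7c15ULL) >> (64 - bits));
}

static struct waitq *entry_waitqueue(const void *mapping, unsigned long index)
{
        uint64_t key = (uint64_t)(uintptr_t)mapping ^ index;

        return wait_table + hash_long(key, WAIT_TABLE_BITS);
}

int main(void)
{
        int mapping_a, mapping_b;   /* any distinct addresses will do */

        printf("bucket %ld\n", (long)(entry_waitqueue(&mapping_a, 7) - wait_table));
        printf("bucket %ld\n", (long)(entry_waitqueue(&mapping_b, 7) - wait_table));
        return 0;
}
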
- */ -int dax_clear_sectors(struct block_device *bdev, sector_t _sector, long _size) -{ - struct blk_dax_ctl dax = { - .sector = _sector, - .size = _size, - }; - - might_sleep(); - do { - long count, sz; - - count = dax_map_atomic(bdev, &dax); - if (count < 0) - return count; - sz = min_t(long, count, SZ_128K); - clear_pmem(dax.addr, sz); - dax.size -= sz; - dax.sector += sz / 512; - dax_unmap_atomic(bdev, &dax); - cond_resched(); - } while (dax.size); - - wmb_pmem(); - return 0; -} -EXPORT_SYMBOL_GPL(dax_clear_sectors); - -/* the clear_pmem() calls are ordered by a wmb_pmem() in the caller */ -static void dax_new_buf(void __pmem *addr, unsigned size, unsigned first, - loff_t pos, loff_t end) -{ - loff_t final = end - pos + first; /* The final byte of the buffer */ - - if (first > 0) - clear_pmem(addr, first); - if (final < size) - clear_pmem(addr + final, size - final); -} - static bool buffer_written(struct buffer_head *bh) { return buffer_mapped(bh) && !buffer_unwritten(bh); @@ -160,6 +154,9 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter, struct blk_dax_ctl dax = { .addr = (void __pmem *) ERR_PTR(-EIO), }; + unsigned blkbits = inode->i_blkbits; + sector_t file_blks = (i_size_read(inode) + (1 << blkbits) - 1) + >> blkbits; if (rw == READ) end = min(end, i_size_read(inode)); @@ -167,7 +164,6 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter, while (pos < end) { size_t len; if (pos == max) { - unsigned blkbits = inode->i_blkbits; long page = pos >> PAGE_SHIFT; sector_t block = page << (PAGE_SHIFT - blkbits); unsigned first = pos - (block << blkbits); @@ -183,6 +179,13 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter, bh->b_size = 1 << blkbits; bh_max = pos - first + bh->b_size; bdev = bh->b_bdev; + /* + * We allow uninitialized buffers for writes + * beyond EOF as those cannot race with faults + */ + WARN_ON_ONCE( + (buffer_new(bh) && block < file_blks) || + (rw == WRITE && buffer_unwritten(bh))); } else { unsigned done = bh->b_size - (bh_max - (pos - first)); @@ -202,15 +205,15 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter, rc = map_len; break; } - if (buffer_unwritten(bh) || buffer_new(bh)) { - dax_new_buf(dax.addr, map_len, first, - pos, end); - need_wmb = true; - } dax.addr += first; size = map_len - first; } - max = min(pos + size, end); + /* + * pos + size is one past the last offset for IO, + * so pos + size can overflow loff_t at extreme offsets. + * Cast to u64 to catch this and get the true minimum. 
+ */ + max = min_t(u64, pos + size, end); } if (iov_iter_rw(iter) == WRITE) { @@ -267,15 +270,8 @@ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode, memset(&bh, 0, sizeof(bh)); bh.b_bdev = inode->i_sb->s_bdev; - if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ) { - struct address_space *mapping = inode->i_mapping; + if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ) inode_lock(inode); - retval = filemap_write_and_wait_range(mapping, pos, end - 1); - if (retval) { - inode_unlock(inode); - goto out; - } - } /* Protects against truncate */ if (!(flags & DIO_SKIP_DIO_COUNT)) @@ -296,12 +292,268 @@ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode, if (!(flags & DIO_SKIP_DIO_COUNT)) inode_dio_end(inode); - out: return retval; } EXPORT_SYMBOL_GPL(dax_do_io); /* + * DAX radix tree locking + */ +struct exceptional_entry_key { + struct address_space *mapping; + unsigned long index; +}; + +struct wait_exceptional_entry_queue { + wait_queue_t wait; + struct exceptional_entry_key key; +}; + +static int wake_exceptional_entry_func(wait_queue_t *wait, unsigned int mode, + int sync, void *keyp) +{ + struct exceptional_entry_key *key = keyp; + struct wait_exceptional_entry_queue *ewait = + container_of(wait, struct wait_exceptional_entry_queue, wait); + + if (key->mapping != ewait->key.mapping || + key->index != ewait->key.index) + return 0; + return autoremove_wake_function(wait, mode, sync, NULL); +} + +/* + * Check whether the given slot is locked. The function must be called with + * mapping->tree_lock held + */ +static inline int slot_locked(struct address_space *mapping, void **slot) +{ + unsigned long entry = (unsigned long) + radix_tree_deref_slot_protected(slot, &mapping->tree_lock); + return entry & RADIX_DAX_ENTRY_LOCK; +} + +/* + * Mark the given slot is locked. The function must be called with + * mapping->tree_lock held + */ +static inline void *lock_slot(struct address_space *mapping, void **slot) +{ + unsigned long entry = (unsigned long) + radix_tree_deref_slot_protected(slot, &mapping->tree_lock); + + entry |= RADIX_DAX_ENTRY_LOCK; + radix_tree_replace_slot(slot, (void *)entry); + return (void *)entry; +} + +/* + * Mark the given slot is unlocked. The function must be called with + * mapping->tree_lock held + */ +static inline void *unlock_slot(struct address_space *mapping, void **slot) +{ + unsigned long entry = (unsigned long) + radix_tree_deref_slot_protected(slot, &mapping->tree_lock); + + entry &= ~(unsigned long)RADIX_DAX_ENTRY_LOCK; + radix_tree_replace_slot(slot, (void *)entry); + return (void *)entry; +} + +/* + * Lookup entry in radix tree, wait for it to become unlocked if it is + * exceptional entry and return it. The caller must call + * put_unlocked_mapping_entry() when he decided not to lock the entry or + * put_locked_mapping_entry() when he locked the entry and now wants to + * unlock it. + * + * The function must be called with mapping->tree_lock held. 
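
The slot_locked()/lock_slot()/unlock_slot() helpers above treat the lowest free bit of the radix tree entry word as a lock and the remaining bits as payload. A self-contained sketch of that bit packing; the 3-bit shift mirrors RADIX_DAX_SHIFT, but the exact layout here is illustrative:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define ENTRY_LOCK 0x1UL        /* lowest bit: lock; upper bits: payload */
#define ENTRY_SHIFT 3           /* low bits reserved for flags */

static int slot_locked(uintptr_t entry)   { return entry & ENTRY_LOCK; }
static uintptr_t lock_slot(uintptr_t e)   { return e | ENTRY_LOCK; }
static uintptr_t unlock_slot(uintptr_t e) { return e & ~ENTRY_LOCK; }

/* pack a sector into the upper bits, keeping the low bits for flags */
static uintptr_t make_entry(uintptr_t sector) { return sector << ENTRY_SHIFT; }
static uintptr_t entry_sector(uintptr_t e)    { return e >> ENTRY_SHIFT; }

int main(void)
{
        uintptr_t e = make_entry(1234);

        assert(!slot_locked(e));
        e = lock_slot(e);
        assert(slot_locked(e) && entry_sector(e) == 1234);
        e = unlock_slot(e);
        assert(!slot_locked(e));
        printf("sector %lu\n", (unsigned long)entry_sector(e));
        return 0;
}
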
+ */ +static void *get_unlocked_mapping_entry(struct address_space *mapping, + pgoff_t index, void ***slotp) +{ + void *ret, **slot; + struct wait_exceptional_entry_queue ewait; + wait_queue_head_t *wq = dax_entry_waitqueue(mapping, index); + + init_wait(&ewait.wait); + ewait.wait.func = wake_exceptional_entry_func; + ewait.key.mapping = mapping; + ewait.key.index = index; + + for (;;) { + ret = __radix_tree_lookup(&mapping->page_tree, index, NULL, + &slot); + if (!ret || !radix_tree_exceptional_entry(ret) || + !slot_locked(mapping, slot)) { + if (slotp) + *slotp = slot; + return ret; + } + prepare_to_wait_exclusive(wq, &ewait.wait, + TASK_UNINTERRUPTIBLE); + spin_unlock_irq(&mapping->tree_lock); + schedule(); + finish_wait(wq, &ewait.wait); + spin_lock_irq(&mapping->tree_lock); + } +} + +/* + * Find radix tree entry at given index. If it points to a page, return with + * the page locked. If it points to the exceptional entry, return with the + * radix tree entry locked. If the radix tree doesn't contain given index, + * create empty exceptional entry for the index and return with it locked. + * + * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For + * persistent memory the benefit is doubtful. We can add that later if we can + * show it helps. + */ +static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index) +{ + void *ret, **slot; + +restart: + spin_lock_irq(&mapping->tree_lock); + ret = get_unlocked_mapping_entry(mapping, index, &slot); + /* No entry for given index? Make sure radix tree is big enough. */ + if (!ret) { + int err; + + spin_unlock_irq(&mapping->tree_lock); + err = radix_tree_preload( + mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM); + if (err) + return ERR_PTR(err); + ret = (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | + RADIX_DAX_ENTRY_LOCK); + spin_lock_irq(&mapping->tree_lock); + err = radix_tree_insert(&mapping->page_tree, index, ret); + radix_tree_preload_end(); + if (err) { + spin_unlock_irq(&mapping->tree_lock); + /* Someone already created the entry? */ + if (err == -EEXIST) + goto restart; + return ERR_PTR(err); + } + /* Good, we have inserted empty locked entry into the tree. */ + mapping->nrexceptional++; + spin_unlock_irq(&mapping->tree_lock); + return ret; + } + /* Normal page in radix tree? */ + if (!radix_tree_exceptional_entry(ret)) { + struct page *page = ret; + + get_page(page); + spin_unlock_irq(&mapping->tree_lock); + lock_page(page); + /* Page got truncated? Retry... */ + if (unlikely(page->mapping != mapping)) { + unlock_page(page); + put_page(page); + goto restart; + } + return page; + } + ret = lock_slot(mapping, slot); + spin_unlock_irq(&mapping->tree_lock); + return ret; +} + +void dax_wake_mapping_entry_waiter(struct address_space *mapping, + pgoff_t index, bool wake_all) +{ + wait_queue_head_t *wq = dax_entry_waitqueue(mapping, index); + + /* + * Checking for locked entry and prepare_to_wait_exclusive() happens + * under mapping->tree_lock, ditto for entry handling in our callers. + * So at this point all tasks that could have seen our entry locked + * must be in the waitqueue and the following check will see them. + */ + if (waitqueue_active(wq)) { + struct exceptional_entry_key key; + + key.mapping = mapping; + key.index = index; + __wake_up(wq, TASK_NORMAL, wake_all ? 
0 : 1, &key); + } +} + +void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index) +{ + void *ret, **slot; + + spin_lock_irq(&mapping->tree_lock); + ret = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot); + if (WARN_ON_ONCE(!ret || !radix_tree_exceptional_entry(ret) || + !slot_locked(mapping, slot))) { + spin_unlock_irq(&mapping->tree_lock); + return; + } + unlock_slot(mapping, slot); + spin_unlock_irq(&mapping->tree_lock); + dax_wake_mapping_entry_waiter(mapping, index, false); +} + +static void put_locked_mapping_entry(struct address_space *mapping, + pgoff_t index, void *entry) +{ + if (!radix_tree_exceptional_entry(entry)) { + unlock_page(entry); + put_page(entry); + } else { + dax_unlock_mapping_entry(mapping, index); + } +} + +/* + * Called when we are done with radix tree entry we looked up via + * get_unlocked_mapping_entry() and which we didn't lock in the end. + */ +static void put_unlocked_mapping_entry(struct address_space *mapping, + pgoff_t index, void *entry) +{ + if (!radix_tree_exceptional_entry(entry)) + return; + + /* We have to wake up next waiter for the radix tree entry lock */ + dax_wake_mapping_entry_waiter(mapping, index, false); +} + +/* + * Delete exceptional DAX entry at @index from @mapping. Wait for radix tree + * entry to get unlocked before deleting it. + */ +int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index) +{ + void *entry; + + spin_lock_irq(&mapping->tree_lock); + entry = get_unlocked_mapping_entry(mapping, index, NULL); + /* + * This gets called from truncate / punch_hole path. As such, the caller + * must hold locks protecting against concurrent modifications of the + * radix tree (usually fs-private i_mmap_sem for writing). Since the + * caller has seen exceptional entry for this index, we better find it + * at that index as well... + */ + if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry))) { + spin_unlock_irq(&mapping->tree_lock); + return 0; + } + radix_tree_delete(&mapping->page_tree, index); + mapping->nrexceptional--; + spin_unlock_irq(&mapping->tree_lock); + dax_wake_mapping_entry_waiter(mapping, index, true); + + return 1; +} + +/* * The user has performed a load from a hole in the file. Allocating * a new page in the file would cause excessive storage usage for * workloads with sparse files. We allocate a page cache page instead. @@ -309,24 +561,24 @@ EXPORT_SYMBOL_GPL(dax_do_io); * otherwise it will simply fall out of the page cache under memory * pressure without ever having been dirtied. */ -static int dax_load_hole(struct address_space *mapping, struct page *page, - struct vm_fault *vmf) +static int dax_load_hole(struct address_space *mapping, void *entry, + struct vm_fault *vmf) { - unsigned long size; - struct inode *inode = mapping->host; - if (!page) - page = find_or_create_page(mapping, vmf->pgoff, - GFP_KERNEL | __GFP_ZERO); - if (!page) - return VM_FAULT_OOM; - /* Recheck i_size under page lock to avoid truncate race */ - size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; - if (vmf->pgoff >= size) { - unlock_page(page); - put_page(page); - return VM_FAULT_SIGBUS; + struct page *page; + + /* Hole page already exists? Return it... 
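
In the hunks above, get_unlocked_mapping_entry() sleeps until the entry's lock bit clears, and dax_wake_mapping_entry_waiter() wakes waiters one at a time, filtering by (mapping, index) because wait-queue buckets are shared. A loose pthread approximation of that lock/unlock handoff; the mutex and condition variable stand in for tree_lock and the keyed wait queue:

#include <pthread.h>
#include <stdio.h>

/* shared bucket: one mutex+condvar guards many (mapping,index) entries */
static pthread_mutex_t bucket_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  bucket_cond = PTHREAD_COND_INITIALIZER;
static int entry_locked;        /* stand-in for the entry's lock bit */

static void lock_entry(void)
{
        pthread_mutex_lock(&bucket_lock);
        while (entry_locked)                    /* wait until the bit clears */
                pthread_cond_wait(&bucket_cond, &bucket_lock);
        entry_locked = 1;
        pthread_mutex_unlock(&bucket_lock);
}

static void unlock_entry(void)
{
        pthread_mutex_lock(&bucket_lock);
        entry_locked = 0;
        pthread_mutex_unlock(&bucket_lock);
        pthread_cond_signal(&bucket_cond);      /* wake one, like wake_all=false */
}

static void *worker(void *arg)
{
        (void)arg;
        lock_entry();
        puts("worker holds entry");
        unlock_entry();
        return NULL;
}

int main(void)
{
        pthread_t t;

        lock_entry();
        pthread_create(&t, NULL, worker, NULL);
        unlock_entry();                         /* hand off to the waiter */
        pthread_join(t, NULL);
        return 0;
}
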
*/ + if (!radix_tree_exceptional_entry(entry)) { + vmf->page = entry; + return VM_FAULT_LOCKED; } + /* This will replace locked radix tree entry with a hole page */ + page = find_or_create_page(mapping, vmf->pgoff, + vmf->gfp_mask | __GFP_ZERO); + if (!page) { + put_locked_mapping_entry(mapping, vmf->pgoff, entry); + return VM_FAULT_OOM; + } vmf->page = page; return VM_FAULT_LOCKED; } @@ -350,77 +602,72 @@ static int copy_user_bh(struct page *to, struct inode *inode, return 0; } -#define NO_SECTOR -1 #define DAX_PMD_INDEX(page_index) (page_index & (PMD_MASK >> PAGE_SHIFT)) -static int dax_radix_entry(struct address_space *mapping, pgoff_t index, - sector_t sector, bool pmd_entry, bool dirty) +static void *dax_insert_mapping_entry(struct address_space *mapping, + struct vm_fault *vmf, + void *entry, sector_t sector) { struct radix_tree_root *page_tree = &mapping->page_tree; - pgoff_t pmd_index = DAX_PMD_INDEX(index); - int type, error = 0; - void *entry; + int error = 0; + bool hole_fill = false; + void *new_entry; + pgoff_t index = vmf->pgoff; - WARN_ON_ONCE(pmd_entry && !dirty); - if (dirty) + if (vmf->flags & FAULT_FLAG_WRITE) __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); - spin_lock_irq(&mapping->tree_lock); - - entry = radix_tree_lookup(page_tree, pmd_index); - if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD) { - index = pmd_index; - goto dirty; + /* Replacing hole page with block mapping? */ + if (!radix_tree_exceptional_entry(entry)) { + hole_fill = true; + /* + * Unmap the page now before we remove it from page cache below. + * The page is locked so it cannot be faulted in again. + */ + unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT, + PAGE_SIZE, 0); + error = radix_tree_preload(vmf->gfp_mask & ~__GFP_HIGHMEM); + if (error) + return ERR_PTR(error); } - entry = radix_tree_lookup(page_tree, index); - if (entry) { - type = RADIX_DAX_TYPE(entry); - if (WARN_ON_ONCE(type != RADIX_DAX_PTE && - type != RADIX_DAX_PMD)) { - error = -EIO; + spin_lock_irq(&mapping->tree_lock); + new_entry = (void *)((unsigned long)RADIX_DAX_ENTRY(sector, false) | + RADIX_DAX_ENTRY_LOCK); + if (hole_fill) { + __delete_from_page_cache(entry, NULL); + /* Drop pagecache reference */ + put_page(entry); + error = radix_tree_insert(page_tree, index, new_entry); + if (error) { + new_entry = ERR_PTR(error); goto unlock; } + mapping->nrexceptional++; + } else { + void **slot; + void *ret; - if (!pmd_entry || type == RADIX_DAX_PMD) - goto dirty; - - /* - * We only insert dirty PMD entries into the radix tree. This - * means we don't need to worry about removing a dirty PTE - * entry and inserting a clean PMD entry, thus reducing the - * range we would flush with a follow-up fsync/msync call. - */ - radix_tree_delete(&mapping->page_tree, index); - mapping->nrexceptional--; - } - - if (sector == NO_SECTOR) { - /* - * This can happen during correct operation if our pfn_mkwrite - * fault raced against a hole punch operation. If this - * happens the pte that was hole punched will have been - * unmapped and the radix tree entry will have been removed by - * the time we are called, but the call will still happen. We - * will return all the way up to wp_pfn_shared(), where the - * pte_same() check will fail, eventually causing page fault - * to be retried by the CPU. 
- */ - goto unlock; + ret = __radix_tree_lookup(page_tree, index, NULL, &slot); + WARN_ON_ONCE(ret != entry); + radix_tree_replace_slot(slot, new_entry); } - - error = radix_tree_insert(page_tree, index, - RADIX_DAX_ENTRY(sector, pmd_entry)); - if (error) - goto unlock; - - mapping->nrexceptional++; - dirty: - if (dirty) + if (vmf->flags & FAULT_FLAG_WRITE) radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY); unlock: spin_unlock_irq(&mapping->tree_lock); - return error; + if (hole_fill) { + radix_tree_preload_end(); + /* + * We don't need hole page anymore, it has been replaced with + * locked radix tree entry now. + */ + if (mapping->a_ops->freepage) + mapping->a_ops->freepage(entry); + unlock_page(entry); + put_page(entry); + } + return new_entry; } static int dax_writeback_one(struct block_device *bdev, @@ -546,81 +793,48 @@ int dax_writeback_mapping_range(struct address_space *mapping, } EXPORT_SYMBOL_GPL(dax_writeback_mapping_range); -static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh, +static int dax_insert_mapping(struct address_space *mapping, + struct buffer_head *bh, void **entryp, struct vm_area_struct *vma, struct vm_fault *vmf) { unsigned long vaddr = (unsigned long)vmf->virtual_address; - struct address_space *mapping = inode->i_mapping; struct block_device *bdev = bh->b_bdev; struct blk_dax_ctl dax = { - .sector = to_sector(bh, inode), + .sector = to_sector(bh, mapping->host), .size = bh->b_size, }; - pgoff_t size; - int error; - - i_mmap_lock_read(mapping); + void *ret; + void *entry = *entryp; - /* - * Check truncate didn't happen while we were allocating a block. - * If it did, this block may or may not be still allocated to the - * file. We can't tell the filesystem to free it because we can't - * take i_mutex here. In the worst case, the file still has blocks - * allocated past the end of the file. - */ - size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; - if (unlikely(vmf->pgoff >= size)) { - error = -EIO; - goto out; - } - - if (dax_map_atomic(bdev, &dax) < 0) { - error = PTR_ERR(dax.addr); - goto out; - } - - if (buffer_unwritten(bh) || buffer_new(bh)) { - clear_pmem(dax.addr, PAGE_SIZE); - wmb_pmem(); - } + if (dax_map_atomic(bdev, &dax) < 0) + return PTR_ERR(dax.addr); dax_unmap_atomic(bdev, &dax); - error = dax_radix_entry(mapping, vmf->pgoff, dax.sector, false, - vmf->flags & FAULT_FLAG_WRITE); - if (error) - goto out; - - error = vm_insert_mixed(vma, vaddr, dax.pfn); + ret = dax_insert_mapping_entry(mapping, vmf, entry, dax.sector); + if (IS_ERR(ret)) + return PTR_ERR(ret); + *entryp = ret; - out: - i_mmap_unlock_read(mapping); - - return error; + return vm_insert_mixed(vma, vaddr, dax.pfn); } /** - * __dax_fault - handle a page fault on a DAX file + * dax_fault - handle a page fault on a DAX file * @vma: The virtual memory area where the fault occurred * @vmf: The description of the fault * @get_block: The filesystem method used to translate file offsets to blocks - * @complete_unwritten: The filesystem method used to convert unwritten blocks - * to written so the data written to them is exposed. This is required for - * required by write faults for filesystems that will return unwritten - * extent mappings from @get_block, but it is optional for reads as - * dax_insert_mapping() will always zero unwritten blocks. If the fs does - * not support unwritten extents, the it should pass NULL. * * When a page fault occurs, filesystems may call this helper in their - * fault handler for DAX files. 
__dax_fault() assumes the caller has done all + * fault handler for DAX files. dax_fault() assumes the caller has done all * the necessary locking for the page fault to proceed successfully. */ -int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, - get_block_t get_block, dax_iodone_t complete_unwritten) +int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, + get_block_t get_block) { struct file *file = vma->vm_file; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; - struct page *page; + void *entry; struct buffer_head bh; unsigned long vaddr = (unsigned long)vmf->virtual_address; unsigned blkbits = inode->i_blkbits; @@ -629,6 +843,11 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, int error; int major = 0; + /* + * Check whether offset isn't beyond end of file now. Caller is supposed + * to hold locks serializing us with truncate / punch hole so this is + * a reliable test. + */ size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; if (vmf->pgoff >= size) return VM_FAULT_SIGBUS; @@ -638,49 +857,17 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, bh.b_bdev = inode->i_sb->s_bdev; bh.b_size = PAGE_SIZE; - repeat: - page = find_get_page(mapping, vmf->pgoff); - if (page) { - if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) { - put_page(page); - return VM_FAULT_RETRY; - } - if (unlikely(page->mapping != mapping)) { - unlock_page(page); - put_page(page); - goto repeat; - } - size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; - if (unlikely(vmf->pgoff >= size)) { - /* - * We have a struct page covering a hole in the file - * from a read fault and we've raced with a truncate - */ - error = -EIO; - goto unlock_page; - } + entry = grab_mapping_entry(mapping, vmf->pgoff); + if (IS_ERR(entry)) { + error = PTR_ERR(entry); + goto out; } error = get_block(inode, block, &bh, 0); if (!error && (bh.b_size < PAGE_SIZE)) error = -EIO; /* fs corruption? 
*/ if (error) - goto unlock_page; - - if (!buffer_mapped(&bh) && !buffer_unwritten(&bh) && !vmf->cow_page) { - if (vmf->flags & FAULT_FLAG_WRITE) { - error = get_block(inode, block, &bh, 1); - count_vm_event(PGMAJFAULT); - mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); - major = VM_FAULT_MAJOR; - if (!error && (bh.b_size < PAGE_SIZE)) - error = -EIO; - if (error) - goto unlock_page; - } else { - return dax_load_hole(mapping, page, vmf); - } - } + goto unlock_entry; if (vmf->cow_page) { struct page *new_page = vmf->cow_page; @@ -689,53 +876,35 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, else clear_user_highpage(new_page, vaddr); if (error) - goto unlock_page; - vmf->page = page; - if (!page) { - i_mmap_lock_read(mapping); - /* Check we didn't race with truncate */ - size = (i_size_read(inode) + PAGE_SIZE - 1) >> - PAGE_SHIFT; - if (vmf->pgoff >= size) { - i_mmap_unlock_read(mapping); - error = -EIO; - goto out; - } + goto unlock_entry; + if (!radix_tree_exceptional_entry(entry)) { + vmf->page = entry; + return VM_FAULT_LOCKED; } - return VM_FAULT_LOCKED; + vmf->entry = entry; + return VM_FAULT_DAX_LOCKED; } - /* Check we didn't race with a read fault installing a new page */ - if (!page && major) - page = find_lock_page(mapping, vmf->pgoff); - - if (page) { - unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT, - PAGE_SIZE, 0); - delete_from_page_cache(page); - unlock_page(page); - put_page(page); - page = NULL; - } - - /* - * If we successfully insert the new mapping over an unwritten extent, - * we need to ensure we convert the unwritten extent. If there is an - * error inserting the mapping, the filesystem needs to leave it as - * unwritten to prevent exposure of the stale underlying data to - * userspace, but we still need to call the completion function so - * the private resources on the mapping buffer can be released. We - * indicate what the callback should do via the uptodate variable, same - * as for normal BH based IO completions. - */ - error = dax_insert_mapping(inode, &bh, vma, vmf); - if (buffer_unwritten(&bh)) { - if (complete_unwritten) - complete_unwritten(&bh, !error); - else - WARN_ON_ONCE(!(vmf->flags & FAULT_FLAG_WRITE)); + if (!buffer_mapped(&bh)) { + if (vmf->flags & FAULT_FLAG_WRITE) { + error = get_block(inode, block, &bh, 1); + count_vm_event(PGMAJFAULT); + mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); + major = VM_FAULT_MAJOR; + if (!error && (bh.b_size < PAGE_SIZE)) + error = -EIO; + if (error) + goto unlock_entry; + } else { + return dax_load_hole(mapping, entry, vmf); + } } + /* Filesystem should not return unwritten buffers to us! */ + WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh)); + error = dax_insert_mapping(mapping, &bh, &entry, vma, vmf); + unlock_entry: + put_locked_mapping_entry(mapping, vmf->pgoff, entry); out: if (error == -ENOMEM) return VM_FAULT_OOM | major; @@ -743,44 +912,10 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, if ((error < 0) && (error != -EBUSY)) return VM_FAULT_SIGBUS | major; return VM_FAULT_NOPAGE | major; - - unlock_page: - if (page) { - unlock_page(page); - put_page(page); - } - goto out; -} -EXPORT_SYMBOL(__dax_fault); - -/** - * dax_fault - handle a page fault on a DAX file - * @vma: The virtual memory area where the fault occurred - * @vmf: The description of the fault - * @get_block: The filesystem method used to translate file offsets to blocks - * - * When a page fault occurs, filesystems may call this helper in their - * fault handler for DAX files. 
- */ -int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, - get_block_t get_block, dax_iodone_t complete_unwritten) -{ - int result; - struct super_block *sb = file_inode(vma->vm_file)->i_sb; - - if (vmf->flags & FAULT_FLAG_WRITE) { - sb_start_pagefault(sb); - file_update_time(vma->vm_file); - } - result = __dax_fault(vma, vmf, get_block, complete_unwritten); - if (vmf->flags & FAULT_FLAG_WRITE) - sb_end_pagefault(sb); - - return result; } EXPORT_SYMBOL_GPL(dax_fault); -#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) /* * The 'colour' (ie low bits) within a PMD of a page offset. This comes up * more often than one might expect in the below function. @@ -805,9 +940,17 @@ static void __dax_dbg(struct buffer_head *bh, unsigned long address, #define dax_pmd_dbg(bh, address, reason) __dax_dbg(bh, address, reason, "dax_pmd") -int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmd, unsigned int flags, get_block_t get_block, - dax_iodone_t complete_unwritten) +/** + * dax_pmd_fault - handle a PMD fault on a DAX file + * @vma: The virtual memory area where the fault occurred + * @vmf: The description of the fault + * @get_block: The filesystem method used to translate file offsets to blocks + * + * When a page fault occurs, filesystems may call this helper in their + * pmd_fault handler for DAX files. + */ +int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmd, unsigned int flags, get_block_t get_block) { struct file *file = vma->vm_file; struct address_space *mapping = file->f_mapping; @@ -819,7 +962,7 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, struct block_device *bdev; pgoff_t size, pgoff; sector_t block; - int error, result = 0; + int result = 0; bool alloc = false; /* dax pmd mappings require pfn_t_devmap() */ @@ -866,6 +1009,7 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, if (get_block(inode, block, &bh, 1) != 0) return VM_FAULT_SIGBUS; alloc = true; + WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh)); } bdev = bh.b_bdev; @@ -891,26 +1035,7 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, truncate_pagecache_range(inode, lstart, lend); } - i_mmap_lock_read(mapping); - - /* - * If a truncate happened while we were allocating blocks, we may - * leave blocks allocated to the file that are beyond EOF. We can't - * take i_mutex here, so just leave them hanging; they'll be freed - * when the file is deleted. 
- */ - size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; - if (pgoff >= size) { - result = VM_FAULT_SIGBUS; - goto out; - } - if ((pgoff | PG_PMD_COLOUR) >= size) { - dax_pmd_dbg(&bh, address, - "offset + huge page size > file size"); - goto fallback; - } - - if (!write && !buffer_mapped(&bh) && buffer_uptodate(&bh)) { + if (!write && !buffer_mapped(&bh)) { spinlock_t *ptl; pmd_t entry; struct page *zero_page = get_huge_zero_page(); @@ -945,8 +1070,8 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, long length = dax_map_atomic(bdev, &dax); if (length < 0) { - result = VM_FAULT_SIGBUS; - goto out; + dax_pmd_dbg(&bh, address, "dax-error fallback"); + goto fallback; } if (length < PMD_SIZE) { dax_pmd_dbg(&bh, address, "dax-length too small"); @@ -964,14 +1089,6 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, dax_pmd_dbg(&bh, address, "pfn not in memmap"); goto fallback; } - - if (buffer_unwritten(&bh) || buffer_new(&bh)) { - clear_pmem(dax.addr, PMD_SIZE); - wmb_pmem(); - count_vm_event(PGMAJFAULT); - mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); - result |= VM_FAULT_MAJOR; - } dax_unmap_atomic(bdev, &dax); /* @@ -984,19 +1101,16 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, * * The PMD path doesn't have an equivalent to * dax_pfn_mkwrite(), though, so for a read followed by a - * write we traverse all the way through __dax_pmd_fault() + * write we traverse all the way through dax_pmd_fault() * twice. This means we can just skip inserting a radix tree * entry completely on the initial read and just wait until * the write to insert a dirty entry. */ if (write) { - error = dax_radix_entry(mapping, pgoff, dax.sector, - true, true); - if (error) { - dax_pmd_dbg(&bh, address, - "PMD radix insertion failed"); - goto fallback; - } + /* + * We should insert radix-tree entry and dirty it here. + * For now this is broken... + */ } dev_dbg(part_to_dev(bdev->bd_part), @@ -1009,11 +1123,6 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, } out: - i_mmap_unlock_read(mapping); - - if (buffer_unwritten(&bh)) - complete_unwritten(&bh, !(result & VM_FAULT_ERROR)); - return result; fallback: @@ -1021,35 +1130,6 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, result = VM_FAULT_FALLBACK; goto out; } -EXPORT_SYMBOL_GPL(__dax_pmd_fault); - -/** - * dax_pmd_fault - handle a PMD fault on a DAX file - * @vma: The virtual memory area where the fault occurred - * @vmf: The description of the fault - * @get_block: The filesystem method used to translate file offsets to blocks - * - * When a page fault occurs, filesystems may call this helper in their - * pmd_fault handler for DAX files. 
- */ -int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmd, unsigned int flags, get_block_t get_block, - dax_iodone_t complete_unwritten) -{ - int result; - struct super_block *sb = file_inode(vma->vm_file)->i_sb; - - if (flags & FAULT_FLAG_WRITE) { - sb_start_pagefault(sb); - file_update_time(vma->vm_file); - } - result = __dax_pmd_fault(vma, address, pmd, flags, get_block, - complete_unwritten); - if (flags & FAULT_FLAG_WRITE) - sb_end_pagefault(sb); - - return result; -} EXPORT_SYMBOL_GPL(dax_pmd_fault); #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ @@ -1061,27 +1141,59 @@ EXPORT_SYMBOL_GPL(dax_pmd_fault); int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { struct file *file = vma->vm_file; - int error; - - /* - * We pass NO_SECTOR to dax_radix_entry() because we expect that a - * RADIX_DAX_PTE entry already exists in the radix tree from a - * previous call to __dax_fault(). We just want to look up that PTE - * entry using vmf->pgoff and make sure the dirty tag is set. This - * saves us from having to make a call to get_block() here to look - * up the sector. - */ - error = dax_radix_entry(file->f_mapping, vmf->pgoff, NO_SECTOR, false, - true); + struct address_space *mapping = file->f_mapping; + void *entry; + pgoff_t index = vmf->pgoff; - if (error == -ENOMEM) - return VM_FAULT_OOM; - if (error) - return VM_FAULT_SIGBUS; + spin_lock_irq(&mapping->tree_lock); + entry = get_unlocked_mapping_entry(mapping, index, NULL); + if (!entry || !radix_tree_exceptional_entry(entry)) + goto out; + radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY); + put_unlocked_mapping_entry(mapping, index, entry); +out: + spin_unlock_irq(&mapping->tree_lock); return VM_FAULT_NOPAGE; } EXPORT_SYMBOL_GPL(dax_pfn_mkwrite); +static bool dax_range_is_aligned(struct block_device *bdev, + unsigned int offset, unsigned int length) +{ + unsigned short sector_size = bdev_logical_block_size(bdev); + + if (!IS_ALIGNED(offset, sector_size)) + return false; + if (!IS_ALIGNED(length, sector_size)) + return false; + + return true; +} + +int __dax_zero_page_range(struct block_device *bdev, sector_t sector, + unsigned int offset, unsigned int length) +{ + struct blk_dax_ctl dax = { + .sector = sector, + .size = PAGE_SIZE, + }; + + if (dax_range_is_aligned(bdev, offset, length)) { + sector_t start_sector = dax.sector + (offset >> 9); + + return blkdev_issue_zeroout(bdev, start_sector, + length >> 9, GFP_NOFS, true); + } else { + if (dax_map_atomic(bdev, &dax) < 0) + return PTR_ERR(dax.addr); + clear_pmem(dax.addr + offset, length); + wmb_pmem(); + dax_unmap_atomic(bdev, &dax); + } + return 0; +} +EXPORT_SYMBOL_GPL(__dax_zero_page_range); + /** * dax_zero_page_range - zero a range within a page of a DAX file * @inode: The file being truncated @@ -1093,12 +1205,6 @@ EXPORT_SYMBOL_GPL(dax_pfn_mkwrite); * page in a DAX file. This is intended for hole-punch operations. If * you are truncating a file, the helper function dax_truncate_page() may be * more convenient. - * - * We work in terms of PAGE_SIZE here for commonality with - * block_truncate_page(), but we could go down to PAGE_SIZE if the filesystem - * took care of disposing of the unnecessary blocks. Even if the filesystem - * block size is smaller than PAGE_SIZE, we have to zero the rest of the page - * since the file might be mmapped. 
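
__dax_zero_page_range() above picks its strategy by alignment: sector-aligned ranges can be handed to blkdev_issue_zeroout(), anything else is cleared by hand through the direct mapping. A sketch of that decision, with printf() placeholders for the two kernel paths:

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define IS_ALIGNED(x, a) (((x) & ((a) - 1)) == 0)

/* whole logical sectors can go to the block layer; partial ones cannot */
static bool range_is_aligned(unsigned offset, unsigned length,
                             unsigned sector_size)
{
        return IS_ALIGNED(offset, sector_size) &&
               IS_ALIGNED(length, sector_size);
}

static void zero_range(unsigned char *page, unsigned offset, unsigned length,
                       unsigned sector_size)
{
        if (range_is_aligned(offset, length, sector_size)) {
                /* stand-in for blkdev_issue_zeroout() on whole sectors */
                printf("zeroout %u sectors at sector %u\n",
                       length / sector_size, offset / sector_size);
        } else {
                /* stand-in for clear_pmem() through the direct mapping */
                memset(page + offset, 0, length);
                printf("memset %u bytes at offset %u\n", length, offset);
        }
}

int main(void)
{
        unsigned char page[4096] = { 0 };

        zero_range(page, 512, 1024, 512);   /* aligned: block-layer path */
        zero_range(page, 100, 30, 512);     /* unaligned: manual clear */
        return 0;
}
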
*/ int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length, get_block_t get_block) @@ -1117,23 +1223,11 @@ int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length, bh.b_bdev = inode->i_sb->s_bdev; bh.b_size = PAGE_SIZE; err = get_block(inode, index, &bh, 0); - if (err < 0) + if (err < 0 || !buffer_written(&bh)) return err; - if (buffer_written(&bh)) { - struct block_device *bdev = bh.b_bdev; - struct blk_dax_ctl dax = { - .sector = to_sector(&bh, inode), - .size = PAGE_SIZE, - }; - - if (dax_map_atomic(bdev, &dax) < 0) - return PTR_ERR(dax.addr); - clear_pmem(dax.addr + offset, length); - wmb_pmem(); - dax_unmap_atomic(bdev, &dax); - } - return 0; + return __dax_zero_page_range(bh.b_bdev, to_sector(&bh, inode), + offset, length); } EXPORT_SYMBOL_GPL(dax_zero_page_range); @@ -1145,12 +1239,6 @@ EXPORT_SYMBOL_GPL(dax_zero_page_range); * * Similar to block_truncate_page(), this function can be called by a * filesystem when it is truncating a DAX file to handle the partial page. - * - * We work in terms of PAGE_SIZE here for commonality with - * block_truncate_page(), but we could go down to PAGE_SIZE if the filesystem - * took care of disposing of the unnecessary blocks. Even if the filesystem - * block size is smaller than PAGE_SIZE, we have to zero the rest of the page - * since the file might be mmapped. */ int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block) { diff --git a/fs/dcache.c b/fs/dcache.c index c622872c12c5..b90cf8e09d5b 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -104,11 +104,9 @@ static unsigned int d_hash_shift __read_mostly; static struct hlist_bl_head *dentry_hashtable __read_mostly; -static inline struct hlist_bl_head *d_hash(const struct dentry *parent, - unsigned int hash) +static inline struct hlist_bl_head *d_hash(unsigned int hash) { - hash += (unsigned long) parent / L1_CACHE_BYTES; - return dentry_hashtable + hash_32(hash, d_hash_shift); + return dentry_hashtable + (hash >> (32 - d_hash_shift)); } #define IN_LOOKUP_SHIFT 10 @@ -226,10 +224,9 @@ static inline int dentry_string_cmp(const unsigned char *cs, const unsigned char static inline int dentry_cmp(const struct dentry *dentry, const unsigned char *ct, unsigned tcount) { - const unsigned char *cs; /* * Be careful about RCU walk racing with rename: - * use ACCESS_ONCE to fetch the name pointer. + * use 'lockless_dereference' to fetch the name pointer. * * NOTE! Even if a rename will mean that the length * was not loaded atomically, we don't care. The @@ -243,8 +240,8 @@ static inline int dentry_cmp(const struct dentry *dentry, const unsigned char *c * early because the data cannot match (there can * be no NUL in the ct/tcount data) */ - cs = ACCESS_ONCE(dentry->d_name.name); - smp_read_barrier_depends(); + const unsigned char *cs = lockless_dereference(dentry->d_name.name); + return dentry_string_cmp(cs, ct, tcount); } @@ -335,44 +332,21 @@ static inline void dentry_rcuwalk_invalidate(struct dentry *dentry) /* * Release the dentry's inode, using the filesystem - * d_iput() operation if defined. Dentry has no refcount - * and is unhashed. 
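
The reworked d_hash() above drops the parent-pointer mixing and simply takes the top d_hash_shift bits of the 32-bit name hash to pick a bucket. A sketch of that selection; the toy name_hash() is only here to make the example self-contained, the kernel's full_name_hash() mixes much harder:

#include <stdint.h>
#include <stdio.h>

#define HASH_SHIFT 16                   /* log2 of table size, illustrative */
#define HASH_SIZE  (1u << HASH_SHIFT)

static int table[HASH_SIZE];

/* take the top HASH_SHIFT bits: with a multiplicative final mix,
 * the high bits of the hash are the best-distributed ones */
static int *d_hash(uint32_t hash)
{
        return table + (hash >> (32 - HASH_SHIFT));
}

/* toy stand-in for full_name_hash(), for a runnable example only */
static uint32_t name_hash(const char *s)
{
        uint32_t h = 0;

        while (*s)
                h = h * 0x61C88647u + (unsigned char)*s++;
        return h;
}

int main(void)
{
        printf("bucket %ld\n", (long)(d_hash(name_hash("vmlinux")) - table));
        printf("bucket %ld\n", (long)(d_hash(name_hash("Makefile")) - table));
        return 0;
}
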
- */ -static void dentry_iput(struct dentry * dentry) - __releases(dentry->d_lock) - __releases(dentry->d_inode->i_lock) -{ - struct inode *inode = dentry->d_inode; - if (inode) { - __d_clear_type_and_inode(dentry); - hlist_del_init(&dentry->d_u.d_alias); - spin_unlock(&dentry->d_lock); - spin_unlock(&inode->i_lock); - if (!inode->i_nlink) - fsnotify_inoderemove(inode); - if (dentry->d_op && dentry->d_op->d_iput) - dentry->d_op->d_iput(dentry, inode); - else - iput(inode); - } else { - spin_unlock(&dentry->d_lock); - } -} - -/* - * Release the dentry's inode, using the filesystem - * d_iput() operation if defined. dentry remains in-use. + * d_iput() operation if defined. */ static void dentry_unlink_inode(struct dentry * dentry) __releases(dentry->d_lock) __releases(dentry->d_inode->i_lock) { struct inode *inode = dentry->d_inode; + bool hashed = !d_unhashed(dentry); - raw_write_seqcount_begin(&dentry->d_seq); + if (hashed) + raw_write_seqcount_begin(&dentry->d_seq); __d_clear_type_and_inode(dentry); hlist_del_init(&dentry->d_u.d_alias); - raw_write_seqcount_end(&dentry->d_seq); + if (hashed) + raw_write_seqcount_end(&dentry->d_seq); spin_unlock(&dentry->d_lock); spin_unlock(&inode->i_lock); if (!inode->i_nlink) @@ -488,7 +462,7 @@ void __d_drop(struct dentry *dentry) if (unlikely(IS_ROOT(dentry))) b = &dentry->d_sb->s_anon; else - b = d_hash(dentry->d_parent, dentry->d_name.hash); + b = d_hash(dentry->d_name.hash); hlist_bl_lock(b); __hlist_bl_del(&dentry->d_hash); @@ -507,6 +481,44 @@ void d_drop(struct dentry *dentry) } EXPORT_SYMBOL(d_drop); +static inline void dentry_unlist(struct dentry *dentry, struct dentry *parent) +{ + struct dentry *next; + /* + * Inform d_walk() and shrink_dentry_list() that we are no longer + * attached to the dentry tree + */ + dentry->d_flags |= DCACHE_DENTRY_KILLED; + if (unlikely(list_empty(&dentry->d_child))) + return; + __list_del_entry(&dentry->d_child); + /* + * Cursors can move around the list of children. While we'd been + * a normal list member, it didn't matter - ->d_child.next would've + * been updated. However, from now on it won't be and for the + * things like d_walk() it might end up with a nasty surprise. + * Normally d_walk() doesn't care about cursors moving around - + * ->d_lock on parent prevents that and since a cursor has no children + * of its own, we get through it without ever unlocking the parent. + * There is one exception, though - if we ascend from a child that + * gets killed as soon as we unlock it, the next sibling is found + * using the value left in its ->d_child.next. And if _that_ + * pointed to a cursor, and cursor got moved (e.g. by lseek()) + * before d_walk() regains parent->d_lock, we'll end up skipping + * everything the cursor had been moved past. + * + * Solution: make sure that the pointer left behind in ->d_child.next + * points to something that won't be moving around. I.e. skip the + * cursors. 
+ */ + while (dentry->d_child.next != &parent->d_subdirs) { + next = list_entry(dentry->d_child.next, struct dentry, d_child); + if (likely(!(next->d_flags & DCACHE_DENTRY_CURSOR))) + break; + dentry->d_child.next = next->d_child.next; + } +} + static void __dentry_kill(struct dentry *dentry) { struct dentry *parent = NULL; @@ -532,20 +544,13 @@ static void __dentry_kill(struct dentry *dentry) } /* if it was on the hash then remove it */ __d_drop(dentry); - __list_del_entry(&dentry->d_child); - /* - * Inform d_walk() that we are no longer attached to the - * dentry tree - */ - dentry->d_flags |= DCACHE_DENTRY_KILLED; + dentry_unlist(dentry, parent); if (parent) spin_unlock(&parent->d_lock); - dentry_iput(dentry); - /* - * dentry_iput drops the locks, at which point nobody (except - * transient RCU lookups) can reach this dentry. - */ - BUG_ON(dentry->d_lockref.count > 0); + if (dentry->d_inode) + dentry_unlink_inode(dentry); + else + spin_unlock(&dentry->d_lock); this_cpu_dec(nr_dentry); if (dentry->d_op && dentry->d_op->d_release) dentry->d_op->d_release(dentry); @@ -589,7 +594,6 @@ static struct dentry *dentry_kill(struct dentry *dentry) failed: spin_unlock(&dentry->d_lock); - cpu_relax(); return dentry; /* try again with same dentry */ } @@ -763,6 +767,8 @@ void dput(struct dentry *dentry) return; repeat: + might_sleep(); + rcu_read_lock(); if (likely(fast_dput(dentry))) { rcu_read_unlock(); @@ -796,8 +802,10 @@ repeat: kill_it: dentry = dentry_kill(dentry); - if (dentry) + if (dentry) { + cond_resched(); goto repeat; + } } EXPORT_SYMBOL(dput); @@ -1203,6 +1211,9 @@ resume: struct dentry *dentry = list_entry(tmp, struct dentry, d_child); next = tmp->next; + if (unlikely(dentry->d_flags & DCACHE_DENTRY_CURSOR)) + continue; + spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); ret = enter(data, dentry); @@ -1559,6 +1570,7 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) { struct dentry *dentry; char *dname; + int err; dentry = kmem_cache_alloc(dentry_cache, GFP_KERNEL); if (!dentry) @@ -1617,6 +1629,16 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) INIT_LIST_HEAD(&dentry->d_child); d_set_d_op(dentry, dentry->d_sb->s_d_op); + if (dentry->d_op && dentry->d_op->d_init) { + err = dentry->d_op->d_init(dentry); + if (err) { + if (dname_external(dentry)) + kfree(external_name(dentry)); + kmem_cache_free(dentry_cache, dentry); + return NULL; + } + } + this_cpu_inc(nr_dentry); return dentry; @@ -1636,7 +1658,7 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name) struct dentry *dentry = __d_alloc(parent->d_sb, name); if (!dentry) return NULL; - + dentry->d_flags |= DCACHE_RCUACCESS; spin_lock(&parent->d_lock); /* * don't need child lock because it is not subject @@ -1651,6 +1673,16 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name) } EXPORT_SYMBOL(d_alloc); +struct dentry *d_alloc_cursor(struct dentry * parent) +{ + struct dentry *dentry = __d_alloc(parent->d_sb, NULL); + if (dentry) { + dentry->d_flags |= DCACHE_RCUACCESS | DCACHE_DENTRY_CURSOR; + dentry->d_parent = dget(parent); + } + return dentry; +} + /** * d_alloc_pseudo - allocate a dentry (for lookup-less filesystems) * @sb: the superblock @@ -1670,8 +1702,7 @@ struct dentry *d_alloc_name(struct dentry *parent, const char *name) struct qstr q; q.name = name; - q.len = strlen(name); - q.hash = full_name_hash(q.name, q.len); + q.hash_len = hashlen_string(parent, name); return d_alloc(parent, &q); } EXPORT_SYMBOL(d_alloc_name); @@ 
-1684,7 +1715,6 @@ void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op) DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE | DCACHE_OP_DELETE | - DCACHE_OP_SELECT_INODE | DCACHE_OP_REAL)); dentry->d_op = op; if (!op) @@ -1701,8 +1731,6 @@ void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op) dentry->d_flags |= DCACHE_OP_DELETE; if (op->d_prune) dentry->d_flags |= DCACHE_OP_PRUNE; - if (op->d_select_inode) - dentry->d_flags |= DCACHE_OP_SELECT_INODE; if (op->d_real) dentry->d_flags |= DCACHE_OP_REAL; @@ -1770,7 +1798,7 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode) raw_write_seqcount_begin(&dentry->d_seq); __d_set_inode_and_type(dentry, inode, add_flags); raw_write_seqcount_end(&dentry->d_seq); - __fsnotify_d_instantiate(dentry); + fsnotify_update_flags(dentry); spin_unlock(&dentry->d_lock); } @@ -2022,42 +2050,19 @@ struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode, } EXPORT_SYMBOL(d_add_ci); -/* - * Do the slow-case of the dentry name compare. - * - * Unlike the dentry_cmp() function, we need to atomically - * load the name and length information, so that the - * filesystem can rely on them, and can use the 'name' and - * 'len' information without worrying about walking off the - * end of memory etc. - * - * Thus the read_seqcount_retry() and the "duplicate" info - * in arguments (the low-level filesystem should not look - * at the dentry inode or name contents directly, since - * rename can change them while we're in RCU mode). - */ -enum slow_d_compare { - D_COMP_OK, - D_COMP_NOMATCH, - D_COMP_SEQRETRY, -}; -static noinline enum slow_d_compare slow_dentry_cmp( - const struct dentry *parent, - struct dentry *dentry, - unsigned int seq, - const struct qstr *name) +static inline bool d_same_name(const struct dentry *dentry, + const struct dentry *parent, + const struct qstr *name) { - int tlen = dentry->d_name.len; - const char *tname = dentry->d_name.name; - - if (read_seqcount_retry(&dentry->d_seq, seq)) { - cpu_relax(); - return D_COMP_SEQRETRY; + if (likely(!(parent->d_flags & DCACHE_OP_COMPARE))) { + if (dentry->d_name.len != name->len) + return false; + return dentry_cmp(dentry, name->name, name->len) == 0; } - if (parent->d_op->d_compare(parent, dentry, tlen, tname, name)) - return D_COMP_NOMATCH; - return D_COMP_OK; + return parent->d_op->d_compare(parent, dentry, + dentry->d_name.len, dentry->d_name.name, + name) == 0; } /** @@ -2095,7 +2100,7 @@ struct dentry *__d_lookup_rcu(const struct dentry *parent, { u64 hashlen = name->hash_len; const unsigned char *str = name->name; - struct hlist_bl_head *b = d_hash(parent, hashlen_hash(hashlen)); + struct hlist_bl_head *b = d_hash(hashlen_hash(hashlen)); struct hlist_bl_node *node; struct dentry *dentry; @@ -2136,6 +2141,9 @@ seqretry: * dentry compare, we will do seqretries until it is stable, * and if we end up with a successful lookup, we actually * want to exit RCU lookup anyway. + * + * Note that raw_seqcount_begin still *does* smp_rmb(), so + * we are still guaranteed NUL-termination of ->d_name.name. 
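The lookup loop resumed just below is the classic seqcount read pattern: sample ->d_seq, copy the fields, and retry if a concurrent rename bumped the sequence in between. As a self-contained analogy, hand-rolled in C11 atomics (an illustration only, not the kernel's seqcount_t implementation):

#include <stdatomic.h>

struct seq { atomic_uint s; };	/* even = stable, odd = writer active */

static unsigned begin_read(struct seq *q)
{
	unsigned v;
	while ((v = atomic_load_explicit(&q->s, memory_order_acquire)) & 1)
		;	/* writer in progress: wait for an even value */
	return v;
}

static int need_retry(struct seq *q, unsigned v)
{
	atomic_thread_fence(memory_order_acquire);
	return atomic_load_explicit(&q->s, memory_order_relaxed) != v;
}

/*
 * reader: do { v = begin_read(&q); copy the fields; } while (need_retry(&q, v));
 * (in a fully portable version the protected fields would themselves be
 * relaxed atomics)
 */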
*/ seq = raw_seqcount_begin(&dentry->d_seq); if (dentry->d_parent != parent) @@ -2144,24 +2152,28 @@ seqretry: continue; if (unlikely(parent->d_flags & DCACHE_OP_COMPARE)) { + int tlen; + const char *tname; if (dentry->d_name.hash != hashlen_hash(hashlen)) continue; - *seqp = seq; - switch (slow_dentry_cmp(parent, dentry, seq, name)) { - case D_COMP_OK: - return dentry; - case D_COMP_NOMATCH: - continue; - default: + tlen = dentry->d_name.len; + tname = dentry->d_name.name; + /* we want a consistent (name,len) pair */ + if (read_seqcount_retry(&dentry->d_seq, seq)) { + cpu_relax(); goto seqretry; } + if (parent->d_op->d_compare(parent, dentry, + tlen, tname, name) != 0) + continue; + } else { + if (dentry->d_name.hash_len != hashlen) + continue; + if (dentry_cmp(dentry, str, hashlen_len(hashlen)) != 0) + continue; } - - if (dentry->d_name.hash_len != hashlen) - continue; *seqp = seq; - if (!dentry_cmp(dentry, str, hashlen_len(hashlen))) - return dentry; + return dentry; } return NULL; } @@ -2209,10 +2221,8 @@ EXPORT_SYMBOL(d_lookup); */ struct dentry *__d_lookup(const struct dentry *parent, const struct qstr *name) { - unsigned int len = name->len; unsigned int hash = name->hash; - const unsigned char *str = name->name; - struct hlist_bl_head *b = d_hash(parent, hash); + struct hlist_bl_head *b = d_hash(hash); struct hlist_bl_node *node; struct dentry *found = NULL; struct dentry *dentry; @@ -2250,21 +2260,8 @@ struct dentry *__d_lookup(const struct dentry *parent, const struct qstr *name) if (d_unhashed(dentry)) goto next; - /* - * It is safe to compare names since d_move() cannot - * change the qstr (protected by d_lock). - */ - if (parent->d_flags & DCACHE_OP_COMPARE) { - int tlen = dentry->d_name.len; - const char *tname = dentry->d_name.name; - if (parent->d_op->d_compare(parent, dentry, tlen, tname, name)) - goto next; - } else { - if (dentry->d_name.len != len) - goto next; - if (dentry_cmp(dentry, str, len)) - goto next; - } + if (!d_same_name(dentry, parent, name)) + goto next; dentry->d_lockref.count++; found = dentry; @@ -2292,7 +2289,7 @@ struct dentry *d_hash_and_lookup(struct dentry *dir, struct qstr *name) * calculate the standard hash first, as the d_op->d_hash() * routine may choose to leave the hash value unchanged. */ - name->hash = full_name_hash(name->name, name->len); + name->hash = full_name_hash(dir, name->name, name->len); if (dir->d_flags & DCACHE_OP_HASH) { int err = dir->d_op->d_hash(dir, name); if (unlikely(err < 0)) @@ -2359,14 +2356,13 @@ static void __d_rehash(struct dentry * entry, struct hlist_bl_head *b) { BUG_ON(!d_unhashed(entry)); hlist_bl_lock(b); - entry->d_flags |= DCACHE_RCUACCESS; hlist_bl_add_head_rcu(&entry->d_hash, b); hlist_bl_unlock(b); } static void _d_rehash(struct dentry * entry) { - __d_rehash(entry, d_hash(entry->d_parent, entry->d_name.hash)); + __d_rehash(entry, d_hash(entry->d_name.hash)); } /** @@ -2418,9 +2414,7 @@ struct dentry *d_alloc_parallel(struct dentry *parent, const struct qstr *name, wait_queue_head_t *wq) { - unsigned int len = name->len; unsigned int hash = name->hash; - const unsigned char *str = name->name; struct hlist_bl_head *b = in_lookup_hash(parent, hash); struct hlist_bl_node *node; struct dentry *new = d_alloc(parent, name); @@ -2459,7 +2453,6 @@ retry: rcu_read_unlock(); goto retry; } - rcu_read_unlock(); /* * No changes for the parent since the beginning of d_lookup(). 
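Note how d_hash_and_lookup() above now seeds full_name_hash() with the directory dentry, and d_alloc_name() earlier does the same through hashlen_string(). That per-parent salt is what allowed d_hash() to stop mixing the parent pointer in. Illustratively (parent_a and parent_b are hypothetical dentries):

	struct qstr q = QSTR_INIT("conf", 4);

	/* same name, different salt: (almost certainly) different buckets */
	unsigned int h1 = full_name_hash(parent_a, q.name, q.len);
	unsigned int h2 = full_name_hash(parent_b, q.name, q.len);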
* Since all removals from the chain happen with hlist_bl_lock(), @@ -2472,22 +2465,20 @@ retry: continue; if (dentry->d_parent != parent) continue; - if (d_unhashed(dentry)) + if (!d_same_name(dentry, parent, name)) continue; - if (parent->d_flags & DCACHE_OP_COMPARE) { - int tlen = dentry->d_name.len; - const char *tname = dentry->d_name.name; - if (parent->d_op->d_compare(parent, dentry, tlen, tname, name)) - continue; - } else { - if (dentry->d_name.len != len) - continue; - if (dentry_cmp(dentry, str, len)) - continue; - } - dget(dentry); hlist_bl_unlock(b); - /* somebody is doing lookup for it right now; wait for it */ + /* now we can try to grab a reference */ + if (!lockref_get_not_dead(&dentry->d_lockref)) { + rcu_read_unlock(); + goto retry; + } + + rcu_read_unlock(); + /* + * somebody is likely to be still doing lookup for it; + * wait for them to finish + */ spin_lock(&dentry->d_lock); d_wait_lookup(dentry); /* @@ -2502,22 +2493,14 @@ retry: goto mismatch; if (unlikely(d_unhashed(dentry))) goto mismatch; - if (parent->d_flags & DCACHE_OP_COMPARE) { - int tlen = dentry->d_name.len; - const char *tname = dentry->d_name.name; - if (parent->d_op->d_compare(parent, dentry, tlen, tname, name)) - goto mismatch; - } else { - if (unlikely(dentry->d_name.len != len)) - goto mismatch; - if (unlikely(dentry_cmp(dentry, str, len))) - goto mismatch; - } + if (unlikely(!d_same_name(dentry, parent, name))) + goto mismatch; /* OK, it *is* a hashed match; return it */ spin_unlock(&dentry->d_lock); dput(new); return dentry; } + rcu_read_unlock(); /* we can't take ->d_lock here; it's OK, though. */ new->d_flags |= DCACHE_PAR_LOOKUP; new->d_wait = wq; @@ -2564,7 +2547,7 @@ static inline void __d_add(struct dentry *dentry, struct inode *inode) raw_write_seqcount_begin(&dentry->d_seq); __d_set_inode_and_type(dentry, inode, add_flags); raw_write_seqcount_end(&dentry->d_seq); - __fsnotify_d_instantiate(dentry); + fsnotify_update_flags(dentry); } _d_rehash(dentry); if (dir) @@ -2607,8 +2590,6 @@ EXPORT_SYMBOL(d_add); struct dentry *d_exact_alias(struct dentry *entry, struct inode *inode) { struct dentry *alias; - int len = entry->d_name.len; - const char *name = entry->d_name.name; unsigned int hash = entry->d_name.hash; spin_lock(&inode->i_lock); @@ -2622,9 +2603,7 @@ struct dentry *d_exact_alias(struct dentry *entry, struct inode *inode) continue; if (alias->d_parent != entry->d_parent) continue; - if (alias->d_name.len != len) - continue; - if (dentry_cmp(alias, name, len)) + if (!d_same_name(alias, entry->d_parent, &entry->d_name)) continue; spin_lock(&alias->d_lock); if (!d_unhashed(alias)) { @@ -2823,7 +2802,7 @@ static void __d_move(struct dentry *dentry, struct dentry *target, * for the same hash queue because of how unlikely it is. */ __d_drop(dentry); - __d_rehash(dentry, d_hash(target->d_parent, target->d_name.hash)); + __d_rehash(dentry, d_hash(target->d_name.hash)); /* * Unhash the target (d_delete() is not usable here). If exchanging @@ -2831,8 +2810,7 @@ static void __d_move(struct dentry *dentry, struct dentry *target, */ __d_drop(target); if (exchange) { - __d_rehash(target, - d_hash(dentry->d_parent, dentry->d_name.hash)); + __d_rehash(target, d_hash(dentry->d_name.hash)); } /* Switch the names.. */ @@ -2844,6 +2822,7 @@ static void __d_move(struct dentry *dentry, struct dentry *target, /* ... 
and switch them in the tree */ if (IS_ROOT(dentry)) { /* splicing a tree */ + dentry->d_flags |= DCACHE_RCUACCESS; dentry->d_parent = target->d_parent; target->d_parent = target; list_del_init(&target->d_child); @@ -2854,8 +2833,8 @@ static void __d_move(struct dentry *dentry, struct dentry *target, list_move(&target->d_child, &target->d_parent->d_subdirs); list_move(&dentry->d_child, &dentry->d_parent->d_subdirs); if (exchange) - fsnotify_d_move(target); - fsnotify_d_move(dentry); + fsnotify_update_flags(target); + fsnotify_update_flags(dentry); } write_seqcount_end(&target->d_seq); diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index d2ba12e23ed9..592059f88e04 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -22,6 +22,12 @@ #include <linux/slab.h> #include <linux/atomic.h> #include <linux/device.h> +#include <linux/srcu.h> +#include <asm/poll.h> + +#include "internal.h" + +struct poll_table_struct; static ssize_t default_read_file(struct file *file, char __user *buf, size_t count, loff_t *ppos) @@ -35,27 +41,294 @@ static ssize_t default_write_file(struct file *file, const char __user *buf, return count; } -const struct file_operations debugfs_file_operations = { +const struct file_operations debugfs_noop_file_operations = { .read = default_read_file, .write = default_write_file, .open = simple_open, .llseek = noop_llseek, }; -static struct dentry *debugfs_create_mode(const char *name, umode_t mode, - struct dentry *parent, void *value, - const struct file_operations *fops, - const struct file_operations *fops_ro, - const struct file_operations *fops_wo) +/** + * debugfs_use_file_start - mark the beginning of file data access + * @dentry: the dentry object whose data is being accessed. + * @srcu_idx: a pointer to some memory to store a SRCU index in. + * + * Up to a matching call to debugfs_use_file_finish(), any + * successive call into the file removing functions debugfs_remove() + * and debugfs_remove_recursive() will block. Since associated private + * file data may only get freed after a successful return of any of + * the removal functions, you may safely access it after a successful + * call to debugfs_use_file_start() without worrying about + * lifetime issues. + * + * If -%EIO is returned, the file has already been removed and thus, + * it is not safe to access any of its data. If, on the other hand, + * it is allowed to access the file data, zero is returned. + * + * Regardless of the return code, any call to + * debugfs_use_file_start() must be followed by a matching call + * to debugfs_use_file_finish(). + */ +int debugfs_use_file_start(const struct dentry *dentry, int *srcu_idx) + __acquires(&debugfs_srcu) +{ + *srcu_idx = srcu_read_lock(&debugfs_srcu); + barrier(); + if (d_unlinked(dentry)) + return -EIO; + return 0; +} +EXPORT_SYMBOL_GPL(debugfs_use_file_start); + +/** + * debugfs_use_file_finish - mark the end of file data access + * @srcu_idx: the SRCU index "created" by a former call to + * debugfs_use_file_start(). + * + * Allow any ongoing concurrent call into debugfs_remove() or + * debugfs_remove_recursive() blocked by a former call to + * debugfs_use_file_start() to proceed and return to its caller. 
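The contract spelled out in the kernel-doc above translates into a fixed pattern in any protected file operation. A hypothetical driver read method (my_read, MY_DATA_SIZE, and the use of ->private_data are illustrative, not from this patch):

static ssize_t my_read(struct file *file, char __user *buf,
		       size_t count, loff_t *ppos)
{
	ssize_t ret;
	int srcu_idx;

	ret = debugfs_use_file_start(file->f_path.dentry, &srcu_idx);
	if (likely(!ret))	/* file still alive: its data is safe to touch */
		ret = simple_read_from_buffer(buf, count, ppos,
					      file->private_data, MY_DATA_SIZE);
	/* must be called on every path, even after -EIO */
	debugfs_use_file_finish(srcu_idx);
	return ret;
}

This is exactly the shape of the debugfs_attr_read()/debugfs_attr_write() wrappers introduced further down.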
+ */ +void debugfs_use_file_finish(int srcu_idx) __releases(&debugfs_srcu) +{ + srcu_read_unlock(&debugfs_srcu, srcu_idx); +} +EXPORT_SYMBOL_GPL(debugfs_use_file_finish); + +#define F_DENTRY(filp) ((filp)->f_path.dentry) + +#define REAL_FOPS_DEREF(dentry) \ + ((const struct file_operations *)(dentry)->d_fsdata) + +static int open_proxy_open(struct inode *inode, struct file *filp) +{ + const struct dentry *dentry = F_DENTRY(filp); + const struct file_operations *real_fops = NULL; + int srcu_idx, r; + + r = debugfs_use_file_start(dentry, &srcu_idx); + if (r) { + r = -ENOENT; + goto out; + } + + real_fops = REAL_FOPS_DEREF(dentry); + real_fops = fops_get(real_fops); + if (!real_fops) { + /* Huh? Module did not clean up after itself at exit? */ + WARN(1, "debugfs file owner did not clean up at exit: %pd", + dentry); + r = -ENXIO; + goto out; + } + replace_fops(filp, real_fops); + + if (real_fops->open) + r = real_fops->open(inode, filp); + +out: + debugfs_use_file_finish(srcu_idx); + return r; +} + +const struct file_operations debugfs_open_proxy_file_operations = { + .open = open_proxy_open, +}; + +#define PROTO(args...) args +#define ARGS(args...) args + +#define FULL_PROXY_FUNC(name, ret_type, filp, proto, args) \ +static ret_type full_proxy_ ## name(proto) \ +{ \ + const struct dentry *dentry = F_DENTRY(filp); \ + const struct file_operations *real_fops = \ + REAL_FOPS_DEREF(dentry); \ + int srcu_idx; \ + ret_type r; \ + \ + r = debugfs_use_file_start(dentry, &srcu_idx); \ + if (likely(!r)) \ + r = real_fops->name(args); \ + debugfs_use_file_finish(srcu_idx); \ + return r; \ +} + +FULL_PROXY_FUNC(llseek, loff_t, filp, + PROTO(struct file *filp, loff_t offset, int whence), + ARGS(filp, offset, whence)); + +FULL_PROXY_FUNC(read, ssize_t, filp, + PROTO(struct file *filp, char __user *buf, size_t size, + loff_t *ppos), + ARGS(filp, buf, size, ppos)); + +FULL_PROXY_FUNC(write, ssize_t, filp, + PROTO(struct file *filp, const char __user *buf, size_t size, + loff_t *ppos), + ARGS(filp, buf, size, ppos)); + +FULL_PROXY_FUNC(unlocked_ioctl, long, filp, + PROTO(struct file *filp, unsigned int cmd, unsigned long arg), + ARGS(filp, cmd, arg)); + +static unsigned int full_proxy_poll(struct file *filp, + struct poll_table_struct *wait) +{ + const struct dentry *dentry = F_DENTRY(filp); + const struct file_operations *real_fops = REAL_FOPS_DEREF(dentry); + int srcu_idx; + unsigned int r = 0; + + if (debugfs_use_file_start(dentry, &srcu_idx)) { + debugfs_use_file_finish(srcu_idx); + return POLLHUP; + } + + r = real_fops->poll(filp, wait); + debugfs_use_file_finish(srcu_idx); + return r; +} + +static int full_proxy_release(struct inode *inode, struct file *filp) +{ + const struct dentry *dentry = F_DENTRY(filp); + const struct file_operations *real_fops = REAL_FOPS_DEREF(dentry); + const struct file_operations *proxy_fops = filp->f_op; + int r = 0; + + /* + * We must not protect this against removal races here: the + * original releaser should be called unconditionally in order + * not to leak any resources. Releasers must not assume that + * ->i_private is still being meaningful here. 
+ */ + if (real_fops->release) + r = real_fops->release(inode, filp); + + replace_fops(filp, d_inode(dentry)->i_fop); + kfree((void *)proxy_fops); + fops_put(real_fops); + return 0; +} + +static void __full_proxy_fops_init(struct file_operations *proxy_fops, + const struct file_operations *real_fops) +{ + proxy_fops->release = full_proxy_release; + if (real_fops->llseek) + proxy_fops->llseek = full_proxy_llseek; + if (real_fops->read) + proxy_fops->read = full_proxy_read; + if (real_fops->write) + proxy_fops->write = full_proxy_write; + if (real_fops->poll) + proxy_fops->poll = full_proxy_poll; + if (real_fops->unlocked_ioctl) + proxy_fops->unlocked_ioctl = full_proxy_unlocked_ioctl; +} + +static int full_proxy_open(struct inode *inode, struct file *filp) +{ + const struct dentry *dentry = F_DENTRY(filp); + const struct file_operations *real_fops = NULL; + struct file_operations *proxy_fops = NULL; + int srcu_idx, r; + + r = debugfs_use_file_start(dentry, &srcu_idx); + if (r) { + r = -ENOENT; + goto out; + } + + real_fops = REAL_FOPS_DEREF(dentry); + real_fops = fops_get(real_fops); + if (!real_fops) { + /* Huh? Module did not cleanup after itself at exit? */ + WARN(1, "debugfs file owner did not clean up at exit: %pd", + dentry); + r = -ENXIO; + goto out; + } + + proxy_fops = kzalloc(sizeof(*proxy_fops), GFP_KERNEL); + if (!proxy_fops) { + r = -ENOMEM; + goto free_proxy; + } + __full_proxy_fops_init(proxy_fops, real_fops); + replace_fops(filp, proxy_fops); + + if (real_fops->open) { + r = real_fops->open(inode, filp); + if (r) { + replace_fops(filp, d_inode(dentry)->i_fop); + goto free_proxy; + } else if (filp->f_op != proxy_fops) { + /* No protection against file removal anymore. */ + WARN(1, "debugfs file owner replaced proxy fops: %pd", + dentry); + goto free_proxy; + } + } + + goto out; +free_proxy: + kfree(proxy_fops); + fops_put(real_fops); +out: + debugfs_use_file_finish(srcu_idx); + return r; +} + +const struct file_operations debugfs_full_proxy_file_operations = { + .open = full_proxy_open, +}; + +ssize_t debugfs_attr_read(struct file *file, char __user *buf, + size_t len, loff_t *ppos) +{ + ssize_t ret; + int srcu_idx; + + ret = debugfs_use_file_start(F_DENTRY(file), &srcu_idx); + if (likely(!ret)) + ret = simple_attr_read(file, buf, len, ppos); + debugfs_use_file_finish(srcu_idx); + return ret; +} +EXPORT_SYMBOL_GPL(debugfs_attr_read); + +ssize_t debugfs_attr_write(struct file *file, const char __user *buf, + size_t len, loff_t *ppos) +{ + ssize_t ret; + int srcu_idx; + + ret = debugfs_use_file_start(F_DENTRY(file), &srcu_idx); + if (likely(!ret)) + ret = simple_attr_write(file, buf, len, ppos); + debugfs_use_file_finish(srcu_idx); + return ret; +} +EXPORT_SYMBOL_GPL(debugfs_attr_write); + +static struct dentry *debugfs_create_mode_unsafe(const char *name, umode_t mode, + struct dentry *parent, void *value, + const struct file_operations *fops, + const struct file_operations *fops_ro, + const struct file_operations *fops_wo) { /* if there are no write bits set, make read only */ if (!(mode & S_IWUGO)) - return debugfs_create_file(name, mode, parent, value, fops_ro); + return debugfs_create_file_unsafe(name, mode, parent, value, + fops_ro); /* if there are no read bits set, make write only */ if (!(mode & S_IRUGO)) - return debugfs_create_file(name, mode, parent, value, fops_wo); + return debugfs_create_file_unsafe(name, mode, parent, value, + fops_wo); - return debugfs_create_file(name, mode, parent, value, fops); + return debugfs_create_file_unsafe(name, mode, parent, value, 
fops); } static int debugfs_u8_set(void *data, u64 val) @@ -68,9 +341,9 @@ static int debugfs_u8_get(void *data, u64 *val) *val = *(u8 *)data; return 0; } -DEFINE_SIMPLE_ATTRIBUTE(fops_u8, debugfs_u8_get, debugfs_u8_set, "%llu\n"); -DEFINE_SIMPLE_ATTRIBUTE(fops_u8_ro, debugfs_u8_get, NULL, "%llu\n"); -DEFINE_SIMPLE_ATTRIBUTE(fops_u8_wo, NULL, debugfs_u8_set, "%llu\n"); +DEFINE_DEBUGFS_ATTRIBUTE(fops_u8, debugfs_u8_get, debugfs_u8_set, "%llu\n"); +DEFINE_DEBUGFS_ATTRIBUTE(fops_u8_ro, debugfs_u8_get, NULL, "%llu\n"); +DEFINE_DEBUGFS_ATTRIBUTE(fops_u8_wo, NULL, debugfs_u8_set, "%llu\n"); /** * debugfs_create_u8 - create a debugfs file that is used to read and write an unsigned 8-bit value @@ -99,7 +372,7 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u8_wo, NULL, debugfs_u8_set, "%llu\n"); struct dentry *debugfs_create_u8(const char *name, umode_t mode, struct dentry *parent, u8 *value) { - return debugfs_create_mode(name, mode, parent, value, &fops_u8, + return debugfs_create_mode_unsafe(name, mode, parent, value, &fops_u8, &fops_u8_ro, &fops_u8_wo); } EXPORT_SYMBOL_GPL(debugfs_create_u8); @@ -114,9 +387,9 @@ static int debugfs_u16_get(void *data, u64 *val) *val = *(u16 *)data; return 0; } -DEFINE_SIMPLE_ATTRIBUTE(fops_u16, debugfs_u16_get, debugfs_u16_set, "%llu\n"); -DEFINE_SIMPLE_ATTRIBUTE(fops_u16_ro, debugfs_u16_get, NULL, "%llu\n"); -DEFINE_SIMPLE_ATTRIBUTE(fops_u16_wo, NULL, debugfs_u16_set, "%llu\n"); +DEFINE_DEBUGFS_ATTRIBUTE(fops_u16, debugfs_u16_get, debugfs_u16_set, "%llu\n"); +DEFINE_DEBUGFS_ATTRIBUTE(fops_u16_ro, debugfs_u16_get, NULL, "%llu\n"); +DEFINE_DEBUGFS_ATTRIBUTE(fops_u16_wo, NULL, debugfs_u16_set, "%llu\n"); /** * debugfs_create_u16 - create a debugfs file that is used to read and write an unsigned 16-bit value @@ -145,7 +418,7 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u16_wo, NULL, debugfs_u16_set, "%llu\n"); struct dentry *debugfs_create_u16(const char *name, umode_t mode, struct dentry *parent, u16 *value) { - return debugfs_create_mode(name, mode, parent, value, &fops_u16, + return debugfs_create_mode_unsafe(name, mode, parent, value, &fops_u16, &fops_u16_ro, &fops_u16_wo); } EXPORT_SYMBOL_GPL(debugfs_create_u16); @@ -160,9 +433,9 @@ static int debugfs_u32_get(void *data, u64 *val) *val = *(u32 *)data; return 0; } -DEFINE_SIMPLE_ATTRIBUTE(fops_u32, debugfs_u32_get, debugfs_u32_set, "%llu\n"); -DEFINE_SIMPLE_ATTRIBUTE(fops_u32_ro, debugfs_u32_get, NULL, "%llu\n"); -DEFINE_SIMPLE_ATTRIBUTE(fops_u32_wo, NULL, debugfs_u32_set, "%llu\n"); +DEFINE_DEBUGFS_ATTRIBUTE(fops_u32, debugfs_u32_get, debugfs_u32_set, "%llu\n"); +DEFINE_DEBUGFS_ATTRIBUTE(fops_u32_ro, debugfs_u32_get, NULL, "%llu\n"); +DEFINE_DEBUGFS_ATTRIBUTE(fops_u32_wo, NULL, debugfs_u32_set, "%llu\n"); /** * debugfs_create_u32 - create a debugfs file that is used to read and write an unsigned 32-bit value @@ -191,7 +464,7 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u32_wo, NULL, debugfs_u32_set, "%llu\n"); struct dentry *debugfs_create_u32(const char *name, umode_t mode, struct dentry *parent, u32 *value) { - return debugfs_create_mode(name, mode, parent, value, &fops_u32, + return debugfs_create_mode_unsafe(name, mode, parent, value, &fops_u32, &fops_u32_ro, &fops_u32_wo); } EXPORT_SYMBOL_GPL(debugfs_create_u32); @@ -207,9 +480,9 @@ static int debugfs_u64_get(void *data, u64 *val) *val = *(u64 *)data; return 0; } -DEFINE_SIMPLE_ATTRIBUTE(fops_u64, debugfs_u64_get, debugfs_u64_set, "%llu\n"); -DEFINE_SIMPLE_ATTRIBUTE(fops_u64_ro, debugfs_u64_get, NULL, "%llu\n"); -DEFINE_SIMPLE_ATTRIBUTE(fops_u64_wo, NULL, debugfs_u64_set, "%llu\n"); 
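The switch from DEFINE_SIMPLE_ATTRIBUTE() to DEFINE_DEBUGFS_ATTRIBUTE() routes reads and writes through the debugfs_attr_read()/debugfs_attr_write() wrappers above, so these attribute files carry their own removal protection and can be created via the cheaper debugfs_create_file_unsafe() instead of the full proxy. Driver-side, the pairing would look roughly like this (my_counter and friends are hypothetical names):

static u32 my_counter;

static int my_counter_get(void *data, u64 *val)
{
	*val = *(u32 *)data;
	return 0;
}

static int my_counter_set(void *data, u64 val)
{
	*(u32 *)data = val;
	return 0;
}
DEFINE_DEBUGFS_ATTRIBUTE(fops_my_counter, my_counter_get,
			 my_counter_set, "%llu\n");

	/* in init code: */
	debugfs_create_file_unsafe("my_counter", 0644, parent,
				   &my_counter, &fops_my_counter);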
+DEFINE_DEBUGFS_ATTRIBUTE(fops_u64, debugfs_u64_get, debugfs_u64_set, "%llu\n"); +DEFINE_DEBUGFS_ATTRIBUTE(fops_u64_ro, debugfs_u64_get, NULL, "%llu\n"); +DEFINE_DEBUGFS_ATTRIBUTE(fops_u64_wo, NULL, debugfs_u64_set, "%llu\n"); /** * debugfs_create_u64 - create a debugfs file that is used to read and write an unsigned 64-bit value @@ -238,7 +511,7 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u64_wo, NULL, debugfs_u64_set, "%llu\n"); struct dentry *debugfs_create_u64(const char *name, umode_t mode, struct dentry *parent, u64 *value) { - return debugfs_create_mode(name, mode, parent, value, &fops_u64, + return debugfs_create_mode_unsafe(name, mode, parent, value, &fops_u64, &fops_u64_ro, &fops_u64_wo); } EXPORT_SYMBOL_GPL(debugfs_create_u64); @@ -254,9 +527,10 @@ static int debugfs_ulong_get(void *data, u64 *val) *val = *(unsigned long *)data; return 0; } -DEFINE_SIMPLE_ATTRIBUTE(fops_ulong, debugfs_ulong_get, debugfs_ulong_set, "%llu\n"); -DEFINE_SIMPLE_ATTRIBUTE(fops_ulong_ro, debugfs_ulong_get, NULL, "%llu\n"); -DEFINE_SIMPLE_ATTRIBUTE(fops_ulong_wo, NULL, debugfs_ulong_set, "%llu\n"); +DEFINE_DEBUGFS_ATTRIBUTE(fops_ulong, debugfs_ulong_get, debugfs_ulong_set, + "%llu\n"); +DEFINE_DEBUGFS_ATTRIBUTE(fops_ulong_ro, debugfs_ulong_get, NULL, "%llu\n"); +DEFINE_DEBUGFS_ATTRIBUTE(fops_ulong_wo, NULL, debugfs_ulong_set, "%llu\n"); /** * debugfs_create_ulong - create a debugfs file that is used to read and write @@ -286,26 +560,30 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_ulong_wo, NULL, debugfs_ulong_set, "%llu\n"); struct dentry *debugfs_create_ulong(const char *name, umode_t mode, struct dentry *parent, unsigned long *value) { - return debugfs_create_mode(name, mode, parent, value, &fops_ulong, - &fops_ulong_ro, &fops_ulong_wo); + return debugfs_create_mode_unsafe(name, mode, parent, value, + &fops_ulong, &fops_ulong_ro, + &fops_ulong_wo); } EXPORT_SYMBOL_GPL(debugfs_create_ulong); -DEFINE_SIMPLE_ATTRIBUTE(fops_x8, debugfs_u8_get, debugfs_u8_set, "0x%02llx\n"); -DEFINE_SIMPLE_ATTRIBUTE(fops_x8_ro, debugfs_u8_get, NULL, "0x%02llx\n"); -DEFINE_SIMPLE_ATTRIBUTE(fops_x8_wo, NULL, debugfs_u8_set, "0x%02llx\n"); +DEFINE_DEBUGFS_ATTRIBUTE(fops_x8, debugfs_u8_get, debugfs_u8_set, "0x%02llx\n"); +DEFINE_DEBUGFS_ATTRIBUTE(fops_x8_ro, debugfs_u8_get, NULL, "0x%02llx\n"); +DEFINE_DEBUGFS_ATTRIBUTE(fops_x8_wo, NULL, debugfs_u8_set, "0x%02llx\n"); -DEFINE_SIMPLE_ATTRIBUTE(fops_x16, debugfs_u16_get, debugfs_u16_set, "0x%04llx\n"); -DEFINE_SIMPLE_ATTRIBUTE(fops_x16_ro, debugfs_u16_get, NULL, "0x%04llx\n"); -DEFINE_SIMPLE_ATTRIBUTE(fops_x16_wo, NULL, debugfs_u16_set, "0x%04llx\n"); +DEFINE_DEBUGFS_ATTRIBUTE(fops_x16, debugfs_u16_get, debugfs_u16_set, + "0x%04llx\n"); +DEFINE_DEBUGFS_ATTRIBUTE(fops_x16_ro, debugfs_u16_get, NULL, "0x%04llx\n"); +DEFINE_DEBUGFS_ATTRIBUTE(fops_x16_wo, NULL, debugfs_u16_set, "0x%04llx\n"); -DEFINE_SIMPLE_ATTRIBUTE(fops_x32, debugfs_u32_get, debugfs_u32_set, "0x%08llx\n"); -DEFINE_SIMPLE_ATTRIBUTE(fops_x32_ro, debugfs_u32_get, NULL, "0x%08llx\n"); -DEFINE_SIMPLE_ATTRIBUTE(fops_x32_wo, NULL, debugfs_u32_set, "0x%08llx\n"); +DEFINE_DEBUGFS_ATTRIBUTE(fops_x32, debugfs_u32_get, debugfs_u32_set, + "0x%08llx\n"); +DEFINE_DEBUGFS_ATTRIBUTE(fops_x32_ro, debugfs_u32_get, NULL, "0x%08llx\n"); +DEFINE_DEBUGFS_ATTRIBUTE(fops_x32_wo, NULL, debugfs_u32_set, "0x%08llx\n"); -DEFINE_SIMPLE_ATTRIBUTE(fops_x64, debugfs_u64_get, debugfs_u64_set, "0x%016llx\n"); -DEFINE_SIMPLE_ATTRIBUTE(fops_x64_ro, debugfs_u64_get, NULL, "0x%016llx\n"); -DEFINE_SIMPLE_ATTRIBUTE(fops_x64_wo, NULL, debugfs_u64_set, "0x%016llx\n"); 
+DEFINE_DEBUGFS_ATTRIBUTE(fops_x64, debugfs_u64_get, debugfs_u64_set, + "0x%016llx\n"); +DEFINE_DEBUGFS_ATTRIBUTE(fops_x64_ro, debugfs_u64_get, NULL, "0x%016llx\n"); +DEFINE_DEBUGFS_ATTRIBUTE(fops_x64_wo, NULL, debugfs_u64_set, "0x%016llx\n"); /* * debugfs_create_x{8,16,32,64} - create a debugfs file that is used to read and write an unsigned {8,16,32,64}-bit value @@ -328,7 +606,7 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_x64_wo, NULL, debugfs_u64_set, "0x%016llx\n"); struct dentry *debugfs_create_x8(const char *name, umode_t mode, struct dentry *parent, u8 *value) { - return debugfs_create_mode(name, mode, parent, value, &fops_x8, + return debugfs_create_mode_unsafe(name, mode, parent, value, &fops_x8, &fops_x8_ro, &fops_x8_wo); } EXPORT_SYMBOL_GPL(debugfs_create_x8); @@ -346,7 +624,7 @@ EXPORT_SYMBOL_GPL(debugfs_create_x8); struct dentry *debugfs_create_x16(const char *name, umode_t mode, struct dentry *parent, u16 *value) { - return debugfs_create_mode(name, mode, parent, value, &fops_x16, + return debugfs_create_mode_unsafe(name, mode, parent, value, &fops_x16, &fops_x16_ro, &fops_x16_wo); } EXPORT_SYMBOL_GPL(debugfs_create_x16); @@ -364,7 +642,7 @@ EXPORT_SYMBOL_GPL(debugfs_create_x16); struct dentry *debugfs_create_x32(const char *name, umode_t mode, struct dentry *parent, u32 *value) { - return debugfs_create_mode(name, mode, parent, value, &fops_x32, + return debugfs_create_mode_unsafe(name, mode, parent, value, &fops_x32, &fops_x32_ro, &fops_x32_wo); } EXPORT_SYMBOL_GPL(debugfs_create_x32); @@ -382,7 +660,7 @@ EXPORT_SYMBOL_GPL(debugfs_create_x32); struct dentry *debugfs_create_x64(const char *name, umode_t mode, struct dentry *parent, u64 *value) { - return debugfs_create_mode(name, mode, parent, value, &fops_x64, + return debugfs_create_mode_unsafe(name, mode, parent, value, &fops_x64, &fops_x64_ro, &fops_x64_wo); } EXPORT_SYMBOL_GPL(debugfs_create_x64); @@ -398,10 +676,10 @@ static int debugfs_size_t_get(void *data, u64 *val) *val = *(size_t *)data; return 0; } -DEFINE_SIMPLE_ATTRIBUTE(fops_size_t, debugfs_size_t_get, debugfs_size_t_set, - "%llu\n"); /* %llu and %zu are more or less the same */ -DEFINE_SIMPLE_ATTRIBUTE(fops_size_t_ro, debugfs_size_t_get, NULL, "%llu\n"); -DEFINE_SIMPLE_ATTRIBUTE(fops_size_t_wo, NULL, debugfs_size_t_set, "%llu\n"); +DEFINE_DEBUGFS_ATTRIBUTE(fops_size_t, debugfs_size_t_get, debugfs_size_t_set, + "%llu\n"); /* %llu and %zu are more or less the same */ +DEFINE_DEBUGFS_ATTRIBUTE(fops_size_t_ro, debugfs_size_t_get, NULL, "%llu\n"); +DEFINE_DEBUGFS_ATTRIBUTE(fops_size_t_wo, NULL, debugfs_size_t_set, "%llu\n"); /** * debugfs_create_size_t - create a debugfs file that is used to read and write an size_t value @@ -416,8 +694,9 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_size_t_wo, NULL, debugfs_size_t_set, "%llu\n"); struct dentry *debugfs_create_size_t(const char *name, umode_t mode, struct dentry *parent, size_t *value) { - return debugfs_create_mode(name, mode, parent, value, &fops_size_t, - &fops_size_t_ro, &fops_size_t_wo); + return debugfs_create_mode_unsafe(name, mode, parent, value, + &fops_size_t, &fops_size_t_ro, + &fops_size_t_wo); } EXPORT_SYMBOL_GPL(debugfs_create_size_t); @@ -431,10 +710,12 @@ static int debugfs_atomic_t_get(void *data, u64 *val) *val = atomic_read((atomic_t *)data); return 0; } -DEFINE_SIMPLE_ATTRIBUTE(fops_atomic_t, debugfs_atomic_t_get, +DEFINE_DEBUGFS_ATTRIBUTE(fops_atomic_t, debugfs_atomic_t_get, debugfs_atomic_t_set, "%lld\n"); -DEFINE_SIMPLE_ATTRIBUTE(fops_atomic_t_ro, debugfs_atomic_t_get, NULL, "%lld\n"); 
-DEFINE_SIMPLE_ATTRIBUTE(fops_atomic_t_wo, NULL, debugfs_atomic_t_set, "%lld\n"); +DEFINE_DEBUGFS_ATTRIBUTE(fops_atomic_t_ro, debugfs_atomic_t_get, NULL, + "%lld\n"); +DEFINE_DEBUGFS_ATTRIBUTE(fops_atomic_t_wo, NULL, debugfs_atomic_t_set, + "%lld\n"); /** * debugfs_create_atomic_t - create a debugfs file that is used to read and @@ -450,8 +731,9 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_atomic_t_wo, NULL, debugfs_atomic_t_set, "%lld\n"); struct dentry *debugfs_create_atomic_t(const char *name, umode_t mode, struct dentry *parent, atomic_t *value) { - return debugfs_create_mode(name, mode, parent, value, &fops_atomic_t, - &fops_atomic_t_ro, &fops_atomic_t_wo); + return debugfs_create_mode_unsafe(name, mode, parent, value, + &fops_atomic_t, &fops_atomic_t_ro, + &fops_atomic_t_wo); } EXPORT_SYMBOL_GPL(debugfs_create_atomic_t); @@ -459,9 +741,17 @@ ssize_t debugfs_read_file_bool(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { char buf[3]; - bool *val = file->private_data; + bool val; + int r, srcu_idx; - if (*val) + r = debugfs_use_file_start(F_DENTRY(file), &srcu_idx); + if (likely(!r)) + val = *(bool *)file->private_data; + debugfs_use_file_finish(srcu_idx); + if (r) + return r; + + if (val) buf[0] = 'Y'; else buf[0] = 'N'; @@ -477,6 +767,7 @@ ssize_t debugfs_write_file_bool(struct file *file, const char __user *user_buf, char buf[32]; size_t buf_size; bool bv; + int r, srcu_idx; bool *val = file->private_data; buf_size = min(count, (sizeof(buf)-1)); @@ -484,8 +775,14 @@ ssize_t debugfs_write_file_bool(struct file *file, const char __user *user_buf, return -EFAULT; buf[buf_size] = '\0'; - if (strtobool(buf, &bv) == 0) - *val = bv; + if (strtobool(buf, &bv) == 0) { + r = debugfs_use_file_start(F_DENTRY(file), &srcu_idx); + if (likely(!r)) + *val = bv; + debugfs_use_file_finish(srcu_idx); + if (r) + return r; + } return count; } @@ -537,7 +834,7 @@ static const struct file_operations fops_bool_wo = { struct dentry *debugfs_create_bool(const char *name, umode_t mode, struct dentry *parent, bool *value) { - return debugfs_create_mode(name, mode, parent, value, &fops_bool, + return debugfs_create_mode_unsafe(name, mode, parent, value, &fops_bool, &fops_bool_ro, &fops_bool_wo); } EXPORT_SYMBOL_GPL(debugfs_create_bool); @@ -546,8 +843,15 @@ static ssize_t read_file_blob(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { struct debugfs_blob_wrapper *blob = file->private_data; - return simple_read_from_buffer(user_buf, count, ppos, blob->data, - blob->size); + ssize_t r; + int srcu_idx; + + r = debugfs_use_file_start(F_DENTRY(file), &srcu_idx); + if (likely(!r)) + r = simple_read_from_buffer(user_buf, count, ppos, blob->data, + blob->size); + debugfs_use_file_finish(srcu_idx); + return r; } static const struct file_operations fops_blob = { @@ -584,7 +888,7 @@ struct dentry *debugfs_create_blob(const char *name, umode_t mode, struct dentry *parent, struct debugfs_blob_wrapper *blob) { - return debugfs_create_file(name, mode, parent, blob, &fops_blob); + return debugfs_create_file_unsafe(name, mode, parent, blob, &fops_blob); } EXPORT_SYMBOL_GPL(debugfs_create_blob); @@ -689,7 +993,8 @@ struct dentry *debugfs_create_u32_array(const char *name, umode_t mode, data->array = array; data->elements = elements; - return debugfs_create_file(name, mode, parent, data, &u32_array_fops); + return debugfs_create_file_unsafe(name, mode, parent, data, + &u32_array_fops); } EXPORT_SYMBOL_GPL(debugfs_create_u32_array); diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 
8580831ed237..72361baf9da7 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -27,9 +27,14 @@ #include <linux/parser.h> #include <linux/magic.h> #include <linux/slab.h> +#include <linux/srcu.h> + +#include "internal.h" #define DEBUGFS_DEFAULT_MODE 0700 +DEFINE_SRCU(debugfs_srcu); + static struct vfsmount *debugfs_mount; static int debugfs_mount_count; static bool debugfs_registered; @@ -39,7 +44,8 @@ static struct inode *debugfs_get_inode(struct super_block *sb) struct inode *inode = new_inode(sb); if (inode) { inode->i_ino = get_next_ino(); - inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_atime = inode->i_mtime = + inode->i_ctime = current_fs_time(sb); } return inode; } @@ -294,6 +300,37 @@ static struct dentry *end_creating(struct dentry *dentry) return dentry; } +static struct dentry *__debugfs_create_file(const char *name, umode_t mode, + struct dentry *parent, void *data, + const struct file_operations *proxy_fops, + const struct file_operations *real_fops) +{ + struct dentry *dentry; + struct inode *inode; + + if (!(mode & S_IFMT)) + mode |= S_IFREG; + BUG_ON(!S_ISREG(mode)); + dentry = start_creating(name, parent); + + if (IS_ERR(dentry)) + return NULL; + + inode = debugfs_get_inode(dentry->d_sb); + if (unlikely(!inode)) + return failed_creating(dentry); + + inode->i_mode = mode; + inode->i_private = data; + + inode->i_fop = proxy_fops; + dentry->d_fsdata = (void *)real_fops; + + d_instantiate(dentry, inode); + fsnotify_create(d_inode(dentry->d_parent), dentry); + return end_creating(dentry); +} + /** * debugfs_create_file - create a file in the debugfs filesystem * @name: a pointer to a string containing the name of the file to create. @@ -324,29 +361,52 @@ struct dentry *debugfs_create_file(const char *name, umode_t mode, struct dentry *parent, void *data, const struct file_operations *fops) { - struct dentry *dentry; - struct inode *inode; - if (!(mode & S_IFMT)) - mode |= S_IFREG; - BUG_ON(!S_ISREG(mode)); - dentry = start_creating(name, parent); - - if (IS_ERR(dentry)) - return NULL; + return __debugfs_create_file(name, mode, parent, data, + fops ? &debugfs_full_proxy_file_operations : + &debugfs_noop_file_operations, + fops); +} +EXPORT_SYMBOL_GPL(debugfs_create_file); - inode = debugfs_get_inode(dentry->d_sb); - if (unlikely(!inode)) - return failed_creating(dentry); +/** + * debugfs_create_file_unsafe - create a file in the debugfs filesystem + * @name: a pointer to a string containing the name of the file to create. + * @mode: the permission that the file should have. + * @parent: a pointer to the parent dentry for this file. This should be a + * directory dentry if set. If this parameter is NULL, then the + * file will be created in the root of the debugfs filesystem. + * @data: a pointer to something that the caller will want to get to later + * on. The inode.i_private pointer will point to this value on + * the open() call. + * @fops: a pointer to a struct file_operations that should be used for + * this file. + * + * debugfs_create_file_unsafe() is completely analogous to + * debugfs_create_file(), the only difference being that the fops + * handed it will not get protected against file removals by the + * debugfs core. + * + * It is your responsibility to protect your struct file_operation + * methods against file removals by means of debugfs_use_file_start() + * and debugfs_use_file_finish(). ->open() is still protected by + * debugfs though. 
+ * + * Any struct file_operations defined by means of + * DEFINE_DEBUGFS_ATTRIBUTE() is protected against file removals and + * thus, may be used here. + */ +struct dentry *debugfs_create_file_unsafe(const char *name, umode_t mode, + struct dentry *parent, void *data, + const struct file_operations *fops) +{ - inode->i_mode = mode; - inode->i_fop = fops ? fops : &debugfs_file_operations; - inode->i_private = data; - d_instantiate(dentry, inode); - fsnotify_create(d_inode(dentry->d_parent), dentry); - return end_creating(dentry); + return __debugfs_create_file(name, mode, parent, data, + fops ? &debugfs_open_proxy_file_operations : + &debugfs_noop_file_operations, + fops); } -EXPORT_SYMBOL_GPL(debugfs_create_file); +EXPORT_SYMBOL_GPL(debugfs_create_file_unsafe); /** * debugfs_create_file_size - create a file in the debugfs filesystem @@ -461,7 +521,11 @@ struct dentry *debugfs_create_automount(const char *name, inode->i_flags |= S_AUTOMOUNT; inode->i_private = data; dentry->d_fsdata = (void *)f; + /* directory inodes start off with i_nlink == 2 (for "." entry) */ + inc_nlink(inode); d_instantiate(dentry, inode); + inc_nlink(d_inode(dentry->d_parent)); + fsnotify_mkdir(d_inode(dentry->d_parent), dentry); return end_creating(dentry); } EXPORT_SYMBOL(debugfs_create_automount); @@ -557,14 +621,13 @@ void debugfs_remove(struct dentry *dentry) return; parent = dentry->d_parent; - if (!parent || d_really_is_negative(parent)) - return; - inode_lock(d_inode(parent)); ret = __debugfs_remove(dentry, parent); inode_unlock(d_inode(parent)); if (!ret) simple_release_fs(&debugfs_mount, &debugfs_mount_count); + + synchronize_srcu(&debugfs_srcu); } EXPORT_SYMBOL_GPL(debugfs_remove); @@ -588,10 +651,6 @@ void debugfs_remove_recursive(struct dentry *dentry) if (IS_ERR_OR_NULL(dentry)) return; - parent = dentry->d_parent; - if (!parent || d_really_is_negative(parent)) - return; - parent = dentry; down: inode_lock(d_inode(parent)); @@ -642,6 +701,8 @@ void debugfs_remove_recursive(struct dentry *dentry) if (!__debugfs_remove(child, parent)) simple_release_fs(&debugfs_mount, &debugfs_mount_count); inode_unlock(d_inode(parent)); + + synchronize_srcu(&debugfs_srcu); } EXPORT_SYMBOL_GPL(debugfs_remove_recursive); diff --git a/fs/debugfs/internal.h b/fs/debugfs/internal.h new file mode 100644 index 000000000000..bba52634b995 --- /dev/null +++ b/fs/debugfs/internal.h @@ -0,0 +1,26 @@ +/* + * internal.h - declarations internal to debugfs + * + * Copyright (C) 2016 Nicolai Stange <nicstange@gmail.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. 
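With DEFINE_SRCU(debugfs_srcu) above, removal becomes an SRCU grace-period handshake: debugfs_remove() unhashes the dentry and then synchronize_srcu() waits until every reader that entered its debugfs_use_file_start() section before the unhash has left it. Schematically, the two sides pair up as follows (a sketch of calls already shown in this patch, not new API):

	/* reader side (debugfs_use_file_start/finish) */
	idx = srcu_read_lock(&debugfs_srcu);
	if (!d_unlinked(dentry)) {
		/* file data may be accessed safely here */
	}
	srcu_read_unlock(&debugfs_srcu, idx);

	/* removal side (debugfs_remove) */
	__debugfs_remove(dentry, parent);	/* unhash; new readers get -EIO */
	synchronize_srcu(&debugfs_srcu);	/* wait out pre-existing readers */
	/* nothing can still be inside a protected fop on this file now */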
+ * + */ + +#ifndef _DEBUGFS_INTERNAL_H_ +#define _DEBUGFS_INTERNAL_H_ + +struct file_operations; + +/* declared over in file.c */ +extern const struct file_operations debugfs_noop_file_operations; +extern const struct file_operations debugfs_open_proxy_file_operations; +extern const struct file_operations debugfs_full_proxy_file_operations; + +struct dentry *debugfs_create_file_unsafe(const char *name, umode_t mode, + struct dentry *parent, void *data, + const struct file_operations *fops); + +#endif /* _DEBUGFS_INTERNAL_H_ */ diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 0b2954d7172d..37c134a132c7 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -95,8 +95,6 @@ static struct ctl_table pty_root_table[] = { static DEFINE_MUTEX(allocated_ptys_lock); -static struct vfsmount *devpts_mnt; - struct pts_mount_opts { int setuid; int setgid; @@ -104,7 +102,7 @@ struct pts_mount_opts { kgid_t gid; umode_t mode; umode_t ptmxmode; - int newinstance; + int reserve; int max; }; @@ -117,11 +115,9 @@ static const match_table_t tokens = { {Opt_uid, "uid=%u"}, {Opt_gid, "gid=%u"}, {Opt_mode, "mode=%o"}, -#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES {Opt_ptmxmode, "ptmxmode=%o"}, {Opt_newinstance, "newinstance"}, {Opt_max, "max=%d"}, -#endif {Opt_err, NULL} }; @@ -137,15 +133,48 @@ static inline struct pts_fs_info *DEVPTS_SB(struct super_block *sb) return sb->s_fs_info; } -static inline struct super_block *pts_sb_from_inode(struct inode *inode) +struct pts_fs_info *devpts_acquire(struct file *filp) { -#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES - if (inode->i_sb->s_magic == DEVPTS_SUPER_MAGIC) - return inode->i_sb; -#endif - if (!devpts_mnt) - return NULL; - return devpts_mnt->mnt_sb; + struct pts_fs_info *result; + struct path path; + struct super_block *sb; + int err; + + path = filp->f_path; + path_get(&path); + + /* Has the devpts filesystem already been found? */ + sb = path.mnt->mnt_sb; + if (sb->s_magic != DEVPTS_SUPER_MAGIC) { + /* Is a devpts filesystem at "pts" in the same directory? */ + err = path_pts(&path); + if (err) { + result = ERR_PTR(err); + goto out; + } + + /* Is the path the root of a devpts filesystem? */ + result = ERR_PTR(-ENODEV); + sb = path.mnt->mnt_sb; + if ((sb->s_magic != DEVPTS_SUPER_MAGIC) || + (path.mnt->mnt_root != sb->s_root)) + goto out; + } + + /* + * pty code needs to hold extra references in case of last /dev/tty close + */ + atomic_inc(&sb->s_active); + result = DEVPTS_SB(sb); + +out: + path_put(&path); + return result; +} + +void devpts_release(struct pts_fs_info *fsi) +{ + deactivate_super(fsi->sb); } #define PARSE_MOUNT 0 @@ -154,9 +183,7 @@ static inline struct super_block *pts_sb_from_inode(struct inode *inode) /* * parse_mount_options(): * Set @opts to mount options specified in @data. If an option is not - * specified in @data, set it to its default value. The exception is - * 'newinstance' option which can only be set/cleared on a mount (i.e. - * cannot be changed during remount). + * specified in @data, set it to its default value. * * Note: @data may be NULL (in which case all options are set to default). */ @@ -174,9 +201,12 @@ static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts) opts->ptmxmode = DEVPTS_DEFAULT_PTMX_MODE; opts->max = NR_UNIX98_PTY_MAX; - /* newinstance makes sense only on initial mount */ + /* Only allow instances mounted from the initial mount + * namespace to tap the reserve pool of ptys. 
+ */ if (op == PARSE_MOUNT) - opts->newinstance = 0; + opts->reserve = + (current->nsproxy->mnt_ns == init_task.nsproxy->mnt_ns); while ((p = strsep(&data, ",")) != NULL) { substring_t args[MAX_OPT_ARGS]; @@ -211,16 +241,12 @@ static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts) return -EINVAL; opts->mode = option & S_IALLUGO; break; -#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES case Opt_ptmxmode: if (match_octal(&args[0], &option)) return -EINVAL; opts->ptmxmode = option & S_IALLUGO; break; case Opt_newinstance: - /* newinstance makes sense only on initial mount */ - if (op == PARSE_MOUNT) - opts->newinstance = 1; break; case Opt_max: if (match_int(&args[0], &option) || @@ -228,7 +254,6 @@ static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts) return -EINVAL; opts->max = option; break; -#endif default: pr_err("called with bogus options\n"); return -EINVAL; @@ -238,7 +263,6 @@ static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts) return 0; } -#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES static int mknod_ptmx(struct super_block *sb) { int mode; @@ -305,12 +329,6 @@ static void update_ptmx_mode(struct pts_fs_info *fsi) inode->i_mode = S_IFCHR|fsi->mount_opts.ptmxmode; } } -#else -static inline void update_ptmx_mode(struct pts_fs_info *fsi) -{ - return; -} -#endif static int devpts_remount(struct super_block *sb, int *flags, char *data) { @@ -344,11 +362,9 @@ static int devpts_show_options(struct seq_file *seq, struct dentry *root) seq_printf(seq, ",gid=%u", from_kgid_munged(&init_user_ns, opts->gid)); seq_printf(seq, ",mode=%03o", opts->mode); -#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES seq_printf(seq, ",ptmxmode=%03o", opts->ptmxmode); if (opts->max < NR_UNIX98_PTY_MAX) seq_printf(seq, ",max=%d", opts->max); -#endif return 0; } @@ -410,40 +426,11 @@ fail: return -ENOMEM; } -#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES -static int compare_init_pts_sb(struct super_block *s, void *p) -{ - if (devpts_mnt) - return devpts_mnt->mnt_sb == s; - return 0; -} - /* * devpts_mount() * - * If the '-o newinstance' mount option was specified, mount a new - * (private) instance of devpts. PTYs created in this instance are - * independent of the PTYs in other devpts instances. - * - * If the '-o newinstance' option was not specified, mount/remount the - * initial kernel mount of devpts. This type of mount gives the - * legacy, single-instance semantics. - * - * The 'newinstance' option is needed to support multiple namespace - * semantics in devpts while preserving backward compatibility of the - * current 'single-namespace' semantics. i.e all mounts of devpts - * without the 'newinstance' mount option should bind to the initial - * kernel mount, like mount_single(). - * - * Mounts with 'newinstance' option create a new, private namespace. - * - * NOTE: - * - * For single-mount semantics, devpts cannot use mount_single(), - * because mount_single()/sget() find and use the super-block from - * the most recent mount of devpts. But that recent mount may be a - * 'newinstance' mount and mount_single() would pick the newinstance - * super-block instead of the initial super-block. + * Mount a new (private) instance of devpts. PTYs created in this + * instance are independent of the PTYs in other devpts instances. 
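devpts_get_ref()/devpts_put_ref() (deleted further down) are superseded by devpts_acquire()/devpts_release() above, which resolve the devpts instance from the opener's own path rather than from a global mount. The caller side, roughly as the pty driver would use it (error handling and object lifetime abbreviated; this is a sketch, not the actual tty code):

	struct pts_fs_info *fsi;
	int index;

	fsi = devpts_acquire(filp);	/* takes an s_active reference */
	if (IS_ERR(fsi))
		return PTR_ERR(fsi);

	index = devpts_new_index(fsi);
	if (index >= 0) {
		/* ... devpts_pty_new(fsi, index, priv) creates the slave node ... */
	}

	devpts_release(fsi);		/* drops the superblock reference */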
*/ static struct dentry *devpts_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) @@ -456,18 +443,7 @@ static struct dentry *devpts_mount(struct file_system_type *fs_type, if (error) return ERR_PTR(error); - /* Require newinstance for all user namespace mounts to ensure - * the mount options are not changed. - */ - if ((current_user_ns() != &init_user_ns) && !opts.newinstance) - return ERR_PTR(-EINVAL); - - if (opts.newinstance) - s = sget(fs_type, NULL, set_anon_super, flags, NULL); - else - s = sget(fs_type, compare_init_pts_sb, set_anon_super, flags, - NULL); - + s = sget(fs_type, NULL, set_anon_super, flags, NULL); if (IS_ERR(s)) return ERR_CAST(s); @@ -491,18 +467,6 @@ out_undo_sget: return ERR_PTR(error); } -#else -/* - * This supports only the legacy single-instance semantics (no - * multiple-instance semantics) - */ -static struct dentry *devpts_mount(struct file_system_type *fs_type, int flags, - const char *dev_name, void *data) -{ - return mount_single(fs_type, flags, data, devpts_fill_super); -} -#endif - static void devpts_kill_sb(struct super_block *sb) { struct pts_fs_info *fsi = DEVPTS_SB(sb); @@ -516,9 +480,7 @@ static struct file_system_type devpts_fs_type = { .name = "devpts", .mount = devpts_mount, .kill_sb = devpts_kill_sb, -#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES .fs_flags = FS_USERNS_MOUNT | FS_USERNS_DEV_MOUNT, -#endif }; /* @@ -531,16 +493,13 @@ int devpts_new_index(struct pts_fs_info *fsi) int index; int ida_ret; - if (!fsi) - return -ENODEV; - retry: if (!ida_pre_get(&fsi->allocated_ptys, GFP_KERNEL)) return -ENOMEM; mutex_lock(&allocated_ptys_lock); - if (pty_count >= pty_limit - - (fsi->mount_opts.newinstance ? pty_reserve : 0)) { + if (pty_count >= (pty_limit - + (fsi->mount_opts.reserve ? 
0 : pty_reserve))) { mutex_unlock(&allocated_ptys_lock); return -ENOSPC; } @@ -571,30 +530,6 @@ void devpts_kill_index(struct pts_fs_info *fsi, int idx) mutex_unlock(&allocated_ptys_lock); } -/* - * pty code needs to hold extra references in case of last /dev/tty close - */ -struct pts_fs_info *devpts_get_ref(struct inode *ptmx_inode, struct file *file) -{ - struct super_block *sb; - struct pts_fs_info *fsi; - - sb = pts_sb_from_inode(ptmx_inode); - if (!sb) - return NULL; - fsi = DEVPTS_SB(sb); - if (!fsi) - return NULL; - - atomic_inc(&sb->s_active); - return fsi; -} - -void devpts_put_ref(struct pts_fs_info *fsi) -{ - deactivate_super(fsi->sb); -} - /** * devpts_pty_new -- create a new inode in /dev/pts/ * @ptmx_inode: inode of the master @@ -607,16 +542,12 @@ void devpts_put_ref(struct pts_fs_info *fsi) struct dentry *devpts_pty_new(struct pts_fs_info *fsi, int index, void *priv) { struct dentry *dentry; - struct super_block *sb; + struct super_block *sb = fsi->sb; struct inode *inode; struct dentry *root; struct pts_mount_opts *opts; char s[12]; - if (!fsi) - return ERR_PTR(-ENODEV); - - sb = fsi->sb; root = sb->s_root; opts = &fsi->mount_opts; @@ -676,20 +607,8 @@ void devpts_pty_kill(struct dentry *dentry) static int __init init_devpts_fs(void) { int err = register_filesystem(&devpts_fs_type); - struct ctl_table_header *table; - if (!err) { - struct vfsmount *mnt; - - table = register_sysctl_table(pty_root_table); - mnt = kern_mount(&devpts_fs_type); - if (IS_ERR(mnt)) { - err = PTR_ERR(mnt); - unregister_filesystem(&devpts_fs_type); - unregister_sysctl_table(table); - } else { - devpts_mnt = mnt; - } + register_sysctl_table(pty_root_table); } return err; } diff --git a/fs/direct-io.c b/fs/direct-io.c index 3bf3f20f8ecc..7c3ce73cb617 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -108,7 +108,8 @@ struct dio_submit { /* dio_state communicated between submission path and end_io */ struct dio { int flags; /* doesn't change */ - int rw; + int op; + int op_flags; blk_qc_t bio_cookie; struct block_device *bio_bdev; struct inode *inode; @@ -163,7 +164,7 @@ static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio) ret = iov_iter_get_pages(sdio->iter, dio->pages, LONG_MAX, DIO_PAGES, &sdio->from); - if (ret < 0 && sdio->blocks_available && (dio->rw & WRITE)) { + if (ret < 0 && sdio->blocks_available && (dio->op == REQ_OP_WRITE)) { struct page *page = ZERO_PAGE(0); /* * A memory fault, but the filesystem has some outstanding @@ -242,7 +243,8 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async) transferred = dio->result; /* Check for short read case */ - if ((dio->rw == READ) && ((offset + transferred) > dio->i_size)) + if ((dio->op == REQ_OP_READ) && + ((offset + transferred) > dio->i_size)) transferred = dio->i_size - offset; } @@ -273,7 +275,7 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async) */ dio->iocb->ki_pos += transferred; - if (dio->rw & WRITE) + if (dio->op == REQ_OP_WRITE) ret = generic_write_sync(dio->iocb, transferred); dio->iocb->ki_complete(dio->iocb, ret, 0); } @@ -375,6 +377,7 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio, bio->bi_bdev = bdev; bio->bi_iter.bi_sector = first_sector; + bio_set_op_attrs(bio, dio->op, dio->op_flags); if (dio->is_async) bio->bi_end_io = dio_bio_end_aio; else @@ -402,17 +405,16 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio) dio->refcount++; spin_unlock_irqrestore(&dio->bio_lock, flags); - if (dio->is_async && dio->rw == READ && 
dio->should_dirty) + if (dio->is_async && dio->op == REQ_OP_READ && dio->should_dirty) bio_set_pages_dirty(bio); dio->bio_bdev = bio->bi_bdev; if (sdio->submit_io) { - sdio->submit_io(dio->rw, bio, dio->inode, - sdio->logical_offset_in_bio); + sdio->submit_io(bio, dio->inode, sdio->logical_offset_in_bio); dio->bio_cookie = BLK_QC_T_NONE; } else - dio->bio_cookie = submit_bio(dio->rw, bio); + dio->bio_cookie = submit_bio(bio); sdio->bio = NULL; sdio->boundary = 0; @@ -478,14 +480,14 @@ static int dio_bio_complete(struct dio *dio, struct bio *bio) if (bio->bi_error) dio->io_error = -EIO; - if (dio->is_async && dio->rw == READ && dio->should_dirty) { + if (dio->is_async && dio->op == REQ_OP_READ && dio->should_dirty) { err = bio->bi_error; bio_check_pages_dirty(bio); /* transfers ownership */ } else { bio_for_each_segment_all(bvec, bio, i) { struct page *page = bvec->bv_page; - if (dio->rw == READ && !PageCompound(page) && + if (dio->op == REQ_OP_READ && !PageCompound(page) && dio->should_dirty) set_page_dirty_lock(page); put_page(page); @@ -628,20 +630,20 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio, map_bh->b_size = fs_count << i_blkbits; /* - * For writes inside i_size on a DIO_SKIP_HOLES filesystem we - * forbid block creations: only overwrites are permitted. - * We will return early to the caller once we see an - * unmapped buffer head returned, and the caller will fall - * back to buffered I/O. + * For writes that could fill holes inside i_size on a + * DIO_SKIP_HOLES filesystem we forbid block creations: only + * overwrites are permitted. We will return early to the caller + * once we see an unmapped buffer head returned, and the caller + * will fall back to buffered I/O. * * Otherwise the decision is left to the get_blocks method, * which may decide to handle it or also return an unmapped * buffer head. */ - create = dio->rw & WRITE; + create = dio->op == REQ_OP_WRITE; if (dio->flags & DIO_SKIP_HOLES) { - if (sdio->block_in_file < (i_size_read(dio->inode) >> - sdio->blkbits)) + if (fs_startblk <= ((i_size_read(dio->inode) - 1) >> + i_blkbits)) create = 0; } @@ -788,7 +790,7 @@ submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page, { int ret = 0; - if (dio->rw & WRITE) { + if (dio->op == REQ_OP_WRITE) { /* * Read accounting is performed in submit_bio() */ @@ -988,7 +990,7 @@ do_holes: loff_t i_size_aligned; /* AKPM: eargh, -ENOTBLK is a hack */ - if (dio->rw & WRITE) { + if (dio->op == REQ_OP_WRITE) { put_page(page); return -ENOTBLK; } @@ -1202,7 +1204,12 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, dio->is_async = true; dio->inode = inode; - dio->rw = iov_iter_rw(iter) == WRITE ? 
WRITE_ODIRECT : READ; + if (iov_iter_rw(iter) == WRITE) { + dio->op = REQ_OP_WRITE; + dio->op_flags = WRITE_ODIRECT; + } else { + dio->op = REQ_OP_READ; + } /* * For AIO O_(D)SYNC writes we need to defer completions to a workqueue diff --git a/fs/dlm/config.c b/fs/dlm/config.c index 1669f6291c95..df955d2209ce 100644 --- a/fs/dlm/config.c +++ b/fs/dlm/config.c @@ -73,6 +73,7 @@ struct dlm_cluster { unsigned int cl_toss_secs; unsigned int cl_scan_secs; unsigned int cl_log_debug; + unsigned int cl_log_info; unsigned int cl_protocol; unsigned int cl_timewarn_cs; unsigned int cl_waitwarn_us; @@ -95,6 +96,7 @@ enum { CLUSTER_ATTR_TOSS_SECS, CLUSTER_ATTR_SCAN_SECS, CLUSTER_ATTR_LOG_DEBUG, + CLUSTER_ATTR_LOG_INFO, CLUSTER_ATTR_PROTOCOL, CLUSTER_ATTR_TIMEWARN_CS, CLUSTER_ATTR_WAITWARN_US, @@ -166,6 +168,7 @@ CLUSTER_ATTR(recover_timer, 1); CLUSTER_ATTR(toss_secs, 1); CLUSTER_ATTR(scan_secs, 1); CLUSTER_ATTR(log_debug, 0); +CLUSTER_ATTR(log_info, 0); CLUSTER_ATTR(protocol, 0); CLUSTER_ATTR(timewarn_cs, 1); CLUSTER_ATTR(waitwarn_us, 0); @@ -180,6 +183,7 @@ static struct configfs_attribute *cluster_attrs[] = { [CLUSTER_ATTR_TOSS_SECS] = &cluster_attr_toss_secs, [CLUSTER_ATTR_SCAN_SECS] = &cluster_attr_scan_secs, [CLUSTER_ATTR_LOG_DEBUG] = &cluster_attr_log_debug, + [CLUSTER_ATTR_LOG_INFO] = &cluster_attr_log_info, [CLUSTER_ATTR_PROTOCOL] = &cluster_attr_protocol, [CLUSTER_ATTR_TIMEWARN_CS] = &cluster_attr_timewarn_cs, [CLUSTER_ATTR_WAITWARN_US] = &cluster_attr_waitwarn_us, @@ -365,6 +369,7 @@ static struct config_group *make_cluster(struct config_group *g, cl->cl_toss_secs = dlm_config.ci_toss_secs; cl->cl_scan_secs = dlm_config.ci_scan_secs; cl->cl_log_debug = dlm_config.ci_log_debug; + cl->cl_log_info = dlm_config.ci_log_info; cl->cl_protocol = dlm_config.ci_protocol; cl->cl_timewarn_cs = dlm_config.ci_timewarn_cs; cl->cl_waitwarn_us = dlm_config.ci_waitwarn_us; @@ -850,6 +855,7 @@ int dlm_our_addr(struct sockaddr_storage *addr, int num) #define DEFAULT_TOSS_SECS 10 #define DEFAULT_SCAN_SECS 5 #define DEFAULT_LOG_DEBUG 0 +#define DEFAULT_LOG_INFO 1 #define DEFAULT_PROTOCOL 0 #define DEFAULT_TIMEWARN_CS 500 /* 5 sec = 500 centiseconds */ #define DEFAULT_WAITWARN_US 0 @@ -865,6 +871,7 @@ struct dlm_config_info dlm_config = { .ci_toss_secs = DEFAULT_TOSS_SECS, .ci_scan_secs = DEFAULT_SCAN_SECS, .ci_log_debug = DEFAULT_LOG_DEBUG, + .ci_log_info = DEFAULT_LOG_INFO, .ci_protocol = DEFAULT_PROTOCOL, .ci_timewarn_cs = DEFAULT_TIMEWARN_CS, .ci_waitwarn_us = DEFAULT_WAITWARN_US, diff --git a/fs/dlm/config.h b/fs/dlm/config.h index f30697bc2780..6041eec886ab 100644 --- a/fs/dlm/config.h +++ b/fs/dlm/config.h @@ -31,6 +31,7 @@ struct dlm_config_info { int ci_toss_secs; int ci_scan_secs; int ci_log_debug; + int ci_log_info; int ci_protocol; int ci_timewarn_cs; int ci_waitwarn_us; diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index 5eff6ea3e27f..216b61604ef9 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -65,8 +65,16 @@ struct dlm_mhandle; printk(KERN_ERR "dlm: "fmt"\n" , ##args) #define log_error(ls, fmt, args...) \ printk(KERN_ERR "dlm: %s: " fmt "\n", (ls)->ls_name , ##args) + #define log_rinfo(ls, fmt, args...) \ - printk(KERN_INFO "dlm: %s: " fmt "\n", (ls)->ls_name , ##args); +do { \ + if (dlm_config.ci_log_info) \ + printk(KERN_INFO "dlm: %s: " fmt "\n", \ + (ls)->ls_name, ##args); \ + else if (dlm_config.ci_log_debug) \ + printk(KERN_DEBUG "dlm: %s: " fmt "\n", \ + (ls)->ls_name , ##args); \ +} while (0) #define log_debug(ls, fmt, args...) 
\ do { \ diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 1ab012a27d9f..963016c8f3d1 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -1279,10 +1279,9 @@ static void init_local(void) if (dlm_our_addr(&sas, i)) break; - addr = kmalloc(sizeof(*addr), GFP_NOFS); + addr = kmemdup(&sas, sizeof(*addr), GFP_NOFS); if (!addr) break; - memcpy(addr, &sas, sizeof(*addr)); dlm_local_addr[dlm_local_count++] = addr; } } diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index ebd40f46ed4c..e5e29f8c920b 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -45,7 +45,7 @@ * ecryptfs_to_hex * @dst: Buffer to take hex character representation of contents of * src; must be at least of size (src_size * 2) - * @src: Buffer to be converted to a hex string respresentation + * @src: Buffer to be converted to a hex string representation * @src_size: number of bytes to convert */ void ecryptfs_to_hex(char *dst, char *src, size_t src_size) @@ -60,7 +60,7 @@ void ecryptfs_to_hex(char *dst, char *src, size_t src_size) * ecryptfs_from_hex * @dst: Buffer to take the bytes from src hex; must be at least of * size (src_size / 2) - * @src: Buffer to be converted from a hex string respresentation to raw value + * @src: Buffer to be converted from a hex string representation to raw value * @dst_size: size of dst buffer, or number of hex characters pairs to convert */ void ecryptfs_from_hex(char *dst, char *src, int dst_size) @@ -953,7 +953,7 @@ struct ecryptfs_cipher_code_str_map_elem { }; /* Add support for additional ciphers by adding elements here. The - * cipher_code is whatever OpenPGP applicatoins use to identify the + * cipher_code is whatever OpenPGP applications use to identify the * ciphers. List in order of probability. */ static struct ecryptfs_cipher_code_str_map_elem ecryptfs_cipher_code_str_map[] = { @@ -1141,12 +1141,13 @@ ecryptfs_write_metadata_to_contents(struct inode *ecryptfs_inode, static int ecryptfs_write_metadata_to_xattr(struct dentry *ecryptfs_dentry, + struct inode *ecryptfs_inode, char *page_virt, size_t size) { int rc; - rc = ecryptfs_setxattr(ecryptfs_dentry, ECRYPTFS_XATTR_NAME, page_virt, - size, 0); + rc = ecryptfs_setxattr(ecryptfs_dentry, ecryptfs_inode, + ECRYPTFS_XATTR_NAME, page_virt, size, 0); return rc; } @@ -1215,8 +1216,8 @@ int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry, goto out_free; } if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) - rc = ecryptfs_write_metadata_to_xattr(ecryptfs_dentry, virt, - size); + rc = ecryptfs_write_metadata_to_xattr(ecryptfs_dentry, ecryptfs_inode, + virt, size); else rc = ecryptfs_write_metadata_to_contents(ecryptfs_inode, virt, virt_len); @@ -1409,7 +1410,7 @@ int ecryptfs_read_and_validate_xattr_region(struct dentry *dentry, * * Common entry point for reading file metadata. From here, we could * retrieve the header information from the header region of the file, - * the xattr region of the file, or some other repostory that is + * the xattr region of the file, or some other repository that is * stored separately from the file itself. The current implementation * supports retrieving the metadata information from the file contents * and from the xattr region. 
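A recurring theme in the hunks above and below — ecryptfs here, and the ext2 xattr handlers further on — is the newer xattr ->set prototype, which passes the target inode to the handler explicitly instead of leaving it to derive one from the dentry. The following is a minimal sketch of a handler written against that prototype; demo_xattr_set() and the demo_xattr_store() backend are illustrative names, not kernel APIs:

#include <linux/fs.h>
#include <linux/xattr.h>

/* Hypothetical backend that persists the attribute for @inode. */
int demo_xattr_store(struct inode *inode, const char *name,
		     const void *value, size_t size, int flags);

static int demo_xattr_set(const struct xattr_handler *handler,
			  struct dentry *unused, struct inode *inode,
			  const char *name, const void *value,
			  size_t size, int flags)
{
	/* Operate on the inode passed in explicitly; the dentry may be
	 * unsuitable here, which is why the conversions in this series
	 * stop calling d_inode(dentry). */
	return demo_xattr_store(inode, name, value, size, flags);
}

static const struct xattr_handler demo_xattr_handler = {
	.prefix	= "user.",
	.set	= demo_xattr_set,
};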
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index 3ec495db7e82..4ba1547bb9ad 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -609,8 +609,8 @@ ssize_t ecryptfs_getxattr_lower(struct dentry *lower_dentry, struct inode *lower_inode, const char *name, void *value, size_t size); int -ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value, - size_t size, int flags); +ecryptfs_setxattr(struct dentry *dentry, struct inode *inode, const char *name, + const void *value, size_t size, int flags); int ecryptfs_read_xattr_region(char *page_virt, struct inode *ecryptfs_inode); #ifdef CONFIG_ECRYPT_FS_MESSAGING int ecryptfs_process_response(struct ecryptfs_daemon *daemon, diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index 7000b96b783e..ca4e83750214 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -169,9 +169,22 @@ out: return rc; } +static int ecryptfs_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct file *lower_file = ecryptfs_file_to_lower(file); + /* + * Don't allow mmap on top of file systems that don't support it + * natively. If FILESYSTEM_MAX_STACK_DEPTH > 2 or ecryptfs + * allows recursive mounting, this will need to be extended. + */ + if (!lower_file->f_op->mmap) + return -ENODEV; + return generic_file_mmap(file, vma); +} + /** * ecryptfs_open - * @inode: inode speciying file to open + * @inode: inode specifying file to open * @file: Structure to return filled in * * Opens the file specified by inode. @@ -240,7 +253,7 @@ out: /** * ecryptfs_dir_open - * @inode: inode speciying file to open + * @inode: inode specifying file to open * @file: Structure to return filled in * * Opens the file specified by inode. @@ -403,7 +416,7 @@ const struct file_operations ecryptfs_main_fops = { #ifdef CONFIG_COMPAT .compat_ioctl = ecryptfs_compat_ioctl, #endif - .mmap = generic_file_mmap, + .mmap = ecryptfs_mmap, .open = ecryptfs_open, .flush = ecryptfs_flush, .release = ecryptfs_release, diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 318b04689d76..9d153b6a1d72 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -1001,7 +1001,8 @@ static int ecryptfs_getattr(struct vfsmount *mnt, struct dentry *dentry, } int -ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value, +ecryptfs_setxattr(struct dentry *dentry, struct inode *inode, + const char *name, const void *value, size_t size, int flags) { int rc = 0; @@ -1014,8 +1015,8 @@ ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value, } rc = vfs_setxattr(lower_dentry, name, value, size, flags); - if (!rc && d_really_is_positive(dentry)) - fsstack_copy_attr_all(d_inode(dentry), d_inode(lower_dentry)); + if (!rc && inode) + fsstack_copy_attr_all(inode, d_inode(lower_dentry)); out: return rc; } diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 1698132d0e57..612004495141 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -738,8 +738,7 @@ static void ecryptfs_free_kmem_caches(void) struct ecryptfs_cache_info *info; info = &ecryptfs_cache_infos[i]; - if (*(info->cache)) - kmem_cache_destroy(*(info->cache)); + kmem_cache_destroy(*(info->cache)); } } diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index 148d11b514fb..9c3437c8a5b1 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -442,7 +442,8 @@ static int ecryptfs_write_inode_size_to_xattr(struct inode *ecryptfs_inode) if (size < 0) size = 8; put_unaligned_be64(i_size_read(ecryptfs_inode), xattr_virt); - rc 
= lower_inode->i_op->setxattr(lower_dentry, ECRYPTFS_XATTR_NAME, + rc = lower_inode->i_op->setxattr(lower_dentry, lower_inode, + ECRYPTFS_XATTR_NAME, xattr_virt, size, 0); inode_unlock(lower_inode); if (rc) diff --git a/fs/efivarfs/inode.c b/fs/efivarfs/inode.c index e2ab6d0497f2..1d73fc6dba13 100644 --- a/fs/efivarfs/inode.c +++ b/fs/efivarfs/inode.c @@ -11,6 +11,7 @@ #include <linux/fs.h> #include <linux/ctype.h> #include <linux/slab.h> +#include <linux/uuid.h> #include "internal.h" @@ -46,11 +47,7 @@ struct inode *efivarfs_get_inode(struct super_block *sb, */ bool efivarfs_valid_name(const char *str, int len) { - static const char dashes[EFI_VARIABLE_GUID_LEN] = { - [8] = 1, [13] = 1, [18] = 1, [23] = 1 - }; const char *s = str + len - EFI_VARIABLE_GUID_LEN; - int i; /* * We need a GUID, plus at least one letter for the variable name, @@ -68,37 +65,7 @@ bool efivarfs_valid_name(const char *str, int len) * * 12345678-1234-1234-1234-123456789abc */ - for (i = 0; i < EFI_VARIABLE_GUID_LEN; i++) { - if (dashes[i]) { - if (*s++ != '-') - return false; - } else { - if (!isxdigit(*s++)) - return false; - } - } - - return true; -} - -static void efivarfs_hex_to_guid(const char *str, efi_guid_t *guid) -{ - guid->b[0] = hex_to_bin(str[6]) << 4 | hex_to_bin(str[7]); - guid->b[1] = hex_to_bin(str[4]) << 4 | hex_to_bin(str[5]); - guid->b[2] = hex_to_bin(str[2]) << 4 | hex_to_bin(str[3]); - guid->b[3] = hex_to_bin(str[0]) << 4 | hex_to_bin(str[1]); - guid->b[4] = hex_to_bin(str[11]) << 4 | hex_to_bin(str[12]); - guid->b[5] = hex_to_bin(str[9]) << 4 | hex_to_bin(str[10]); - guid->b[6] = hex_to_bin(str[16]) << 4 | hex_to_bin(str[17]); - guid->b[7] = hex_to_bin(str[14]) << 4 | hex_to_bin(str[15]); - guid->b[8] = hex_to_bin(str[19]) << 4 | hex_to_bin(str[20]); - guid->b[9] = hex_to_bin(str[21]) << 4 | hex_to_bin(str[22]); - guid->b[10] = hex_to_bin(str[24]) << 4 | hex_to_bin(str[25]); - guid->b[11] = hex_to_bin(str[26]) << 4 | hex_to_bin(str[27]); - guid->b[12] = hex_to_bin(str[28]) << 4 | hex_to_bin(str[29]); - guid->b[13] = hex_to_bin(str[30]) << 4 | hex_to_bin(str[31]); - guid->b[14] = hex_to_bin(str[32]) << 4 | hex_to_bin(str[33]); - guid->b[15] = hex_to_bin(str[34]) << 4 | hex_to_bin(str[35]); + return uuid_is_valid(s); } static int efivarfs_create(struct inode *dir, struct dentry *dentry, @@ -119,8 +86,7 @@ static int efivarfs_create(struct inode *dir, struct dentry *dentry, /* length of the variable name itself: remove GUID and separator */ namelen = dentry->d_name.len - EFI_VARIABLE_GUID_LEN - 1; - efivarfs_hex_to_guid(dentry->d_name.name + namelen + 1, - &var->var.VendorGuid); + uuid_le_to_bin(dentry->d_name.name + namelen + 1, &var->var.VendorGuid); if (efivar_variable_is_removable(var->var.VendorGuid, dentry->d_name.name, namelen)) diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index 9cb54a38832d..a5e607e8f056 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -65,7 +65,7 @@ static int efivarfs_d_compare(const struct dentry *parent, static int efivarfs_d_hash(const struct dentry *dentry, struct qstr *qstr) { - unsigned long hash = init_name_hash(); + unsigned long hash = init_name_hash(dentry); const unsigned char *s = qstr->name; unsigned int len = qstr->len; @@ -98,7 +98,7 @@ static struct dentry *efivarfs_alloc_dentry(struct dentry *parent, char *name) q.name = name; q.len = strlen(name); - err = efivarfs_d_hash(NULL, &q); + err = efivarfs_d_hash(parent, &q); if (err) return ERR_PTR(err); diff --git a/fs/efs/super.c b/fs/efs/super.c index cb68dac4f9d3..368f7dd21c61 100644 
--- a/fs/efs/super.c +++ b/fs/efs/super.c @@ -275,7 +275,7 @@ static int efs_fill_super(struct super_block *s, void *d, int silent) if (!bh) { pr_err("cannot read volume header\n"); - return -EINVAL; + return -EIO; } /* @@ -293,7 +293,7 @@ static int efs_fill_super(struct super_block *s, void *d, int silent) bh = sb_bread(s, sb->fs_start + EFS_SUPER); if (!bh) { pr_err("cannot read superblock\n"); - return -EINVAL; + return -EIO; } if (efs_validate_super(sb, (struct efs_super *) bh->b_data)) { diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 8a74a2a52e0f..10db91218933 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1583,15 +1583,15 @@ static int ep_send_events(struct eventpoll *ep, return ep_scan_ready_list(ep, ep_send_events_proc, &esed, 0, false); } -static inline struct timespec ep_set_mstimeout(long ms) +static inline struct timespec64 ep_set_mstimeout(long ms) { - struct timespec now, ts = { + struct timespec64 now, ts = { .tv_sec = ms / MSEC_PER_SEC, .tv_nsec = NSEC_PER_MSEC * (ms % MSEC_PER_SEC), }; - ktime_get_ts(&now); - return timespec_add_safe(now, ts); + ktime_get_ts64(&now); + return timespec64_add_safe(now, ts); } /** @@ -1621,11 +1621,11 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, ktime_t expires, *to = NULL; if (timeout > 0) { - struct timespec end_time = ep_set_mstimeout(timeout); + struct timespec64 end_time = ep_set_mstimeout(timeout); slack = select_estimate_accuracy(&end_time); to = &expires; - *to = timespec_to_ktime(end_time); + *to = timespec64_to_ktime(end_time); } else if (timeout == 0) { /* * Avoid the unnecessary trip to the wait queue loop, if the diff --git a/fs/exec.c b/fs/exec.c index a98b21d47385..887c1c955df8 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -243,10 +243,6 @@ static void put_arg_page(struct page *page) put_page(page); } -static void free_arg_page(struct linux_binprm *bprm, int i) -{ -} - static void free_arg_pages(struct linux_binprm *bprm) { } @@ -267,7 +263,10 @@ static int __bprm_mm_init(struct linux_binprm *bprm) if (!vma) return -ENOMEM; - down_write(&mm->mmap_sem); + if (down_write_killable(&mm->mmap_sem)) { + err = -EINTR; + goto err_free; + } vma->vm_mm = mm; /* @@ -294,6 +293,7 @@ static int __bprm_mm_init(struct linux_binprm *bprm) return 0; err: up_write(&mm->mmap_sem); +err_free: bprm->vma = NULL; kmem_cache_free(vm_area_cachep, vma); return err; @@ -700,7 +700,9 @@ int setup_arg_pages(struct linux_binprm *bprm, bprm->loader -= stack_shift; bprm->exec -= stack_shift; - down_write(&mm->mmap_sem); + if (down_write_killable(&mm->mmap_sem)) + return -EINTR; + vm_flags = VM_STACK_FLAGS; /* @@ -850,15 +852,25 @@ int kernel_read_file(struct file *file, void **buf, loff_t *size, if (ret) return ret; + ret = deny_write_access(file); + if (ret) + return ret; + i_size = i_size_read(file_inode(file)); - if (max_size > 0 && i_size > max_size) - return -EFBIG; - if (i_size <= 0) - return -EINVAL; + if (max_size > 0 && i_size > max_size) { + ret = -EFBIG; + goto out; + } + if (i_size <= 0) { + ret = -EINVAL; + goto out; + } *buf = vmalloc(i_size); - if (!*buf) - return -ENOMEM; + if (!*buf) { + ret = -ENOMEM; + goto out; + } pos = 0; while (pos < i_size) { @@ -876,18 +888,21 @@ int kernel_read_file(struct file *file, void **buf, loff_t *size, if (pos != i_size) { ret = -EIO; - goto out; + goto out_free; } ret = security_kernel_post_read_file(file, *buf, i_size, id); if (!ret) *size = pos; -out: +out_free: if (ret < 0) { vfree(*buf); *buf = NULL; } + +out: + allow_write_access(file); return ret; } 
EXPORT_SYMBOL_GPL(kernel_read_file); @@ -1486,9 +1501,6 @@ int remove_arg_zero(struct linux_binprm *bprm) kunmap_atomic(kaddr); put_arg_page(page); - - if (offset == PAGE_SIZE) - free_arg_page(bprm, (bprm->p >> PAGE_SHIFT) - 1); } while (offset == PAGE_SIZE); bprm->p++; diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index 7bd8ac8dfb28..8bb72807e70d 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -878,7 +878,7 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp) } else { bio = master_dev->bio; /* FIXME: bio_set_dir() */ - bio->bi_rw |= REQ_WRITE; + bio_set_op_attrs(bio, REQ_OP_WRITE, 0); } osd_req_write(or, _ios_obj(ios, cur_comp), diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c index 9f9992b37924..4c40c0786e16 100644 --- a/fs/ext2/balloc.c +++ b/fs/ext2/balloc.c @@ -1194,6 +1194,27 @@ static int ext2_has_free_blocks(struct ext2_sb_info *sbi) } /* + * Returns 1 if the passed-in block region is valid; 0 if some part overlaps + * with filesystem metadata blocks. + */ +int ext2_data_block_valid(struct ext2_sb_info *sbi, ext2_fsblk_t start_blk, + unsigned int count) +{ + if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) || + (start_blk + count < start_blk) || + (start_blk > le32_to_cpu(sbi->s_es->s_blocks_count))) + return 0; + + /* Ensure we do not step over the superblock */ + if ((start_blk <= sbi->s_sb_block) && + (start_blk + count >= sbi->s_sb_block)) + return 0; + + + return 1; +} + +/* * ext2_new_blocks() -- core block(s) allocation function * @inode: file inode * @goal: given target block(filesystem wide) diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index 170939f379d7..3fb93681bf7f 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -367,6 +367,7 @@ struct ext2_inode { */ #define EXT2_VALID_FS 0x0001 /* Unmounted cleanly */ #define EXT2_ERROR_FS 0x0002 /* Errors detected */ +#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ /* * Mount flags @@ -739,6 +740,8 @@ extern unsigned long ext2_bg_num_gdb(struct super_block *sb, int group); extern ext2_fsblk_t ext2_new_block(struct inode *, unsigned long, int *); extern ext2_fsblk_t ext2_new_blocks(struct inode *, unsigned long, unsigned long *, int *); +extern int ext2_data_block_valid(struct ext2_sb_info *sbi, ext2_fsblk_t start_blk, + unsigned int count); extern void ext2_free_blocks (struct inode *, unsigned long, unsigned long); extern unsigned long ext2_count_free_blocks (struct super_block *); diff --git a/fs/ext2/file.c b/fs/ext2/file.c index c1400b109805..5efeefe17abb 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -51,7 +51,7 @@ static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) } down_read(&ei->dax_sem); - ret = __dax_fault(vma, vmf, ext2_get_block, NULL); + ret = dax_fault(vma, vmf, ext2_get_block); up_read(&ei->dax_sem); if (vmf->flags & FAULT_FLAG_WRITE) @@ -72,7 +72,7 @@ static int ext2_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, } down_read(&ei->dax_sem); - ret = __dax_pmd_fault(vma, addr, pmd, flags, ext2_get_block, NULL); + ret = dax_pmd_fault(vma, addr, pmd, flags, ext2_get_block); up_read(&ei->dax_sem); if (flags & FAULT_FLAG_WRITE) diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index b675610391b8..d5c7d09919f3 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -26,6 +26,7 @@ #include <linux/highuid.h> #include <linux/pagemap.h> #include <linux/dax.h> +#include <linux/blkdev.h> #include <linux/quotaops.h> #include <linux/writeback.h> #include <linux/buffer_head.h> @@ -737,19 +738,18 @@ static int ext2_get_blocks(struct inode *inode, * so 
that it's not found by another thread before it's * initialised */ - err = dax_clear_sectors(inode->i_sb->s_bdev, - le32_to_cpu(chain[depth-1].key) << - (inode->i_blkbits - 9), - 1 << inode->i_blkbits); + err = sb_issue_zeroout(inode->i_sb, + le32_to_cpu(chain[depth-1].key), count, + GFP_NOFS); if (err) { mutex_unlock(&ei->truncate_mutex); goto cleanup; } - } + } else + set_buffer_new(bh_result); ext2_splice_branch(inode, iblock, partial, indirect_blks, count); mutex_unlock(&ei->truncate_mutex); - set_buffer_new(bh_result); got_it: map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key)); if (count > blocks_to_boundary) @@ -1389,6 +1389,16 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino) ei->i_frag_size = raw_inode->i_fsize; ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl); ei->i_dir_acl = 0; + + if (ei->i_file_acl && + !ext2_data_block_valid(EXT2_SB(sb), ei->i_file_acl, 1)) { + ext2_error(sb, "ext2_iget", "bad extended attribute block %u", + ei->i_file_acl); + brelse(bh); + ret = -EFSCORRUPTED; + goto bad_inode; + } + if (S_ISREG(inode->i_mode)) inode->i_size |= ((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32; else diff --git a/fs/ext2/super.c b/fs/ext2/super.c index b78caf25f746..1d9379568aa8 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -922,16 +922,9 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) blocksize = BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size); if (sbi->s_mount_opt & EXT2_MOUNT_DAX) { - if (blocksize != PAGE_SIZE) { - ext2_msg(sb, KERN_ERR, - "error: unsupported blocksize for dax"); + err = bdev_dax_supported(sb, blocksize); + if (err) goto failed_mount; - } - if (!sb->s_bdev->bd_disk->fops->direct_access) { - ext2_msg(sb, KERN_ERR, - "error: device does not support dax"); - goto failed_mount; - } } /* If the blocksize doesn't match, re-read the thing.. 
*/ diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c index 1a5e3bff0b63..b7f896f3f7a7 100644 --- a/fs/ext2/xattr.c +++ b/fs/ext2/xattr.c @@ -759,10 +759,19 @@ void ext2_xattr_delete_inode(struct inode *inode) { struct buffer_head *bh = NULL; + struct ext2_sb_info *sbi = EXT2_SB(inode->i_sb); down_write(&EXT2_I(inode)->xattr_sem); if (!EXT2_I(inode)->i_file_acl) goto cleanup; + + if (!ext2_data_block_valid(sbi, EXT2_I(inode)->i_file_acl, 0)) { + ext2_error(inode->i_sb, "ext2_xattr_delete_inode", + "inode %ld: xattr block %d is out of data blocks range", + inode->i_ino, EXT2_I(inode)->i_file_acl); + goto cleanup; + } + bh = sb_bread(inode->i_sb, EXT2_I(inode)->i_file_acl); if (!bh) { ext2_error(inode->i_sb, "ext2_xattr_delete_inode", diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c index 7fd3b867ce65..7b9e9c1842d5 100644 --- a/fs/ext2/xattr_security.c +++ b/fs/ext2/xattr_security.c @@ -18,10 +18,11 @@ ext2_xattr_security_get(const struct xattr_handler *handler, static int ext2_xattr_security_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { - return ext2_xattr_set(d_inode(dentry), EXT2_XATTR_INDEX_SECURITY, name, + return ext2_xattr_set(inode, EXT2_XATTR_INDEX_SECURITY, name, value, size, flags); } diff --git a/fs/ext2/xattr_trusted.c b/fs/ext2/xattr_trusted.c index 0f85705ff519..65049b71af13 100644 --- a/fs/ext2/xattr_trusted.c +++ b/fs/ext2/xattr_trusted.c @@ -25,10 +25,11 @@ ext2_xattr_trusted_get(const struct xattr_handler *handler, static int ext2_xattr_trusted_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { - return ext2_xattr_set(d_inode(dentry), EXT2_XATTR_INDEX_TRUSTED, name, + return ext2_xattr_set(inode, EXT2_XATTR_INDEX_TRUSTED, name, value, size, flags); } diff --git a/fs/ext2/xattr_user.c b/fs/ext2/xattr_user.c index 1fafd27037cc..fb2f992ae763 100644 --- a/fs/ext2/xattr_user.c +++ b/fs/ext2/xattr_user.c @@ -29,13 +29,14 @@ ext2_xattr_user_get(const struct xattr_handler *handler, static int ext2_xattr_user_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { - if (!test_opt(dentry->d_sb, XATTR_USER)) + if (!test_opt(inode->i_sb, XATTR_USER)) return -EOPNOTSUPP; - return ext2_xattr_set(d_inode(dentry), EXT2_XATTR_INDEX_USER, + return ext2_xattr_set(inode, EXT2_XATTR_INDEX_USER, name, value, size, flags); } diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig index b46e9fc64196..e38039fd96ff 100644 --- a/fs/ext4/Kconfig +++ b/fs/ext4/Kconfig @@ -99,17 +99,9 @@ config EXT4_FS_SECURITY extended attributes for file security labels, say N. config EXT4_ENCRYPTION - tristate "Ext4 Encryption" + bool "Ext4 Encryption" depends on EXT4_FS - select CRYPTO_AES - select CRYPTO_CBC - select CRYPTO_ECB - select CRYPTO_XTS - select CRYPTO_CTS - select CRYPTO_CTR - select CRYPTO_SHA256 - select KEYS - select ENCRYPTED_KEYS + select FS_ENCRYPTION help Enable encryption of ext4 files and directories. 
This feature is similar to ecryptfs, but it is more memory diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile index f52cf54f0cbc..354103f3490c 100644 --- a/fs/ext4/Makefile +++ b/fs/ext4/Makefile @@ -12,5 +12,3 @@ ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \ ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o -ext4-$(CONFIG_EXT4_FS_ENCRYPTION) += crypto_policy.o crypto.o \ - crypto_key.o crypto_fname.o diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index fe1f50fe764f..e04ec868e37e 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -208,6 +208,9 @@ static int ext4_init_block_bitmap(struct super_block *sb, memset(bh->b_data, 0, sb->s_blocksize); bit_max = ext4_num_base_meta_clusters(sb, block_group); + if ((bit_max >> 3) >= bh->b_size) + return -EFSCORRUPTED; + for (bit = 0; bit < bit_max; bit++) ext4_set_bit(bit, bh->b_data); @@ -470,7 +473,7 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group) trace_ext4_read_block_bitmap_load(sb, block_group); bh->b_end_io = ext4_end_bitmap_read; get_bh(bh); - submit_bh(READ | REQ_META | REQ_PRIO, bh); + submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO, bh); return bh; verify: err = ext4_validate_block_bitmap(sb, desc, block_group, bh); @@ -610,7 +613,10 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries) jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id); - return jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal); + smp_mb(); + if (EXT4_SB(sb)->s_mb_free_pending) + jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal); + return 1; } /* diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c deleted file mode 100644 index 6a6c27373b54..000000000000 --- a/fs/ext4/crypto.c +++ /dev/null @@ -1,536 +0,0 @@ -/* - * linux/fs/ext4/crypto.c - * - * Copyright (C) 2015, Google, Inc. - * - * This contains encryption functions for ext4 - * - * Written by Michael Halcrow, 2014. - * - * Filename encryption additions - * Uday Savagaonkar, 2014 - * Encryption policy handling additions - * Ildar Muslukhov, 2014 - * - * This has not yet undergone a rigorous security audit. - * - * The usage of AES-XTS should conform to recommendations in NIST - * Special Publication 800-38E and IEEE P1619/D16. - */ - -#include <crypto/skcipher.h> -#include <keys/user-type.h> -#include <keys/encrypted-type.h> -#include <linux/ecryptfs.h> -#include <linux/gfp.h> -#include <linux/kernel.h> -#include <linux/key.h> -#include <linux/list.h> -#include <linux/mempool.h> -#include <linux/module.h> -#include <linux/mutex.h> -#include <linux/random.h> -#include <linux/scatterlist.h> -#include <linux/spinlock_types.h> -#include <linux/namei.h> - -#include "ext4_extents.h" -#include "xattr.h" - -/* Encryption added and removed here! 
(L: */ - -static unsigned int num_prealloc_crypto_pages = 32; -static unsigned int num_prealloc_crypto_ctxs = 128; - -module_param(num_prealloc_crypto_pages, uint, 0444); -MODULE_PARM_DESC(num_prealloc_crypto_pages, - "Number of crypto pages to preallocate"); -module_param(num_prealloc_crypto_ctxs, uint, 0444); -MODULE_PARM_DESC(num_prealloc_crypto_ctxs, - "Number of crypto contexts to preallocate"); - -static mempool_t *ext4_bounce_page_pool; - -static LIST_HEAD(ext4_free_crypto_ctxs); -static DEFINE_SPINLOCK(ext4_crypto_ctx_lock); - -static struct kmem_cache *ext4_crypto_ctx_cachep; -struct kmem_cache *ext4_crypt_info_cachep; - -/** - * ext4_release_crypto_ctx() - Releases an encryption context - * @ctx: The encryption context to release. - * - * If the encryption context was allocated from the pre-allocated pool, returns - * it to that pool. Else, frees it. - * - * If there's a bounce page in the context, this frees that. - */ -void ext4_release_crypto_ctx(struct ext4_crypto_ctx *ctx) -{ - unsigned long flags; - - if (ctx->flags & EXT4_WRITE_PATH_FL && ctx->w.bounce_page) - mempool_free(ctx->w.bounce_page, ext4_bounce_page_pool); - ctx->w.bounce_page = NULL; - ctx->w.control_page = NULL; - if (ctx->flags & EXT4_CTX_REQUIRES_FREE_ENCRYPT_FL) { - kmem_cache_free(ext4_crypto_ctx_cachep, ctx); - } else { - spin_lock_irqsave(&ext4_crypto_ctx_lock, flags); - list_add(&ctx->free_list, &ext4_free_crypto_ctxs); - spin_unlock_irqrestore(&ext4_crypto_ctx_lock, flags); - } -} - -/** - * ext4_get_crypto_ctx() - Gets an encryption context - * @inode: The inode for which we are doing the crypto - * - * Allocates and initializes an encryption context. - * - * Return: An allocated and initialized encryption context on success; error - * value or NULL otherwise. - */ -struct ext4_crypto_ctx *ext4_get_crypto_ctx(struct inode *inode, - gfp_t gfp_flags) -{ - struct ext4_crypto_ctx *ctx = NULL; - int res = 0; - unsigned long flags; - struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info; - - if (ci == NULL) - return ERR_PTR(-ENOKEY); - - /* - * We first try getting the ctx from a free list because in - * the common case the ctx will have an allocated and - * initialized crypto tfm, so it's probably a worthwhile - * optimization. For the bounce page, we first try getting it - * from the kernel allocator because that's just about as fast - * as getting it from a list and because a cache of free pages - * should generally be a "last resort" option for a filesystem - * to be able to do its job. 
- */ - spin_lock_irqsave(&ext4_crypto_ctx_lock, flags); - ctx = list_first_entry_or_null(&ext4_free_crypto_ctxs, - struct ext4_crypto_ctx, free_list); - if (ctx) - list_del(&ctx->free_list); - spin_unlock_irqrestore(&ext4_crypto_ctx_lock, flags); - if (!ctx) { - ctx = kmem_cache_zalloc(ext4_crypto_ctx_cachep, gfp_flags); - if (!ctx) { - res = -ENOMEM; - goto out; - } - ctx->flags |= EXT4_CTX_REQUIRES_FREE_ENCRYPT_FL; - } else { - ctx->flags &= ~EXT4_CTX_REQUIRES_FREE_ENCRYPT_FL; - } - ctx->flags &= ~EXT4_WRITE_PATH_FL; - -out: - if (res) { - if (!IS_ERR_OR_NULL(ctx)) - ext4_release_crypto_ctx(ctx); - ctx = ERR_PTR(res); - } - return ctx; -} - -struct workqueue_struct *ext4_read_workqueue; -static DEFINE_MUTEX(crypto_init); - -/** - * ext4_exit_crypto() - Shutdown the ext4 encryption system - */ -void ext4_exit_crypto(void) -{ - struct ext4_crypto_ctx *pos, *n; - - list_for_each_entry_safe(pos, n, &ext4_free_crypto_ctxs, free_list) - kmem_cache_free(ext4_crypto_ctx_cachep, pos); - INIT_LIST_HEAD(&ext4_free_crypto_ctxs); - if (ext4_bounce_page_pool) - mempool_destroy(ext4_bounce_page_pool); - ext4_bounce_page_pool = NULL; - if (ext4_read_workqueue) - destroy_workqueue(ext4_read_workqueue); - ext4_read_workqueue = NULL; - if (ext4_crypto_ctx_cachep) - kmem_cache_destroy(ext4_crypto_ctx_cachep); - ext4_crypto_ctx_cachep = NULL; - if (ext4_crypt_info_cachep) - kmem_cache_destroy(ext4_crypt_info_cachep); - ext4_crypt_info_cachep = NULL; -} - -/** - * ext4_init_crypto() - Set up for ext4 encryption. - * - * We only call this when we start accessing encrypted files, since it - * results in memory getting allocated that wouldn't otherwise be used. - * - * Return: Zero on success, non-zero otherwise. - */ -int ext4_init_crypto(void) -{ - int i, res = -ENOMEM; - - mutex_lock(&crypto_init); - if (ext4_read_workqueue) - goto already_initialized; - ext4_read_workqueue = alloc_workqueue("ext4_crypto", WQ_HIGHPRI, 0); - if (!ext4_read_workqueue) - goto fail; - - ext4_crypto_ctx_cachep = KMEM_CACHE(ext4_crypto_ctx, - SLAB_RECLAIM_ACCOUNT); - if (!ext4_crypto_ctx_cachep) - goto fail; - - ext4_crypt_info_cachep = KMEM_CACHE(ext4_crypt_info, - SLAB_RECLAIM_ACCOUNT); - if (!ext4_crypt_info_cachep) - goto fail; - - for (i = 0; i < num_prealloc_crypto_ctxs; i++) { - struct ext4_crypto_ctx *ctx; - - ctx = kmem_cache_zalloc(ext4_crypto_ctx_cachep, GFP_NOFS); - if (!ctx) { - res = -ENOMEM; - goto fail; - } - list_add(&ctx->free_list, &ext4_free_crypto_ctxs); - } - - ext4_bounce_page_pool = - mempool_create_page_pool(num_prealloc_crypto_pages, 0); - if (!ext4_bounce_page_pool) { - res = -ENOMEM; - goto fail; - } -already_initialized: - mutex_unlock(&crypto_init); - return 0; -fail: - ext4_exit_crypto(); - mutex_unlock(&crypto_init); - return res; -} - -void ext4_restore_control_page(struct page *data_page) -{ - struct ext4_crypto_ctx *ctx = - (struct ext4_crypto_ctx *)page_private(data_page); - - set_page_private(data_page, (unsigned long)NULL); - ClearPagePrivate(data_page); - unlock_page(data_page); - ext4_release_crypto_ctx(ctx); -} - -/** - * ext4_crypt_complete() - The completion callback for page encryption - * @req: The asynchronous encryption request context - * @res: The result of the encryption operation - */ -static void ext4_crypt_complete(struct crypto_async_request *req, int res) -{ - struct ext4_completion_result *ecr = req->data; - - if (res == -EINPROGRESS) - return; - ecr->res = res; - complete(&ecr->completion); -} - -typedef enum { - EXT4_DECRYPT = 0, - EXT4_ENCRYPT, -} ext4_direction_t; - 
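The ext4_crypt_complete() callback just above, paired with the wait_for_completion() calls that follow throughout this removed file, is the standard idiom for driving the async skcipher API synchronously: submit the request, and if the driver went asynchronous (-EINPROGRESS or -EBUSY under CRYPTO_TFM_REQ_MAY_BACKLOG), sleep until the callback records the final result. A self-contained sketch of that idiom, with illustrative names rather than the ext4 ones:

#include <crypto/skcipher.h>
#include <linux/completion.h>

struct crypt_result {
	struct completion completion;
	int res;
};

/* Invoked by the crypto layer once an async request finishes. */
static void crypt_complete(struct crypto_async_request *req, int res)
{
	struct crypt_result *cr = req->data;

	if (res == -EINPROGRESS)	/* left the backlog; final result comes later */
		return;
	cr->res = res;
	complete(&cr->completion);
}

/* Submit the request; if it went asynchronous, wait for the callback
 * above and report the result it recorded. */
static int encrypt_and_wait(struct skcipher_request *req)
{
	struct crypt_result cr;
	int res;

	init_completion(&cr.completion);
	skcipher_request_set_callback(req,
			CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
			crypt_complete, &cr);
	res = crypto_skcipher_encrypt(req);
	if (res == -EINPROGRESS || res == -EBUSY) {
		wait_for_completion(&cr.completion);
		res = cr.res;
	}
	return res;
}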
-static int ext4_page_crypto(struct inode *inode, - ext4_direction_t rw, - pgoff_t index, - struct page *src_page, - struct page *dest_page, - gfp_t gfp_flags) - -{ - u8 xts_tweak[EXT4_XTS_TWEAK_SIZE]; - struct skcipher_request *req = NULL; - DECLARE_EXT4_COMPLETION_RESULT(ecr); - struct scatterlist dst, src; - struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info; - struct crypto_skcipher *tfm = ci->ci_ctfm; - int res = 0; - - req = skcipher_request_alloc(tfm, gfp_flags); - if (!req) { - printk_ratelimited(KERN_ERR - "%s: crypto_request_alloc() failed\n", - __func__); - return -ENOMEM; - } - skcipher_request_set_callback( - req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, - ext4_crypt_complete, &ecr); - - BUILD_BUG_ON(EXT4_XTS_TWEAK_SIZE < sizeof(index)); - memcpy(xts_tweak, &index, sizeof(index)); - memset(&xts_tweak[sizeof(index)], 0, - EXT4_XTS_TWEAK_SIZE - sizeof(index)); - - sg_init_table(&dst, 1); - sg_set_page(&dst, dest_page, PAGE_SIZE, 0); - sg_init_table(&src, 1); - sg_set_page(&src, src_page, PAGE_SIZE, 0); - skcipher_request_set_crypt(req, &src, &dst, PAGE_SIZE, - xts_tweak); - if (rw == EXT4_DECRYPT) - res = crypto_skcipher_decrypt(req); - else - res = crypto_skcipher_encrypt(req); - if (res == -EINPROGRESS || res == -EBUSY) { - wait_for_completion(&ecr.completion); - res = ecr.res; - } - skcipher_request_free(req); - if (res) { - printk_ratelimited( - KERN_ERR - "%s: crypto_skcipher_encrypt() returned %d\n", - __func__, res); - return res; - } - return 0; -} - -static struct page *alloc_bounce_page(struct ext4_crypto_ctx *ctx, - gfp_t gfp_flags) -{ - ctx->w.bounce_page = mempool_alloc(ext4_bounce_page_pool, gfp_flags); - if (ctx->w.bounce_page == NULL) - return ERR_PTR(-ENOMEM); - ctx->flags |= EXT4_WRITE_PATH_FL; - return ctx->w.bounce_page; -} - -/** - * ext4_encrypt() - Encrypts a page - * @inode: The inode for which the encryption should take place - * @plaintext_page: The page to encrypt. Must be locked. - * - * Allocates a ciphertext page and encrypts plaintext_page into it using the ctx - * encryption context. - * - * Called on the page write path. The caller must call - * ext4_restore_control_page() on the returned ciphertext page to - * release the bounce buffer and the encryption context. - * - * Return: An allocated page with the encrypted content on success. Else, an - * error value or NULL. - */ -struct page *ext4_encrypt(struct inode *inode, - struct page *plaintext_page, - gfp_t gfp_flags) -{ - struct ext4_crypto_ctx *ctx; - struct page *ciphertext_page = NULL; - int err; - - BUG_ON(!PageLocked(plaintext_page)); - - ctx = ext4_get_crypto_ctx(inode, gfp_flags); - if (IS_ERR(ctx)) - return (struct page *) ctx; - - /* The encryption operation will require a bounce page. */ - ciphertext_page = alloc_bounce_page(ctx, gfp_flags); - if (IS_ERR(ciphertext_page)) - goto errout; - ctx->w.control_page = plaintext_page; - err = ext4_page_crypto(inode, EXT4_ENCRYPT, plaintext_page->index, - plaintext_page, ciphertext_page, gfp_flags); - if (err) { - ciphertext_page = ERR_PTR(err); - errout: - ext4_release_crypto_ctx(ctx); - return ciphertext_page; - } - SetPagePrivate(ciphertext_page); - set_page_private(ciphertext_page, (unsigned long)ctx); - lock_page(ciphertext_page); - return ciphertext_page; -} - -/** - * ext4_decrypt() - Decrypts a page in-place - * @ctx: The encryption context. - * @page: The page to decrypt. Must be locked. - * - * Decrypts page in-place using the ctx encryption context. - * - * Called from the read completion callback. 
- * - * Return: Zero on success, non-zero otherwise. - */ -int ext4_decrypt(struct page *page) -{ - BUG_ON(!PageLocked(page)); - - return ext4_page_crypto(page->mapping->host, EXT4_DECRYPT, - page->index, page, page, GFP_NOFS); -} - -int ext4_encrypted_zeroout(struct inode *inode, ext4_lblk_t lblk, - ext4_fsblk_t pblk, ext4_lblk_t len) -{ - struct ext4_crypto_ctx *ctx; - struct page *ciphertext_page = NULL; - struct bio *bio; - int ret, err = 0; - -#if 0 - ext4_msg(inode->i_sb, KERN_CRIT, - "ext4_encrypted_zeroout ino %lu lblk %u len %u", - (unsigned long) inode->i_ino, lblk, len); -#endif - - BUG_ON(inode->i_sb->s_blocksize != PAGE_SIZE); - - ctx = ext4_get_crypto_ctx(inode, GFP_NOFS); - if (IS_ERR(ctx)) - return PTR_ERR(ctx); - - ciphertext_page = alloc_bounce_page(ctx, GFP_NOWAIT); - if (IS_ERR(ciphertext_page)) { - err = PTR_ERR(ciphertext_page); - goto errout; - } - - while (len--) { - err = ext4_page_crypto(inode, EXT4_ENCRYPT, lblk, - ZERO_PAGE(0), ciphertext_page, - GFP_NOFS); - if (err) - goto errout; - - bio = bio_alloc(GFP_NOWAIT, 1); - if (!bio) { - err = -ENOMEM; - goto errout; - } - bio->bi_bdev = inode->i_sb->s_bdev; - bio->bi_iter.bi_sector = - pblk << (inode->i_sb->s_blocksize_bits - 9); - ret = bio_add_page(bio, ciphertext_page, - inode->i_sb->s_blocksize, 0); - if (ret != inode->i_sb->s_blocksize) { - /* should never happen! */ - ext4_msg(inode->i_sb, KERN_ERR, - "bio_add_page failed: %d", ret); - WARN_ON(1); - bio_put(bio); - err = -EIO; - goto errout; - } - err = submit_bio_wait(WRITE, bio); - if ((err == 0) && bio->bi_error) - err = -EIO; - bio_put(bio); - if (err) - goto errout; - lblk++; pblk++; - } - err = 0; -errout: - ext4_release_crypto_ctx(ctx); - return err; -} - -bool ext4_valid_contents_enc_mode(uint32_t mode) -{ - return (mode == EXT4_ENCRYPTION_MODE_AES_256_XTS); -} - -/** - * ext4_validate_encryption_key_size() - Validate the encryption key size - * @mode: The key mode. - * @size: The key size to validate. - * - * Return: The validated key size for @mode. Zero if invalid. - */ -uint32_t ext4_validate_encryption_key_size(uint32_t mode, uint32_t size) -{ - if (size == ext4_encryption_key_size(mode)) - return size; - return 0; -} - -/* - * Validate dentries for encrypted directories to make sure we aren't - * potentially caching stale data after a key has been added or - * removed. - */ -static int ext4_d_revalidate(struct dentry *dentry, unsigned int flags) -{ - struct dentry *dir; - struct ext4_crypt_info *ci; - int dir_has_key, cached_with_key; - - if (flags & LOOKUP_RCU) - return -ECHILD; - - dir = dget_parent(dentry); - if (!ext4_encrypted_inode(d_inode(dir))) { - dput(dir); - return 0; - } - ci = EXT4_I(d_inode(dir))->i_crypt_info; - if (ci && ci->ci_keyring_key && - (ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) | - (1 << KEY_FLAG_REVOKED) | - (1 << KEY_FLAG_DEAD)))) - ci = NULL; - - /* this should eventually be an flag in d_flags */ - cached_with_key = dentry->d_fsdata != NULL; - dir_has_key = (ci != NULL); - dput(dir); - - /* - * If the dentry was cached without the key, and it is a - * negative dentry, it might be a valid name. We can't check - * if the key has since been made available due to locking - * reasons, so we fail the validation so ext4_lookup() can do - * this check. - * - * We also fail the validation if the dentry was created with - * the key present, but we no longer have the key, or vice versa. 
- */ - if ((!cached_with_key && d_is_negative(dentry)) || - (!cached_with_key && dir_has_key) || - (cached_with_key && !dir_has_key)) { -#if 0 /* Revalidation debug */ - char buf[80]; - char *cp = simple_dname(dentry, buf, sizeof(buf)); - - if (IS_ERR(cp)) - cp = (char *) "???"; - pr_err("revalidate: %s %p %d %d %d\n", cp, dentry->d_fsdata, - cached_with_key, d_is_negative(dentry), - dir_has_key); -#endif - return 0; - } - return 1; -} - -const struct dentry_operations ext4_encrypted_d_ops = { - .d_revalidate = ext4_d_revalidate, -}; diff --git a/fs/ext4/crypto_fname.c b/fs/ext4/crypto_fname.c deleted file mode 100644 index 1a2f360405db..000000000000 --- a/fs/ext4/crypto_fname.c +++ /dev/null @@ -1,468 +0,0 @@ -/* - * linux/fs/ext4/crypto_fname.c - * - * Copyright (C) 2015, Google, Inc. - * - * This contains functions for filename crypto management in ext4 - * - * Written by Uday Savagaonkar, 2014. - * - * This has not yet undergone a rigorous security audit. - * - */ - -#include <crypto/skcipher.h> -#include <keys/encrypted-type.h> -#include <keys/user-type.h> -#include <linux/gfp.h> -#include <linux/kernel.h> -#include <linux/key.h> -#include <linux/list.h> -#include <linux/mempool.h> -#include <linux/random.h> -#include <linux/scatterlist.h> -#include <linux/spinlock_types.h> - -#include "ext4.h" -#include "ext4_crypto.h" -#include "xattr.h" - -/** - * ext4_dir_crypt_complete() - - */ -static void ext4_dir_crypt_complete(struct crypto_async_request *req, int res) -{ - struct ext4_completion_result *ecr = req->data; - - if (res == -EINPROGRESS) - return; - ecr->res = res; - complete(&ecr->completion); -} - -bool ext4_valid_filenames_enc_mode(uint32_t mode) -{ - return (mode == EXT4_ENCRYPTION_MODE_AES_256_CTS); -} - -static unsigned max_name_len(struct inode *inode) -{ - return S_ISLNK(inode->i_mode) ? inode->i_sb->s_blocksize : - EXT4_NAME_LEN; -} - -/** - * ext4_fname_encrypt() - - * - * This function encrypts the input filename, and returns the length of the - * ciphertext. Errors are returned as negative numbers. We trust the caller to - * allocate sufficient memory to oname string. - */ -static int ext4_fname_encrypt(struct inode *inode, - const struct qstr *iname, - struct ext4_str *oname) -{ - u32 ciphertext_len; - struct skcipher_request *req = NULL; - DECLARE_EXT4_COMPLETION_RESULT(ecr); - struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info; - struct crypto_skcipher *tfm = ci->ci_ctfm; - int res = 0; - char iv[EXT4_CRYPTO_BLOCK_SIZE]; - struct scatterlist src_sg, dst_sg; - int padding = 4 << (ci->ci_flags & EXT4_POLICY_FLAGS_PAD_MASK); - char *workbuf, buf[32], *alloc_buf = NULL; - unsigned lim = max_name_len(inode); - - if (iname->len <= 0 || iname->len > lim) - return -EIO; - - ciphertext_len = (iname->len < EXT4_CRYPTO_BLOCK_SIZE) ? - EXT4_CRYPTO_BLOCK_SIZE : iname->len; - ciphertext_len = ext4_fname_crypto_round_up(ciphertext_len, padding); - ciphertext_len = (ciphertext_len > lim) - ? 
lim : ciphertext_len; - - if (ciphertext_len <= sizeof(buf)) { - workbuf = buf; - } else { - alloc_buf = kmalloc(ciphertext_len, GFP_NOFS); - if (!alloc_buf) - return -ENOMEM; - workbuf = alloc_buf; - } - - /* Allocate request */ - req = skcipher_request_alloc(tfm, GFP_NOFS); - if (!req) { - printk_ratelimited( - KERN_ERR "%s: crypto_request_alloc() failed\n", __func__); - kfree(alloc_buf); - return -ENOMEM; - } - skcipher_request_set_callback(req, - CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, - ext4_dir_crypt_complete, &ecr); - - /* Copy the input */ - memcpy(workbuf, iname->name, iname->len); - if (iname->len < ciphertext_len) - memset(workbuf + iname->len, 0, ciphertext_len - iname->len); - - /* Initialize IV */ - memset(iv, 0, EXT4_CRYPTO_BLOCK_SIZE); - - /* Create encryption request */ - sg_init_one(&src_sg, workbuf, ciphertext_len); - sg_init_one(&dst_sg, oname->name, ciphertext_len); - skcipher_request_set_crypt(req, &src_sg, &dst_sg, ciphertext_len, iv); - res = crypto_skcipher_encrypt(req); - if (res == -EINPROGRESS || res == -EBUSY) { - wait_for_completion(&ecr.completion); - res = ecr.res; - } - kfree(alloc_buf); - skcipher_request_free(req); - if (res < 0) { - printk_ratelimited( - KERN_ERR "%s: Error (error code %d)\n", __func__, res); - } - oname->len = ciphertext_len; - return res; -} - -/* - * ext4_fname_decrypt() - * This function decrypts the input filename, and returns - * the length of the plaintext. - * Errors are returned as negative numbers. - * We trust the caller to allocate sufficient memory to oname string. - */ -static int ext4_fname_decrypt(struct inode *inode, - const struct ext4_str *iname, - struct ext4_str *oname) -{ - struct ext4_str tmp_in[2], tmp_out[1]; - struct skcipher_request *req = NULL; - DECLARE_EXT4_COMPLETION_RESULT(ecr); - struct scatterlist src_sg, dst_sg; - struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info; - struct crypto_skcipher *tfm = ci->ci_ctfm; - int res = 0; - char iv[EXT4_CRYPTO_BLOCK_SIZE]; - unsigned lim = max_name_len(inode); - - if (iname->len <= 0 || iname->len > lim) - return -EIO; - - tmp_in[0].name = iname->name; - tmp_in[0].len = iname->len; - tmp_out[0].name = oname->name; - - /* Allocate request */ - req = skcipher_request_alloc(tfm, GFP_NOFS); - if (!req) { - printk_ratelimited( - KERN_ERR "%s: crypto_request_alloc() failed\n", __func__); - return -ENOMEM; - } - skcipher_request_set_callback(req, - CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, - ext4_dir_crypt_complete, &ecr); - - /* Initialize IV */ - memset(iv, 0, EXT4_CRYPTO_BLOCK_SIZE); - - /* Create encryption request */ - sg_init_one(&src_sg, iname->name, iname->len); - sg_init_one(&dst_sg, oname->name, oname->len); - skcipher_request_set_crypt(req, &src_sg, &dst_sg, iname->len, iv); - res = crypto_skcipher_decrypt(req); - if (res == -EINPROGRESS || res == -EBUSY) { - wait_for_completion(&ecr.completion); - res = ecr.res; - } - skcipher_request_free(req); - if (res < 0) { - printk_ratelimited( - KERN_ERR "%s: Error in ext4_fname_encrypt (error code %d)\n", - __func__, res); - return res; - } - - oname->len = strnlen(oname->name, iname->len); - return oname->len; -} - -static const char *lookup_table = - "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,"; - -/** - * ext4_fname_encode_digest() - - * - * Encodes the input digest using characters from the set [a-zA-Z0-9_+]. - * The encoded string is roughly 4/3 times the size of the input string. 
- */ -static int digest_encode(const char *src, int len, char *dst) -{ - int i = 0, bits = 0, ac = 0; - char *cp = dst; - - while (i < len) { - ac += (((unsigned char) src[i]) << bits); - bits += 8; - do { - *cp++ = lookup_table[ac & 0x3f]; - ac >>= 6; - bits -= 6; - } while (bits >= 6); - i++; - } - if (bits) - *cp++ = lookup_table[ac & 0x3f]; - return cp - dst; -} - -static int digest_decode(const char *src, int len, char *dst) -{ - int i = 0, bits = 0, ac = 0; - const char *p; - char *cp = dst; - - while (i < len) { - p = strchr(lookup_table, src[i]); - if (p == NULL || src[i] == 0) - return -2; - ac += (p - lookup_table) << bits; - bits += 6; - if (bits >= 8) { - *cp++ = ac & 0xff; - ac >>= 8; - bits -= 8; - } - i++; - } - if (ac) - return -1; - return cp - dst; -} - -/** - * ext4_fname_crypto_round_up() - - * - * Return: The next multiple of block size - */ -u32 ext4_fname_crypto_round_up(u32 size, u32 blksize) -{ - return ((size+blksize-1)/blksize)*blksize; -} - -unsigned ext4_fname_encrypted_size(struct inode *inode, u32 ilen) -{ - struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info; - int padding = 32; - - if (ci) - padding = 4 << (ci->ci_flags & EXT4_POLICY_FLAGS_PAD_MASK); - if (ilen < EXT4_CRYPTO_BLOCK_SIZE) - ilen = EXT4_CRYPTO_BLOCK_SIZE; - return ext4_fname_crypto_round_up(ilen, padding); -} - -/* - * ext4_fname_crypto_alloc_buffer() - - * - * Allocates an output buffer that is sufficient for the crypto operation - * specified by the context and the direction. - */ -int ext4_fname_crypto_alloc_buffer(struct inode *inode, - u32 ilen, struct ext4_str *crypto_str) -{ - unsigned int olen = ext4_fname_encrypted_size(inode, ilen); - - crypto_str->len = olen; - if (olen < EXT4_FNAME_CRYPTO_DIGEST_SIZE*2) - olen = EXT4_FNAME_CRYPTO_DIGEST_SIZE*2; - /* Allocated buffer can hold one more character to null-terminate the - * string */ - crypto_str->name = kmalloc(olen+1, GFP_NOFS); - if (!(crypto_str->name)) - return -ENOMEM; - return 0; -} - -/** - * ext4_fname_crypto_free_buffer() - - * - * Frees the buffer allocated for crypto operation. - */ -void ext4_fname_crypto_free_buffer(struct ext4_str *crypto_str) -{ - if (!crypto_str) - return; - kfree(crypto_str->name); - crypto_str->name = NULL; -} - -/** - * ext4_fname_disk_to_usr() - converts a filename from disk space to user space - */ -int _ext4_fname_disk_to_usr(struct inode *inode, - struct dx_hash_info *hinfo, - const struct ext4_str *iname, - struct ext4_str *oname) -{ - char buf[24]; - int ret; - - if (iname->len < 3) { - /*Check for . and .. */ - if (iname->name[0] == '.' 
&& iname->name[iname->len-1] == '.') { - oname->name[0] = '.'; - oname->name[iname->len-1] = '.'; - oname->len = iname->len; - return oname->len; - } - } - if (iname->len < EXT4_CRYPTO_BLOCK_SIZE) { - EXT4_ERROR_INODE(inode, "encrypted inode too small"); - return -EUCLEAN; - } - if (EXT4_I(inode)->i_crypt_info) - return ext4_fname_decrypt(inode, iname, oname); - - if (iname->len <= EXT4_FNAME_CRYPTO_DIGEST_SIZE) { - ret = digest_encode(iname->name, iname->len, oname->name); - oname->len = ret; - return ret; - } - if (hinfo) { - memcpy(buf, &hinfo->hash, 4); - memcpy(buf+4, &hinfo->minor_hash, 4); - } else - memset(buf, 0, 8); - memcpy(buf + 8, iname->name + iname->len - 16, 16); - oname->name[0] = '_'; - ret = digest_encode(buf, 24, oname->name+1); - oname->len = ret + 1; - return ret + 1; -} - -int ext4_fname_disk_to_usr(struct inode *inode, - struct dx_hash_info *hinfo, - const struct ext4_dir_entry_2 *de, - struct ext4_str *oname) -{ - struct ext4_str iname = {.name = (unsigned char *) de->name, - .len = de->name_len }; - - return _ext4_fname_disk_to_usr(inode, hinfo, &iname, oname); -} - - -/** - * ext4_fname_usr_to_disk() - converts a filename from user space to disk space - */ -int ext4_fname_usr_to_disk(struct inode *inode, - const struct qstr *iname, - struct ext4_str *oname) -{ - int res; - struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info; - - if (iname->len < 3) { - /*Check for . and .. */ - if (iname->name[0] == '.' && - iname->name[iname->len-1] == '.') { - oname->name[0] = '.'; - oname->name[iname->len-1] = '.'; - oname->len = iname->len; - return oname->len; - } - } - if (ci) { - res = ext4_fname_encrypt(inode, iname, oname); - return res; - } - /* Without a proper key, a user is not allowed to modify the filenames - * in a directory. 
Consequently, a user space name cannot be mapped to - * a disk-space name */ - return -EACCES; -} - -int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname, - int lookup, struct ext4_filename *fname) -{ - struct ext4_crypt_info *ci; - int ret = 0, bigname = 0; - - memset(fname, 0, sizeof(struct ext4_filename)); - fname->usr_fname = iname; - - if (!ext4_encrypted_inode(dir) || - ((iname->name[0] == '.') && - ((iname->len == 1) || - ((iname->name[1] == '.') && (iname->len == 2))))) { - fname->disk_name.name = (unsigned char *) iname->name; - fname->disk_name.len = iname->len; - return 0; - } - ret = ext4_get_encryption_info(dir); - if (ret) - return ret; - ci = EXT4_I(dir)->i_crypt_info; - if (ci) { - ret = ext4_fname_crypto_alloc_buffer(dir, iname->len, - &fname->crypto_buf); - if (ret < 0) - return ret; - ret = ext4_fname_encrypt(dir, iname, &fname->crypto_buf); - if (ret < 0) - goto errout; - fname->disk_name.name = fname->crypto_buf.name; - fname->disk_name.len = fname->crypto_buf.len; - return 0; - } - if (!lookup) - return -EACCES; - - /* We don't have the key and we are doing a lookup; decode the - * user-supplied name - */ - if (iname->name[0] == '_') - bigname = 1; - if ((bigname && (iname->len != 33)) || - (!bigname && (iname->len > 43))) - return -ENOENT; - - fname->crypto_buf.name = kmalloc(32, GFP_KERNEL); - if (fname->crypto_buf.name == NULL) - return -ENOMEM; - ret = digest_decode(iname->name + bigname, iname->len - bigname, - fname->crypto_buf.name); - if (ret < 0) { - ret = -ENOENT; - goto errout; - } - fname->crypto_buf.len = ret; - if (bigname) { - memcpy(&fname->hinfo.hash, fname->crypto_buf.name, 4); - memcpy(&fname->hinfo.minor_hash, fname->crypto_buf.name + 4, 4); - } else { - fname->disk_name.name = fname->crypto_buf.name; - fname->disk_name.len = fname->crypto_buf.len; - } - return 0; -errout: - kfree(fname->crypto_buf.name); - fname->crypto_buf.name = NULL; - return ret; -} - -void ext4_fname_free_filename(struct ext4_filename *fname) -{ - kfree(fname->crypto_buf.name); - fname->crypto_buf.name = NULL; - fname->usr_fname = NULL; - fname->disk_name.name = NULL; -} diff --git a/fs/ext4/crypto_key.c b/fs/ext4/crypto_key.c deleted file mode 100644 index 0129d688d1f7..000000000000 --- a/fs/ext4/crypto_key.c +++ /dev/null @@ -1,274 +0,0 @@ -/* - * linux/fs/ext4/crypto_key.c - * - * Copyright (C) 2015, Google, Inc. - * - * This contains encryption key functions for ext4 - * - * Written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar, 2015. - */ - -#include <crypto/skcipher.h> -#include <keys/encrypted-type.h> -#include <keys/user-type.h> -#include <linux/random.h> -#include <linux/scatterlist.h> -#include <uapi/linux/keyctl.h> - -#include "ext4.h" -#include "xattr.h" - -static void derive_crypt_complete(struct crypto_async_request *req, int rc) -{ - struct ext4_completion_result *ecr = req->data; - - if (rc == -EINPROGRESS) - return; - - ecr->res = rc; - complete(&ecr->completion); -} - -/** - * ext4_derive_key_aes() - Derive a key using AES-128-ECB - * @deriving_key: Encryption key used for derivation. - * @source_key: Source key to which to apply derivation. - * @derived_key: Derived key. - * - * Return: Zero on success; non-zero otherwise. 
- */ -static int ext4_derive_key_aes(char deriving_key[EXT4_AES_128_ECB_KEY_SIZE], - char source_key[EXT4_AES_256_XTS_KEY_SIZE], - char derived_key[EXT4_AES_256_XTS_KEY_SIZE]) -{ - int res = 0; - struct skcipher_request *req = NULL; - DECLARE_EXT4_COMPLETION_RESULT(ecr); - struct scatterlist src_sg, dst_sg; - struct crypto_skcipher *tfm = crypto_alloc_skcipher("ecb(aes)", 0, 0); - - if (IS_ERR(tfm)) { - res = PTR_ERR(tfm); - tfm = NULL; - goto out; - } - crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_WEAK_KEY); - req = skcipher_request_alloc(tfm, GFP_NOFS); - if (!req) { - res = -ENOMEM; - goto out; - } - skcipher_request_set_callback(req, - CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, - derive_crypt_complete, &ecr); - res = crypto_skcipher_setkey(tfm, deriving_key, - EXT4_AES_128_ECB_KEY_SIZE); - if (res < 0) - goto out; - sg_init_one(&src_sg, source_key, EXT4_AES_256_XTS_KEY_SIZE); - sg_init_one(&dst_sg, derived_key, EXT4_AES_256_XTS_KEY_SIZE); - skcipher_request_set_crypt(req, &src_sg, &dst_sg, - EXT4_AES_256_XTS_KEY_SIZE, NULL); - res = crypto_skcipher_encrypt(req); - if (res == -EINPROGRESS || res == -EBUSY) { - wait_for_completion(&ecr.completion); - res = ecr.res; - } - -out: - skcipher_request_free(req); - crypto_free_skcipher(tfm); - return res; -} - -void ext4_free_crypt_info(struct ext4_crypt_info *ci) -{ - if (!ci) - return; - - if (ci->ci_keyring_key) - key_put(ci->ci_keyring_key); - crypto_free_skcipher(ci->ci_ctfm); - kmem_cache_free(ext4_crypt_info_cachep, ci); -} - -void ext4_free_encryption_info(struct inode *inode, - struct ext4_crypt_info *ci) -{ - struct ext4_inode_info *ei = EXT4_I(inode); - struct ext4_crypt_info *prev; - - if (ci == NULL) - ci = ACCESS_ONCE(ei->i_crypt_info); - if (ci == NULL) - return; - prev = cmpxchg(&ei->i_crypt_info, ci, NULL); - if (prev != ci) - return; - - ext4_free_crypt_info(ci); -} - -int _ext4_get_encryption_info(struct inode *inode) -{ - struct ext4_inode_info *ei = EXT4_I(inode); - struct ext4_crypt_info *crypt_info; - char full_key_descriptor[EXT4_KEY_DESC_PREFIX_SIZE + - (EXT4_KEY_DESCRIPTOR_SIZE * 2) + 1]; - struct key *keyring_key = NULL; - struct ext4_encryption_key *master_key; - struct ext4_encryption_context ctx; - const struct user_key_payload *ukp; - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - struct crypto_skcipher *ctfm; - const char *cipher_str; - char raw_key[EXT4_MAX_KEY_SIZE]; - char mode; - int res; - - if (!ext4_read_workqueue) { - res = ext4_init_crypto(); - if (res) - return res; - } - -retry: - crypt_info = ACCESS_ONCE(ei->i_crypt_info); - if (crypt_info) { - if (!crypt_info->ci_keyring_key || - key_validate(crypt_info->ci_keyring_key) == 0) - return 0; - ext4_free_encryption_info(inode, crypt_info); - goto retry; - } - - res = ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION, - EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, - &ctx, sizeof(ctx)); - if (res < 0) { - if (!DUMMY_ENCRYPTION_ENABLED(sbi)) - return res; - ctx.contents_encryption_mode = EXT4_ENCRYPTION_MODE_AES_256_XTS; - ctx.filenames_encryption_mode = - EXT4_ENCRYPTION_MODE_AES_256_CTS; - ctx.flags = 0; - } else if (res != sizeof(ctx)) - return -EINVAL; - res = 0; - - crypt_info = kmem_cache_alloc(ext4_crypt_info_cachep, GFP_KERNEL); - if (!crypt_info) - return -ENOMEM; - - crypt_info->ci_flags = ctx.flags; - crypt_info->ci_data_mode = ctx.contents_encryption_mode; - crypt_info->ci_filename_mode = ctx.filenames_encryption_mode; - crypt_info->ci_ctfm = NULL; - crypt_info->ci_keyring_key = NULL; - memcpy(crypt_info->ci_master_key, 
ctx.master_key_descriptor, - sizeof(crypt_info->ci_master_key)); - if (S_ISREG(inode->i_mode)) - mode = crypt_info->ci_data_mode; - else if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) - mode = crypt_info->ci_filename_mode; - else - BUG(); - switch (mode) { - case EXT4_ENCRYPTION_MODE_AES_256_XTS: - cipher_str = "xts(aes)"; - break; - case EXT4_ENCRYPTION_MODE_AES_256_CTS: - cipher_str = "cts(cbc(aes))"; - break; - default: - printk_once(KERN_WARNING - "ext4: unsupported key mode %d (ino %u)\n", - mode, (unsigned) inode->i_ino); - res = -ENOKEY; - goto out; - } - if (DUMMY_ENCRYPTION_ENABLED(sbi)) { - memset(raw_key, 0x42, EXT4_AES_256_XTS_KEY_SIZE); - goto got_key; - } - memcpy(full_key_descriptor, EXT4_KEY_DESC_PREFIX, - EXT4_KEY_DESC_PREFIX_SIZE); - sprintf(full_key_descriptor + EXT4_KEY_DESC_PREFIX_SIZE, - "%*phN", EXT4_KEY_DESCRIPTOR_SIZE, - ctx.master_key_descriptor); - full_key_descriptor[EXT4_KEY_DESC_PREFIX_SIZE + - (2 * EXT4_KEY_DESCRIPTOR_SIZE)] = '\0'; - keyring_key = request_key(&key_type_logon, full_key_descriptor, NULL); - if (IS_ERR(keyring_key)) { - res = PTR_ERR(keyring_key); - keyring_key = NULL; - goto out; - } - crypt_info->ci_keyring_key = keyring_key; - if (keyring_key->type != &key_type_logon) { - printk_once(KERN_WARNING - "ext4: key type must be logon\n"); - res = -ENOKEY; - goto out; - } - down_read(&keyring_key->sem); - ukp = user_key_payload(keyring_key); - if (ukp->datalen != sizeof(struct ext4_encryption_key)) { - res = -EINVAL; - up_read(&keyring_key->sem); - goto out; - } - master_key = (struct ext4_encryption_key *)ukp->data; - BUILD_BUG_ON(EXT4_AES_128_ECB_KEY_SIZE != - EXT4_KEY_DERIVATION_NONCE_SIZE); - if (master_key->size != EXT4_AES_256_XTS_KEY_SIZE) { - printk_once(KERN_WARNING - "ext4: key size incorrect: %d\n", - master_key->size); - res = -ENOKEY; - up_read(&keyring_key->sem); - goto out; - } - res = ext4_derive_key_aes(ctx.nonce, master_key->raw, - raw_key); - up_read(&keyring_key->sem); - if (res) - goto out; -got_key: - ctfm = crypto_alloc_skcipher(cipher_str, 0, 0); - if (!ctfm || IS_ERR(ctfm)) { - res = ctfm ? PTR_ERR(ctfm) : -ENOMEM; - printk(KERN_DEBUG - "%s: error %d (inode %u) allocating crypto tfm\n", - __func__, res, (unsigned) inode->i_ino); - goto out; - } - crypt_info->ci_ctfm = ctfm; - crypto_skcipher_clear_flags(ctfm, ~0); - crypto_tfm_set_flags(crypto_skcipher_tfm(ctfm), - CRYPTO_TFM_REQ_WEAK_KEY); - res = crypto_skcipher_setkey(ctfm, raw_key, - ext4_encryption_key_size(mode)); - if (res) - goto out; - memzero_explicit(raw_key, sizeof(raw_key)); - if (cmpxchg(&ei->i_crypt_info, NULL, crypt_info) != NULL) { - ext4_free_crypt_info(crypt_info); - goto retry; - } - return 0; - -out: - if (res == -ENOKEY) - res = 0; - ext4_free_crypt_info(crypt_info); - memzero_explicit(raw_key, sizeof(raw_key)); - return res; -} - -int ext4_has_encryption_key(struct inode *inode) -{ - struct ext4_inode_info *ei = EXT4_I(inode); - - return (ei->i_crypt_info != NULL); -} diff --git a/fs/ext4/crypto_policy.c b/fs/ext4/crypto_policy.c deleted file mode 100644 index ad050698143f..000000000000 --- a/fs/ext4/crypto_policy.c +++ /dev/null @@ -1,229 +0,0 @@ -/* - * linux/fs/ext4/crypto_policy.c - * - * Copyright (C) 2015, Google, Inc. - * - * This contains encryption policy functions for ext4 - * - * Written by Michael Halcrow, 2015. 
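For the request_key() lookup above to succeed, userspace must first have installed the master key as a "logon" key whose description is the "ext4:" prefix followed by the 16 hex digits the kernel prints with "%*phN". A sketch using libkeyutils, with the payload laid out like the struct ext4_encryption_key this series moves out of ext4 (key material and mode value shown are illustrative):

#include <keyutils.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct ext4_encryption_key {            /* as in the deleted ext4_crypto.h */
        uint32_t mode;
        char raw[64];
        uint32_t size;
} __attribute__((__packed__));

/* hex_desc: 16 hex chars naming the master key (the 8-byte
 * EXT4_KEY_DESCRIPTOR_SIZE descriptor, hex-encoded). */
static key_serial_t add_master_key(const char *hex_desc,
                                   const unsigned char raw[64])
{
        struct ext4_encryption_key payload;
        char desc[sizeof("ext4:") + 16];

        memset(&payload, 0, sizeof(payload));
        payload.mode = 1;               /* EXT4_ENCRYPTION_MODE_AES_256_XTS */
        memcpy(payload.raw, raw, 64);
        payload.size = 64;              /* EXT4_AES_256_XTS_KEY_SIZE */

        snprintf(desc, sizeof(desc), "ext4:%s", hex_desc);
        return add_key("logon", desc, &payload, sizeof(payload),
                       KEY_SPEC_SESSION_KEYRING);
}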
- */ - -#include <linux/random.h> -#include <linux/string.h> -#include <linux/types.h> - -#include "ext4_jbd2.h" -#include "ext4.h" -#include "xattr.h" - -static int ext4_inode_has_encryption_context(struct inode *inode) -{ - int res = ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION, - EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, NULL, 0); - return (res > 0); -} - -/* - * check whether the policy is consistent with the encryption context - * for the inode - */ -static int ext4_is_encryption_context_consistent_with_policy( - struct inode *inode, const struct ext4_encryption_policy *policy) -{ - struct ext4_encryption_context ctx; - int res = ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION, - EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx, - sizeof(ctx)); - if (res != sizeof(ctx)) - return 0; - return (memcmp(ctx.master_key_descriptor, policy->master_key_descriptor, - EXT4_KEY_DESCRIPTOR_SIZE) == 0 && - (ctx.flags == - policy->flags) && - (ctx.contents_encryption_mode == - policy->contents_encryption_mode) && - (ctx.filenames_encryption_mode == - policy->filenames_encryption_mode)); -} - -static int ext4_create_encryption_context_from_policy( - struct inode *inode, const struct ext4_encryption_policy *policy) -{ - struct ext4_encryption_context ctx; - handle_t *handle; - int res, res2; - - res = ext4_convert_inline_data(inode); - if (res) - return res; - - ctx.format = EXT4_ENCRYPTION_CONTEXT_FORMAT_V1; - memcpy(ctx.master_key_descriptor, policy->master_key_descriptor, - EXT4_KEY_DESCRIPTOR_SIZE); - if (!ext4_valid_contents_enc_mode(policy->contents_encryption_mode)) { - printk(KERN_WARNING - "%s: Invalid contents encryption mode %d\n", __func__, - policy->contents_encryption_mode); - return -EINVAL; - } - if (!ext4_valid_filenames_enc_mode(policy->filenames_encryption_mode)) { - printk(KERN_WARNING - "%s: Invalid filenames encryption mode %d\n", __func__, - policy->filenames_encryption_mode); - return -EINVAL; - } - if (policy->flags & ~EXT4_POLICY_FLAGS_VALID) - return -EINVAL; - ctx.contents_encryption_mode = policy->contents_encryption_mode; - ctx.filenames_encryption_mode = policy->filenames_encryption_mode; - ctx.flags = policy->flags; - BUILD_BUG_ON(sizeof(ctx.nonce) != EXT4_KEY_DERIVATION_NONCE_SIZE); - get_random_bytes(ctx.nonce, EXT4_KEY_DERIVATION_NONCE_SIZE); - - handle = ext4_journal_start(inode, EXT4_HT_MISC, - ext4_jbd2_credits_xattr(inode)); - if (IS_ERR(handle)) - return PTR_ERR(handle); - res = ext4_xattr_set(inode, EXT4_XATTR_INDEX_ENCRYPTION, - EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx, - sizeof(ctx), 0); - if (!res) { - ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT); - res = ext4_mark_inode_dirty(handle, inode); - if (res) - EXT4_ERROR_INODE(inode, "Failed to mark inode dirty"); - } - res2 = ext4_journal_stop(handle); - if (!res) - res = res2; - return res; -} - -int ext4_process_policy(const struct ext4_encryption_policy *policy, - struct inode *inode) -{ - if (policy->version != 0) - return -EINVAL; - - if (!ext4_inode_has_encryption_context(inode)) { - if (!S_ISDIR(inode->i_mode)) - return -EINVAL; - if (!ext4_empty_dir(inode)) - return -ENOTEMPTY; - return ext4_create_encryption_context_from_policy(inode, - policy); - } - - if (ext4_is_encryption_context_consistent_with_policy(inode, policy)) - return 0; - - printk(KERN_WARNING "%s: Policy inconsistent with encryption context\n", - __func__); - return -EINVAL; -} - -int ext4_get_policy(struct inode *inode, struct ext4_encryption_policy *policy) -{ - struct ext4_encryption_context ctx; - - int res = ext4_xattr_get(inode, 
EXT4_XATTR_INDEX_ENCRYPTION, - EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, - &ctx, sizeof(ctx)); - if (res != sizeof(ctx)) - return -ENOENT; - if (ctx.format != EXT4_ENCRYPTION_CONTEXT_FORMAT_V1) - return -EINVAL; - policy->version = 0; - policy->contents_encryption_mode = ctx.contents_encryption_mode; - policy->filenames_encryption_mode = ctx.filenames_encryption_mode; - policy->flags = ctx.flags; - memcpy(&policy->master_key_descriptor, ctx.master_key_descriptor, - EXT4_KEY_DESCRIPTOR_SIZE); - return 0; -} - -int ext4_is_child_context_consistent_with_parent(struct inode *parent, - struct inode *child) -{ - struct ext4_crypt_info *parent_ci, *child_ci; - int res; - - if ((parent == NULL) || (child == NULL)) { - pr_err("parent %p child %p\n", parent, child); - WARN_ON(1); /* Should never happen */ - return 0; - } - /* no restrictions if the parent directory is not encrypted */ - if (!ext4_encrypted_inode(parent)) - return 1; - /* if the child directory is not encrypted, this is always a problem */ - if (!ext4_encrypted_inode(child)) - return 0; - res = ext4_get_encryption_info(parent); - if (res) - return 0; - res = ext4_get_encryption_info(child); - if (res) - return 0; - parent_ci = EXT4_I(parent)->i_crypt_info; - child_ci = EXT4_I(child)->i_crypt_info; - if (!parent_ci && !child_ci) - return 1; - if (!parent_ci || !child_ci) - return 0; - - return (memcmp(parent_ci->ci_master_key, - child_ci->ci_master_key, - EXT4_KEY_DESCRIPTOR_SIZE) == 0 && - (parent_ci->ci_data_mode == child_ci->ci_data_mode) && - (parent_ci->ci_filename_mode == child_ci->ci_filename_mode) && - (parent_ci->ci_flags == child_ci->ci_flags)); -} - -/** - * ext4_inherit_context() - Sets a child context from its parent - * @parent: Parent inode from which the context is inherited. - * @child: Child inode that inherits the context from @parent. 
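ext4_get_policy() above is what services the GET_ENCRYPTION_POLICY ioctl: it hands back the on-disk context minus the nonce. The userspace side looks roughly like this, with the struct layout and ioctl number taken from the pre-fscrypt ext4 ABI being deleted in this diff (the directory path is illustrative):

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

struct ext4_encryption_policy {         /* as in the deleted ext4_crypto.h */
        char version;
        char contents_encryption_mode;
        char filenames_encryption_mode;
        char flags;
        char master_key_descriptor[8];
} __attribute__((__packed__));

#define EXT4_IOC_GET_ENCRYPTION_POLICY \
        _IOW('f', 21, struct ext4_encryption_policy)

int main(void)
{
        struct ext4_encryption_policy p;
        int fd = open("/mnt/scratch/encrypted_dir", O_RDONLY);

        if (fd < 0 || ioctl(fd, EXT4_IOC_GET_ENCRYPTION_POLICY, &p) != 0)
                return 1;
        printf("contents mode %d, filenames mode %d, flags %#x\n",
               p.contents_encryption_mode, p.filenames_encryption_mode,
               p.flags);
        close(fd);
        return 0;
}

After this series the same call goes through the generic FS_IOC_GET_ENCRYPTION_POLICY number instead, as the ext4.h hunk below shows.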
- * - * Return: Zero on success, non-zero otherwise - */ -int ext4_inherit_context(struct inode *parent, struct inode *child) -{ - struct ext4_encryption_context ctx; - struct ext4_crypt_info *ci; - int res; - - res = ext4_get_encryption_info(parent); - if (res < 0) - return res; - ci = EXT4_I(parent)->i_crypt_info; - if (ci == NULL) - return -ENOKEY; - - ctx.format = EXT4_ENCRYPTION_CONTEXT_FORMAT_V1; - if (DUMMY_ENCRYPTION_ENABLED(EXT4_SB(parent->i_sb))) { - ctx.contents_encryption_mode = EXT4_ENCRYPTION_MODE_AES_256_XTS; - ctx.filenames_encryption_mode = - EXT4_ENCRYPTION_MODE_AES_256_CTS; - ctx.flags = 0; - memset(ctx.master_key_descriptor, 0x42, - EXT4_KEY_DESCRIPTOR_SIZE); - res = 0; - } else { - ctx.contents_encryption_mode = ci->ci_data_mode; - ctx.filenames_encryption_mode = ci->ci_filename_mode; - ctx.flags = ci->ci_flags; - memcpy(ctx.master_key_descriptor, ci->ci_master_key, - EXT4_KEY_DESCRIPTOR_SIZE); - } - get_random_bytes(ctx.nonce, EXT4_KEY_DERIVATION_NONCE_SIZE); - res = ext4_xattr_set(child, EXT4_XATTR_INDEX_ENCRYPTION, - EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx, - sizeof(ctx), 0); - if (!res) { - ext4_set_inode_flag(child, EXT4_INODE_ENCRYPT); - ext4_clear_inode_state(child, EXT4_STATE_MAY_INLINE_DATA); - res = ext4_get_encryption_info(child); - } - return res; -} diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 5d00bf060254..67415e0e6af0 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -109,10 +109,10 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) struct super_block *sb = inode->i_sb; struct buffer_head *bh = NULL; int dir_has_error = 0; - struct ext4_str fname_crypto_str = {.name = NULL, .len = 0}; + struct fscrypt_str fstr = FSTR_INIT(NULL, 0); if (ext4_encrypted_inode(inode)) { - err = ext4_get_encryption_info(inode); + err = fscrypt_get_encryption_info(inode); if (err && err != -ENOKEY) return err; } @@ -139,8 +139,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) } if (ext4_encrypted_inode(inode)) { - err = ext4_fname_crypto_alloc_buffer(inode, EXT4_NAME_LEN, - &fname_crypto_str); + err = fscrypt_fname_alloc_buffer(inode, EXT4_NAME_LEN, &fstr); if (err < 0) return err; } @@ -150,6 +149,11 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) while (ctx->pos < inode->i_size) { struct ext4_map_blocks map; + if (fatal_signal_pending(current)) { + err = -ERESTARTSYS; + goto errout; + } + cond_resched(); map.m_lblk = ctx->pos >> EXT4_BLOCK_SIZE_BITS(sb); map.m_len = 1; err = ext4_map_blocks(NULL, inode, &map, 0); @@ -248,16 +252,19 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) get_dtype(sb, de->file_type))) goto done; } else { - int save_len = fname_crypto_str.len; + int save_len = fstr.len; + struct fscrypt_str de_name = + FSTR_INIT(de->name, + de->name_len); /* Directory is encrypted */ - err = ext4_fname_disk_to_usr(inode, - NULL, de, &fname_crypto_str); - fname_crypto_str.len = save_len; + err = fscrypt_fname_disk_to_usr(inode, + 0, 0, &de_name, &fstr); + fstr.len = save_len; if (err < 0) goto errout; if (!dir_emit(ctx, - fname_crypto_str.name, err, + fstr.name, err, le32_to_cpu(de->inode), get_dtype(sb, de->file_type))) goto done; @@ -276,7 +283,7 @@ done: err = 0; errout: #ifdef CONFIG_EXT4_FS_ENCRYPTION - ext4_fname_crypto_free_buffer(&fname_crypto_str); + fscrypt_fname_free_buffer(&fstr); #endif brelse(bh); return err; @@ -427,7 +434,7 @@ void ext4_htree_free_dir_info(struct dir_private_info *p) int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, __u32 
minor_hash, struct ext4_dir_entry_2 *dirent, - struct ext4_str *ent_name) + struct fscrypt_str *ent_name) { struct rb_node **p, *parent = NULL; struct fname *fname, *new_fn; @@ -604,7 +611,7 @@ finished: static int ext4_dir_open(struct inode * inode, struct file * filp) { if (ext4_encrypted_inode(inode)) - return ext4_get_encryption_info(inode) ? -EACCES : 0; + return fscrypt_get_encryption_info(inode) ? -EACCES : 0; return 0; } diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 72f4c9e00e97..ea31931386ec 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -32,7 +32,9 @@ #include <linux/percpu_counter.h> #include <linux/ratelimit.h> #include <crypto/hash.h> +#include <linux/fscrypto.h> #include <linux/falloc.h> +#include <linux/percpu-rwsem.h> #ifdef __KERNEL__ #include <linux/compat.h> #endif @@ -581,6 +583,9 @@ enum { #define EXT4_GET_BLOCKS_ZERO 0x0200 #define EXT4_GET_BLOCKS_CREATE_ZERO (EXT4_GET_BLOCKS_CREATE |\ EXT4_GET_BLOCKS_ZERO) + /* Caller will submit data before dropping transaction handle. This + * allows jbd2 to avoid submitting data before commit. */ +#define EXT4_GET_BLOCKS_IO_SUBMIT 0x0400 /* * The bit position of these flags must not overlap with any of the @@ -604,15 +609,6 @@ enum { #define EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER 0x0010 #define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020 -/* Encryption algorithms */ -#define EXT4_ENCRYPTION_MODE_INVALID 0 -#define EXT4_ENCRYPTION_MODE_AES_256_XTS 1 -#define EXT4_ENCRYPTION_MODE_AES_256_GCM 2 -#define EXT4_ENCRYPTION_MODE_AES_256_CBC 3 -#define EXT4_ENCRYPTION_MODE_AES_256_CTS 4 - -#include "ext4_crypto.h" - /* * ioctl commands */ @@ -634,9 +630,9 @@ enum { #define EXT4_IOC_RESIZE_FS _IOW('f', 16, __u64) #define EXT4_IOC_SWAP_BOOT _IO('f', 17) #define EXT4_IOC_PRECACHE_EXTENTS _IO('f', 18) -#define EXT4_IOC_SET_ENCRYPTION_POLICY _IOR('f', 19, struct ext4_encryption_policy) -#define EXT4_IOC_GET_ENCRYPTION_PWSALT _IOW('f', 20, __u8[16]) -#define EXT4_IOC_GET_ENCRYPTION_POLICY _IOW('f', 21, struct ext4_encryption_policy) +#define EXT4_IOC_SET_ENCRYPTION_POLICY FS_IOC_SET_ENCRYPTION_POLICY +#define EXT4_IOC_GET_ENCRYPTION_PWSALT FS_IOC_GET_ENCRYPTION_PWSALT +#define EXT4_IOC_GET_ENCRYPTION_POLICY FS_IOC_GET_ENCRYPTION_POLICY #ifndef FS_IOC_FSGETXATTR /* Until the uapi changes get merged for project quota... */ @@ -1078,10 +1074,6 @@ struct ext4_inode_info { /* Precomputed uuid+inum+igen checksum for seeding inode checksums */ __u32 i_csum_seed; -#ifdef CONFIG_EXT4_FS_ENCRYPTION - /* Encryption params */ - struct ext4_crypt_info *i_crypt_info; -#endif kprojid_t i_projid; }; @@ -1340,6 +1332,11 @@ struct ext4_super_block { /* Number of quota types we support */ #define EXT4_MAXQUOTAS 3 +#ifdef CONFIG_EXT4_FS_ENCRYPTION +#define EXT4_KEY_DESC_PREFIX "ext4:" +#define EXT4_KEY_DESC_PREFIX_SIZE 5 +#endif + /* * fourth extended-fs super-block data in memory */ @@ -1426,6 +1423,7 @@ struct ext4_sb_info { unsigned short *s_mb_offsets; unsigned int *s_mb_maxs; unsigned int s_group_info_size; + unsigned int s_mb_free_pending; /* tunables */ unsigned long s_stripe; @@ -1505,6 +1503,15 @@ struct ext4_sb_info { struct ratelimit_state s_err_ratelimit_state; struct ratelimit_state s_warning_ratelimit_state; struct ratelimit_state s_msg_ratelimit_state; + + /* Barrier between changing inodes' journal flags and writepages ops. 
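The new s_journal_flag_rwsem declared just below is taken for read around the whole of ext4_writepages() (visible later in this diff) and, presumably, for write when an inode's data-journalling flag is switched. A userspace model of that barrier pattern, with a pthread rwlock standing in for the kernel's percpu_rw_semaphore:

#include <pthread.h>

static pthread_rwlock_t journal_flag_rwsem = PTHREAD_RWLOCK_INITIALIZER;

/* Many writeback threads may run concurrently under the read lock... */
static void writepages(void)
{
        pthread_rwlock_rdlock(&journal_flag_rwsem);
        /* ... submit page IO under the current journalling mode ... */
        pthread_rwlock_unlock(&journal_flag_rwsem);
}

/* ...but flipping the journalling mode must wait out every in-flight
 * writepages call and block new ones until the switch is complete. */
static void change_journal_flag(void)
{
        pthread_rwlock_wrlock(&journal_flag_rwsem);
        /* ... safe: no writeback is running with the old mode ... */
        pthread_rwlock_unlock(&journal_flag_rwsem);
}

A percpu rwsem makes the read side nearly free, which matters here because every writepages call pays it while the write side is rare.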
*/ + struct percpu_rw_semaphore s_journal_flag_rwsem; + + /* Encryption support */ +#ifdef CONFIG_EXT4_FS_ENCRYPTION + u8 key_prefix[EXT4_KEY_DESC_PREFIX_SIZE]; + u8 key_prefix_size; +#endif }; static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) @@ -1549,7 +1556,6 @@ enum { EXT4_STATE_DIOREAD_LOCK, /* Disable support for dio read nolocking */ EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */ - EXT4_STATE_ORDERED_MODE, /* data=ordered mode */ EXT4_STATE_EXT_PRECACHED, /* extents have been precached */ }; @@ -1604,15 +1610,6 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) /* * Returns true if the inode is inode is encrypted */ -static inline int ext4_encrypted_inode(struct inode *inode) -{ -#ifdef CONFIG_EXT4_FS_ENCRYPTION - return ext4_test_inode_flag(inode, EXT4_INODE_ENCRYPT); -#else - return 0; -#endif -} - #define NEXT_ORPHAN(inode) EXT4_I(inode)->i_dtime /* @@ -2076,10 +2073,10 @@ struct dx_hash_info struct ext4_filename { const struct qstr *usr_fname; - struct ext4_str disk_name; + struct fscrypt_str disk_name; struct dx_hash_info hinfo; #ifdef CONFIG_EXT4_FS_ENCRYPTION - struct ext4_str crypto_buf; + struct fscrypt_str crypto_buf; #endif }; @@ -2290,132 +2287,82 @@ extern unsigned ext4_free_clusters_after_init(struct super_block *sb, struct ext4_group_desc *gdp); ext4_fsblk_t ext4_inode_to_goal_block(struct inode *); -/* crypto_policy.c */ -int ext4_is_child_context_consistent_with_parent(struct inode *parent, - struct inode *child); -int ext4_inherit_context(struct inode *parent, struct inode *child); -void ext4_to_hex(char *dst, char *src, size_t src_size); -int ext4_process_policy(const struct ext4_encryption_policy *policy, - struct inode *inode); -int ext4_get_policy(struct inode *inode, - struct ext4_encryption_policy *policy); - -/* crypto.c */ -extern struct kmem_cache *ext4_crypt_info_cachep; -bool ext4_valid_contents_enc_mode(uint32_t mode); -uint32_t ext4_validate_encryption_key_size(uint32_t mode, uint32_t size); -extern struct workqueue_struct *ext4_read_workqueue; -struct ext4_crypto_ctx *ext4_get_crypto_ctx(struct inode *inode, - gfp_t gfp_flags); -void ext4_release_crypto_ctx(struct ext4_crypto_ctx *ctx); -void ext4_restore_control_page(struct page *data_page); -struct page *ext4_encrypt(struct inode *inode, - struct page *plaintext_page, - gfp_t gfp_flags); -int ext4_decrypt(struct page *page); -int ext4_encrypted_zeroout(struct inode *inode, ext4_lblk_t lblk, - ext4_fsblk_t pblk, ext4_lblk_t len); -extern const struct dentry_operations ext4_encrypted_d_ops; - -#ifdef CONFIG_EXT4_FS_ENCRYPTION -int ext4_init_crypto(void); -void ext4_exit_crypto(void); static inline int ext4_sb_has_crypto(struct super_block *sb) { return ext4_has_feature_encrypt(sb); } -#else -static inline int ext4_init_crypto(void) { return 0; } -static inline void ext4_exit_crypto(void) { } -static inline int ext4_sb_has_crypto(struct super_block *sb) + +static inline bool ext4_encrypted_inode(struct inode *inode) { - return 0; + return ext4_test_inode_flag(inode, EXT4_INODE_ENCRYPT); } -#endif -/* crypto_fname.c */ -bool ext4_valid_filenames_enc_mode(uint32_t mode); -u32 ext4_fname_crypto_round_up(u32 size, u32 blksize); -unsigned ext4_fname_encrypted_size(struct inode *inode, u32 ilen); -int ext4_fname_crypto_alloc_buffer(struct inode *inode, - u32 ilen, struct ext4_str *crypto_str); -int _ext4_fname_disk_to_usr(struct inode *inode, - struct dx_hash_info *hinfo, - const struct ext4_str *iname, - struct ext4_str *oname); -int 
ext4_fname_disk_to_usr(struct inode *inode, - struct dx_hash_info *hinfo, - const struct ext4_dir_entry_2 *de, - struct ext4_str *oname); -int ext4_fname_usr_to_disk(struct inode *inode, - const struct qstr *iname, - struct ext4_str *oname); #ifdef CONFIG_EXT4_FS_ENCRYPTION -void ext4_fname_crypto_free_buffer(struct ext4_str *crypto_str); -int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname, - int lookup, struct ext4_filename *fname); -void ext4_fname_free_filename(struct ext4_filename *fname); -#else -static inline -int ext4_setup_fname_crypto(struct inode *inode) -{ - return 0; -} -static inline void ext4_fname_crypto_free_buffer(struct ext4_str *p) { } static inline int ext4_fname_setup_filename(struct inode *dir, - const struct qstr *iname, - int lookup, struct ext4_filename *fname) + const struct qstr *iname, + int lookup, struct ext4_filename *fname) { - fname->usr_fname = iname; - fname->disk_name.name = (unsigned char *) iname->name; - fname->disk_name.len = iname->len; - return 0; -} -static inline void ext4_fname_free_filename(struct ext4_filename *fname) { } -#endif - + struct fscrypt_name name; + int err; -/* crypto_key.c */ -void ext4_free_crypt_info(struct ext4_crypt_info *ci); -void ext4_free_encryption_info(struct inode *inode, struct ext4_crypt_info *ci); -int _ext4_get_encryption_info(struct inode *inode); + memset(fname, 0, sizeof(struct ext4_filename)); -#ifdef CONFIG_EXT4_FS_ENCRYPTION -int ext4_has_encryption_key(struct inode *inode); + err = fscrypt_setup_filename(dir, iname, lookup, &name); -static inline int ext4_get_encryption_info(struct inode *inode) -{ - struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info; - - if (!ci || - (ci->ci_keyring_key && - (ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) | - (1 << KEY_FLAG_REVOKED) | - (1 << KEY_FLAG_DEAD))))) - return _ext4_get_encryption_info(inode); - return 0; + fname->usr_fname = name.usr_fname; + fname->disk_name = name.disk_name; + fname->hinfo.hash = name.hash; + fname->hinfo.minor_hash = name.minor_hash; + fname->crypto_buf = name.crypto_buf; + return err; } -static inline struct ext4_crypt_info *ext4_encryption_info(struct inode *inode) +static inline void ext4_fname_free_filename(struct ext4_filename *fname) { - return EXT4_I(inode)->i_crypt_info; -} + struct fscrypt_name name; -#else -static inline int ext4_has_encryption_key(struct inode *inode) -{ - return 0; + name.crypto_buf = fname->crypto_buf; + fscrypt_free_filename(&name); + + fname->crypto_buf.name = NULL; + fname->usr_fname = NULL; + fname->disk_name.name = NULL; } -static inline int ext4_get_encryption_info(struct inode *inode) +#else +static inline int ext4_fname_setup_filename(struct inode *dir, + const struct qstr *iname, + int lookup, struct ext4_filename *fname) { + fname->usr_fname = iname; + fname->disk_name.name = (unsigned char *) iname->name; + fname->disk_name.len = iname->len; return 0; } -static inline struct ext4_crypt_info *ext4_encryption_info(struct inode *inode) -{ - return NULL; -} -#endif +static inline void ext4_fname_free_filename(struct ext4_filename *fname) { } +#define fscrypt_set_d_op(i) +#define fscrypt_get_ctx fscrypt_notsupp_get_ctx +#define fscrypt_release_ctx fscrypt_notsupp_release_ctx +#define fscrypt_encrypt_page fscrypt_notsupp_encrypt_page +#define fscrypt_decrypt_page fscrypt_notsupp_decrypt_page +#define fscrypt_decrypt_bio_pages fscrypt_notsupp_decrypt_bio_pages +#define fscrypt_pullback_bio_page fscrypt_notsupp_pullback_bio_page +#define fscrypt_restore_control_page 
fscrypt_notsupp_restore_control_page +#define fscrypt_zeroout_range fscrypt_notsupp_zeroout_range +#define fscrypt_process_policy fscrypt_notsupp_process_policy +#define fscrypt_get_policy fscrypt_notsupp_get_policy +#define fscrypt_has_permitted_context fscrypt_notsupp_has_permitted_context +#define fscrypt_inherit_context fscrypt_notsupp_inherit_context +#define fscrypt_get_encryption_info fscrypt_notsupp_get_encryption_info +#define fscrypt_put_encryption_info fscrypt_notsupp_put_encryption_info +#define fscrypt_setup_filename fscrypt_notsupp_setup_filename +#define fscrypt_free_filename fscrypt_notsupp_free_filename +#define fscrypt_fname_encrypted_size fscrypt_notsupp_fname_encrypted_size +#define fscrypt_fname_alloc_buffer fscrypt_notsupp_fname_alloc_buffer +#define fscrypt_fname_free_buffer fscrypt_notsupp_fname_free_buffer +#define fscrypt_fname_disk_to_usr fscrypt_notsupp_fname_disk_to_usr +#define fscrypt_fname_usr_to_disk fscrypt_notsupp_fname_usr_to_disk +#endif /* dir.c */ extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *, @@ -2429,7 +2376,7 @@ extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *, extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, __u32 minor_hash, struct ext4_dir_entry_2 *dirent, - struct ext4_str *ent_name); + struct fscrypt_str *ent_name); extern void ext4_htree_free_dir_info(struct dir_private_info *p); extern int ext4_find_dest_de(struct inode *dir, struct inode *inode, struct buffer_head *bh, @@ -2521,8 +2468,8 @@ struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int); struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int); int ext4_get_block_unwritten(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); -int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create); +int ext4_dax_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create); int ext4_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); int ext4_dio_get_block(struct inode *inode, sector_t iblock, @@ -2581,7 +2528,6 @@ extern int ext4_get_next_extent(struct inode *inode, ext4_lblk_t lblk, /* indirect.c */ extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags); -extern ssize_t ext4_ind_direct_IO(struct kiocb *iocb, struct iov_iter *iter); extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock); extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks); extern void ext4_ind_truncate(handle_t *, struct inode *inode); @@ -2618,7 +2564,7 @@ extern int ext4_generic_delete_entry(handle_t *handle, void *entry_buf, int buf_size, int csum_size); -extern int ext4_empty_dir(struct inode *inode); +extern bool ext4_empty_dir(struct inode *inode); /* resize.c */ extern int ext4_group_add(struct super_block *sb, @@ -3100,7 +3046,7 @@ extern int ext4_delete_inline_entry(handle_t *handle, struct ext4_dir_entry_2 *de_del, struct buffer_head *bh, int *has_inline_data); -extern int empty_inline_dir(struct inode *dir, int *has_inline_data); +extern bool empty_inline_dir(struct inode *dir, int *has_inline_data); extern struct buffer_head *ext4_get_first_inline_block(struct inode *inode, struct ext4_dir_entry_2 **parent_de, int *retval); @@ -3329,6 +3275,13 @@ static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end) } } +static inline bool 
ext4_aligned_io(struct inode *inode, loff_t off, loff_t len) +{ + int blksize = 1 << inode->i_blkbits; + + return IS_ALIGNED(off, blksize) && IS_ALIGNED(len, blksize); +} + #endif /* __KERNEL__ */ #define EFSBADCRC EBADMSG /* Bad CRC detected */ diff --git a/fs/ext4/ext4_crypto.h b/fs/ext4/ext4_crypto.h deleted file mode 100644 index 1f73c29717e1..000000000000 --- a/fs/ext4/ext4_crypto.h +++ /dev/null @@ -1,159 +0,0 @@ -/* - * linux/fs/ext4/ext4_crypto.h - * - * Copyright (C) 2015, Google, Inc. - * - * This contains encryption header content for ext4 - * - * Written by Michael Halcrow, 2015. - */ - -#ifndef _EXT4_CRYPTO_H -#define _EXT4_CRYPTO_H - -#include <linux/fs.h> - -#define EXT4_KEY_DESCRIPTOR_SIZE 8 - -/* Policy provided via an ioctl on the topmost directory */ -struct ext4_encryption_policy { - char version; - char contents_encryption_mode; - char filenames_encryption_mode; - char flags; - char master_key_descriptor[EXT4_KEY_DESCRIPTOR_SIZE]; -} __attribute__((__packed__)); - -#define EXT4_ENCRYPTION_CONTEXT_FORMAT_V1 1 -#define EXT4_KEY_DERIVATION_NONCE_SIZE 16 - -#define EXT4_POLICY_FLAGS_PAD_4 0x00 -#define EXT4_POLICY_FLAGS_PAD_8 0x01 -#define EXT4_POLICY_FLAGS_PAD_16 0x02 -#define EXT4_POLICY_FLAGS_PAD_32 0x03 -#define EXT4_POLICY_FLAGS_PAD_MASK 0x03 -#define EXT4_POLICY_FLAGS_VALID 0x03 - -/** - * Encryption context for inode - * - * Protector format: - * 1 byte: Protector format (1 = this version) - * 1 byte: File contents encryption mode - * 1 byte: File names encryption mode - * 1 byte: Reserved - * 8 bytes: Master Key descriptor - * 16 bytes: Encryption Key derivation nonce - */ -struct ext4_encryption_context { - char format; - char contents_encryption_mode; - char filenames_encryption_mode; - char flags; - char master_key_descriptor[EXT4_KEY_DESCRIPTOR_SIZE]; - char nonce[EXT4_KEY_DERIVATION_NONCE_SIZE]; -} __attribute__((__packed__)); - -/* Encryption parameters */ -#define EXT4_XTS_TWEAK_SIZE 16 -#define EXT4_AES_128_ECB_KEY_SIZE 16 -#define EXT4_AES_256_GCM_KEY_SIZE 32 -#define EXT4_AES_256_CBC_KEY_SIZE 32 -#define EXT4_AES_256_CTS_KEY_SIZE 32 -#define EXT4_AES_256_XTS_KEY_SIZE 64 -#define EXT4_MAX_KEY_SIZE 64 - -#define EXT4_KEY_DESC_PREFIX "ext4:" -#define EXT4_KEY_DESC_PREFIX_SIZE 5 - -/* This is passed in from userspace into the kernel keyring */ -struct ext4_encryption_key { - __u32 mode; - char raw[EXT4_MAX_KEY_SIZE]; - __u32 size; -} __attribute__((__packed__)); - -struct ext4_crypt_info { - char ci_data_mode; - char ci_filename_mode; - char ci_flags; - struct crypto_skcipher *ci_ctfm; - struct key *ci_keyring_key; - char ci_master_key[EXT4_KEY_DESCRIPTOR_SIZE]; -}; - -#define EXT4_CTX_REQUIRES_FREE_ENCRYPT_FL 0x00000001 -#define EXT4_WRITE_PATH_FL 0x00000002 - -struct ext4_crypto_ctx { - union { - struct { - struct page *bounce_page; /* Ciphertext page */ - struct page *control_page; /* Original page */ - } w; - struct { - struct bio *bio; - struct work_struct work; - } r; - struct list_head free_list; /* Free list */ - }; - char flags; /* Flags */ - char mode; /* Encryption mode for tfm */ -}; - -struct ext4_completion_result { - struct completion completion; - int res; -}; - -#define DECLARE_EXT4_COMPLETION_RESULT(ecr) \ - struct ext4_completion_result ecr = { \ - COMPLETION_INITIALIZER((ecr).completion), 0 } - -static inline int ext4_encryption_key_size(int mode) -{ - switch (mode) { - case EXT4_ENCRYPTION_MODE_AES_256_XTS: - return EXT4_AES_256_XTS_KEY_SIZE; - case EXT4_ENCRYPTION_MODE_AES_256_GCM: - return EXT4_AES_256_GCM_KEY_SIZE; - case 
EXT4_ENCRYPTION_MODE_AES_256_CBC: - return EXT4_AES_256_CBC_KEY_SIZE; - case EXT4_ENCRYPTION_MODE_AES_256_CTS: - return EXT4_AES_256_CTS_KEY_SIZE; - default: - BUG(); - } - return 0; -} - -#define EXT4_FNAME_NUM_SCATTER_ENTRIES 4 -#define EXT4_CRYPTO_BLOCK_SIZE 16 -#define EXT4_FNAME_CRYPTO_DIGEST_SIZE 32 - -struct ext4_str { - unsigned char *name; - u32 len; -}; - -/** - * For encrypted symlinks, the ciphertext length is stored at the beginning - * of the string in little-endian format. - */ -struct ext4_encrypted_symlink_data { - __le16 len; - char encrypted_path[1]; -} __attribute__((__packed__)); - -/** - * This function is used to calculate the disk space required to - * store a filename of length l in encrypted symlink format. - */ -static inline u32 encrypted_symlink_data_len(u32 l) -{ - if (l < EXT4_CRYPTO_BLOCK_SIZE) - l = EXT4_CRYPTO_BLOCK_SIZE; - return (l + sizeof(struct ext4_encrypted_symlink_data) - 1); -} - -#endif /* _EXT4_CRYPTO_H */ diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 5f5846211095..b1d52c14098e 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -175,6 +175,13 @@ struct ext4_journal_cb_entry { * There is no guaranteed calling order of multiple registered callbacks on * the same transaction. */ +static inline void _ext4_journal_callback_add(handle_t *handle, + struct ext4_journal_cb_entry *jce) +{ + /* Add the jce to transaction's private list */ + list_add_tail(&jce->jce_list, &handle->h_transaction->t_private_list); +} + static inline void ext4_journal_callback_add(handle_t *handle, void (*func)(struct super_block *sb, struct ext4_journal_cb_entry *jce, @@ -187,10 +194,11 @@ static inline void ext4_journal_callback_add(handle_t *handle, /* Add the jce to transaction's private list */ jce->jce_func = func; spin_lock(&sbi->s_md_lock); - list_add_tail(&jce->jce_list, &handle->h_transaction->t_private_list); + _ext4_journal_callback_add(handle, jce); spin_unlock(&sbi->s_md_lock); } + /** * ext4_journal_callback_del: delete a registered callback * @handle: active journal transaction handle on which callback was registered @@ -359,10 +367,21 @@ static inline int ext4_journal_force_commit(journal_t *journal) return 0; } -static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode) +static inline int ext4_jbd2_inode_add_write(handle_t *handle, + struct inode *inode) +{ + if (ext4_handle_valid(handle)) + return jbd2_journal_inode_add_write(handle, + EXT4_I(inode)->jinode); + return 0; +} + +static inline int ext4_jbd2_inode_add_wait(handle_t *handle, + struct inode *inode) { if (ext4_handle_valid(handle)) - return jbd2_journal_file_inode(handle, EXT4_I(inode)->jinode); + return jbd2_journal_inode_add_wait(handle, + EXT4_I(inode)->jinode); return 0; } diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 95bf4679ac54..d7ccb7f51dfc 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -120,9 +120,14 @@ static int ext4_ext_truncate_extend_restart(handle_t *handle, if (!ext4_handle_valid(handle)) return 0; - if (handle->h_buffer_credits > needed) + if (handle->h_buffer_credits >= needed) return 0; - err = ext4_journal_extend(handle, needed); + /* + * If we need to extend the journal get a few extra blocks + * while we're at it for efficiency's sake. 
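Among the definitions the ext4_crypto.h deletion above drops is encrypted_symlink_data_len(), whose arithmetic is easy to sanity-check standalone: the __le16 header costs sizeof(struct) - 1 == 2 bytes because encrypted_path[1] overlaps the first ciphertext byte, and the ciphertext never shrinks below one 16-byte AES block. A quick check, with the structure mirrored from the deleted header:

#include <assert.h>
#include <stdint.h>

struct enc_symlink_data {               /* ext4_encrypted_symlink_data */
        uint16_t len;
        char encrypted_path[1];
} __attribute__((__packed__));

static uint32_t enc_symlink_data_len(uint32_t l)
{
        if (l < 16)                     /* EXT4_CRYPTO_BLOCK_SIZE */
                l = 16;
        return l + sizeof(struct enc_symlink_data) - 1;
}

int main(void)
{
        assert(enc_symlink_data_len(1) == 18);  /* padded up to one block */
        assert(enc_symlink_data_len(16) == 18);
        assert(enc_symlink_data_len(40) == 42); /* 2-byte header + text */
        return 0;
}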
+ */ + needed += 3; + err = ext4_journal_extend(handle, needed - handle->h_buffer_credits); if (err <= 0) return err; err = ext4_truncate_restart_trans(handle, inode, needed); @@ -376,9 +381,13 @@ static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext) ext4_fsblk_t block = ext4_ext_pblock(ext); int len = ext4_ext_get_actual_len(ext); ext4_lblk_t lblock = le32_to_cpu(ext->ee_block); - ext4_lblk_t last = lblock + len - 1; - if (len == 0 || lblock > last) + /* + * We allow neither: + * - zero length + * - overflow/wrap-around + */ + if (lblock + len <= lblock) return 0; return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len); } @@ -469,6 +478,10 @@ static int __ext4_ext_check(const char *function, unsigned int line, error_msg = "invalid extent entries"; goto corrupted; } + if (unlikely(depth > 32)) { + error_msg = "too large eh_depth"; + goto corrupted; + } /* Verify checksum on non-root extent tree nodes */ if (ext_depth(inode) != depth && !ext4_extent_block_csum_verify(inode, eh)) { @@ -907,13 +920,6 @@ ext4_find_extent(struct inode *inode, ext4_lblk_t block, eh = ext_block_hdr(bh); ppos++; - if (unlikely(ppos > depth)) { - put_bh(bh); - EXT4_ERROR_INODE(inode, - "ppos %d > depth %d", ppos, depth); - ret = -EFSCORRUPTED; - goto err; - } path[ppos].p_bh = bh; path[ppos].p_hdr = eh; } @@ -2583,7 +2589,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, } } else ext4_error(sbi->s_sb, "strange request: removal(2) " - "%u-%u from %u:%u\n", + "%u-%u from %u:%u", from, to, le32_to_cpu(ex->ee_block), ee_len); return 0; } @@ -3738,7 +3744,7 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle, if (ee_block != map->m_lblk || ee_len > map->m_len) { #ifdef EXT4_DEBUG ext4_warning("Inode (%ld) finished: extent logical block %llu," - " len %u; IO logical block %llu, len %u\n", + " len %u; IO logical block %llu, len %u", inode->i_ino, (unsigned long long)ee_block, ee_len, (unsigned long long)map->m_lblk, map->m_len); #endif diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index e38b987ac7f5..37e059202cd2 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -707,7 +707,7 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, (status & EXTENT_STATUS_WRITTEN)) { ext4_warning(inode->i_sb, "Inserting extent [%u/%u] as " " delayed and written which can potentially " - " cause data loss.\n", lblk, len); + " cause data loss.", lblk, len); WARN_ON(1); } diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 00ff6912adb3..261ac3734c58 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -202,7 +202,7 @@ static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) if (IS_ERR(handle)) result = VM_FAULT_SIGBUS; else - result = __dax_fault(vma, vmf, ext4_dax_mmap_get_block, NULL); + result = dax_fault(vma, vmf, ext4_dax_get_block); if (write) { if (!IS_ERR(handle)) @@ -237,8 +237,8 @@ static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, if (IS_ERR(handle)) result = VM_FAULT_SIGBUS; else - result = __dax_pmd_fault(vma, addr, pmd, flags, - ext4_dax_mmap_get_block, NULL); + result = dax_pmd_fault(vma, addr, pmd, flags, + ext4_dax_get_block); if (write) { if (!IS_ERR(handle)) @@ -303,10 +303,10 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) struct inode *inode = file->f_mapping->host; if (ext4_encrypted_inode(inode)) { - int err = ext4_get_encryption_info(inode); + int err = fscrypt_get_encryption_info(inode); if (err) return 0; - if 
(ext4_encryption_info(inode) == NULL) + if (!fscrypt_has_encryption_key(inode)) return -ENOKEY; } file_accessed(file); @@ -362,18 +362,18 @@ static int ext4_file_open(struct inode * inode, struct file * filp) } } if (ext4_encrypted_inode(inode)) { - ret = ext4_get_encryption_info(inode); + ret = fscrypt_get_encryption_info(inode); if (ret) return -EACCES; - if (ext4_encryption_info(inode) == NULL) + if (!fscrypt_has_encryption_key(inode)) return -ENOKEY; } dir = dget_parent(file_dentry(filp)); if (ext4_encrypted_inode(d_inode(dir)) && - !ext4_is_child_context_consistent_with_parent(d_inode(dir), inode)) { + !fscrypt_has_permitted_context(d_inode(dir), inode)) { ext4_warning(inode->i_sb, - "Inconsistent encryption contexts: %lu/%lu\n", + "Inconsistent encryption contexts: %lu/%lu", (unsigned long) d_inode(dir)->i_ino, (unsigned long) inode->i_ino); dput(dir); diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 8850254136ae..5c4372512ef7 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -106,9 +106,11 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) } if (!journal) { - ret = generic_file_fsync(file, start, end, datasync); + ret = __generic_file_fsync(file, start, end, datasync); if (!ret && !hlist_empty(&inode->i_dentry)) ret = ext4_sync_parent(inode); + if (test_opt(inode->i_sb, BARRIER)) + goto issue_flush; goto out; } @@ -140,6 +142,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) needs_barrier = true; ret = jbd2_complete_transaction(journal, commit_tid); if (needs_barrier) { + issue_flush: err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); if (!ret) ret = err; diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 237b877d316d..9e66cd1d7b78 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -214,7 +214,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) trace_ext4_load_inode_bitmap(sb, block_group); bh->b_end_io = ext4_end_bitmap_read; get_bh(bh); - submit_bh(READ | REQ_META | REQ_PRIO, bh); + submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO, bh); wait_on_buffer(bh); if (!buffer_uptodate(bh)) { put_bh(bh); @@ -767,10 +767,10 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, if ((ext4_encrypted_inode(dir) || DUMMY_ENCRYPTION_ENABLED(EXT4_SB(dir->i_sb))) && (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) { - err = ext4_get_encryption_info(dir); + err = fscrypt_get_encryption_info(dir); if (err) return ERR_PTR(err); - if (ext4_encryption_info(dir) == NULL) + if (!fscrypt_has_encryption_key(dir)) return ERR_PTR(-EPERM); if (!handle) nblocks += EXT4_DATA_TRANS_BLOCKS(dir->i_sb); @@ -1115,7 +1115,8 @@ got: } if (encrypt) { - err = ext4_inherit_context(dir, inode); + /* give pointer to avoid set_context with journal ops. */ + err = fscrypt_inherit_context(dir, inode, &encrypt, true); if (err) goto fail_free_drop; } @@ -1150,25 +1151,20 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino) unsigned long max_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count); ext4_group_t block_group; int bit; - struct buffer_head *bitmap_bh; + struct buffer_head *bitmap_bh = NULL; struct inode *inode = NULL; - long err = -EIO; + int err = -EFSCORRUPTED; - /* Error cases - e2fsck has already cleaned up for us */ - if (ino > max_ino) { - ext4_warning(sb, "bad orphan ino %lu! 
e2fsck was run?", ino); - err = -EFSCORRUPTED; - goto error; - } + if (ino < EXT4_FIRST_INO(sb) || ino > max_ino) + goto bad_orphan; block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb); bitmap_bh = ext4_read_inode_bitmap(sb, block_group); if (IS_ERR(bitmap_bh)) { - err = PTR_ERR(bitmap_bh); - ext4_warning(sb, "inode bitmap error %ld for orphan %lu", - ino, err); - goto error; + ext4_error(sb, "inode bitmap error %ld for orphan %lu", + ino, PTR_ERR(bitmap_bh)); + return (struct inode *) bitmap_bh; } /* Having the inode bit set should be a 100% indicator that this @@ -1179,15 +1175,21 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino) goto bad_orphan; inode = ext4_iget(sb, ino); - if (IS_ERR(inode)) - goto iget_failed; + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + ext4_error(sb, "couldn't read orphan inode %lu (err %d)", + ino, err); + return inode; + } /* - * If the orphan has i_nlink > 0 then it should be able to be - * truncated, otherwise it won't be removed from the orphan list - * during processing and an infinite loop will result. + * If the orphan has i_nlink > 0 then it should be able to + * be truncated, otherwise it won't be removed from the orphan + * list during processing and an infinite loop will result. + * Similarly, it must not be a bad inode. */ - if (inode->i_nlink && !ext4_can_truncate(inode)) + if ((inode->i_nlink && !ext4_can_truncate(inode)) || + is_bad_inode(inode)) goto bad_orphan; if (NEXT_ORPHAN(inode) > max_ino) @@ -1195,29 +1197,25 @@ brelse(bitmap_bh); return inode; -iget_failed: - err = PTR_ERR(inode); - inode = NULL; bad_orphan: - ext4_warning(sb, "bad orphan inode %lu! e2fsck was run?", ino); - printk(KERN_WARNING "ext4_test_bit(bit=%d, block=%llu) = %d\n", - bit, (unsigned long long)bitmap_bh->b_blocknr, - ext4_test_bit(bit, bitmap_bh->b_data)); - printk(KERN_WARNING "inode=%p\n", inode); + ext4_error(sb, "bad orphan inode %lu", ino); + if (bitmap_bh) + printk(KERN_ERR "ext4_test_bit(bit=%d, block=%llu) = %d\n", + bit, (unsigned long long)bitmap_bh->b_blocknr, + ext4_test_bit(bit, bitmap_bh->b_data)); if (inode) { - printk(KERN_WARNING "is_bad_inode(inode)=%d\n", + printk(KERN_ERR "is_bad_inode(inode)=%d\n", is_bad_inode(inode)); - printk(KERN_WARNING "NEXT_ORPHAN(inode)=%u\n", + printk(KERN_ERR "NEXT_ORPHAN(inode)=%u\n", NEXT_ORPHAN(inode)); - printk(KERN_WARNING "max_ino=%lu\n", max_ino); - printk(KERN_WARNING "i_nlink=%u\n", inode->i_nlink); + printk(KERN_ERR "max_ino=%lu\n", max_ino); + printk(KERN_ERR "i_nlink=%u\n", inode->i_nlink); /* Avoid freeing blocks if we got a bad deleted inode */ if (inode->i_nlink == 0) inode->i_blocks = 0; iput(inode); } brelse(bitmap_bh); -error: return ERR_PTR(err); } diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 627b7e8f9ef3..bc15c2c17633 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -649,133 +649,6 @@ out: } /* - * O_DIRECT for ext3 (or indirect map) based files - * - * If the O_DIRECT write will extend the file then add this inode to the - * orphan list. So recovery will truncate it back to the original size - * if the machine crashes during the write. - * - * If the O_DIRECT write is instantiating holes inside i_size and the machine - * crashes then stale disk data _may_ be exposed inside the file. But current - * VFS code falls back into buffered path in that case so we are safe.
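The orphan-list choreography in this comment (removed here with the indirect-map direct-IO path, and re-added for the common write path later in this diff) protects size-extending writes like the one below. O_DIRECT also imposes the alignment that the new ext4_aligned_io() helper checks: buffer, offset and length should all be block-aligned. A userspace sketch; the file name and 4K block size are assumptions:

#define _GNU_SOURCE                     /* for O_DIRECT */
#include <fcntl.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        const size_t blk = 4096;        /* assumed fs block size */
        void *buf = NULL;
        int fd = open("testfile", O_WRONLY | O_CREAT | O_DIRECT, 0644);

        if (fd < 0 || posix_memalign(&buf, blk, blk) != 0)
                return 1;
        memset(buf, 'x', blk);
        /* Size-extending direct write: ext4 puts the inode on the orphan
         * list first, so a crash mid-write cannot leave allocated but
         * stale blocks visible past the recovered i_size. */
        if (pwrite(fd, buf, blk, 0) != (ssize_t)blk)
                return 1;
        free(buf);
        return close(fd);
}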
- */ -ssize_t ext4_ind_direct_IO(struct kiocb *iocb, struct iov_iter *iter) -{ - struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; - struct ext4_inode_info *ei = EXT4_I(inode); - loff_t offset = iocb->ki_pos; - handle_t *handle; - ssize_t ret; - int orphan = 0; - size_t count = iov_iter_count(iter); - int retries = 0; - - if (iov_iter_rw(iter) == WRITE) { - loff_t final_size = offset + count; - - if (final_size > inode->i_size) { - /* Credits for sb + inode write */ - handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto out; - } - ret = ext4_orphan_add(handle, inode); - if (ret) { - ext4_journal_stop(handle); - goto out; - } - orphan = 1; - ei->i_disksize = inode->i_size; - ext4_journal_stop(handle); - } - } - -retry: - if (iov_iter_rw(iter) == READ && ext4_should_dioread_nolock(inode)) { - /* - * Nolock dioread optimization may be dynamically disabled - * via ext4_inode_block_unlocked_dio(). Check inode's state - * while holding extra i_dio_count ref. - */ - inode_dio_begin(inode); - smp_mb(); - if (unlikely(ext4_test_inode_state(inode, - EXT4_STATE_DIOREAD_LOCK))) { - inode_dio_end(inode); - goto locked; - } - if (IS_DAX(inode)) - ret = dax_do_io(iocb, inode, iter, - ext4_dio_get_block, NULL, 0); - else - ret = __blockdev_direct_IO(iocb, inode, - inode->i_sb->s_bdev, iter, - ext4_dio_get_block, - NULL, NULL, 0); - inode_dio_end(inode); - } else { -locked: - if (IS_DAX(inode)) - ret = dax_do_io(iocb, inode, iter, - ext4_dio_get_block, NULL, DIO_LOCKING); - else - ret = blockdev_direct_IO(iocb, inode, iter, - ext4_dio_get_block); - - if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) { - loff_t isize = i_size_read(inode); - loff_t end = offset + count; - - if (end > isize) - ext4_truncate_failed_write(inode); - } - } - if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) - goto retry; - - if (orphan) { - int err; - - /* Credits for sb + inode write */ - handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); - if (IS_ERR(handle)) { - /* This is really bad luck. We've written the data - * but cannot extend i_size. Bail out and pretend - * the write failed... */ - ret = PTR_ERR(handle); - if (inode->i_nlink) - ext4_orphan_del(NULL, inode); - - goto out; - } - if (inode->i_nlink) - ext4_orphan_del(handle, inode); - if (ret > 0) { - loff_t end = offset + ret; - if (end > inode->i_size) { - ei->i_disksize = end; - i_size_write(inode, end); - /* - * We're going to return a positive `ret' - * here due to non-zero-length I/O, so there's - * no way of reporting error returns from - * ext4_mark_inode_dirty() to userspace. So - * ignore it. 
- */ - ext4_mark_inode_dirty(handle, inode); - } - } - err = ext4_journal_stop(handle); - if (ret == 0) - ret = err; - } -out: - return ret; -} - -/* * Calculate the number of metadata blocks need to reserve * to allocate a new block at @lblocks for non extent file based file */ diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 7bc6c855cc18..f74d5ee2cdec 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -1326,7 +1326,7 @@ int htree_inlinedir_to_tree(struct file *dir_file, struct ext4_iloc iloc; void *dir_buf = NULL; struct ext4_dir_entry_2 fake; - struct ext4_str tmp_str; + struct fscrypt_str tmp_str; ret = ext4_get_inode_loc(inode, &iloc); if (ret) @@ -1739,20 +1739,20 @@ ext4_get_inline_entry(struct inode *inode, return (struct ext4_dir_entry_2 *)(inline_pos + offset); } -int empty_inline_dir(struct inode *dir, int *has_inline_data) +bool empty_inline_dir(struct inode *dir, int *has_inline_data) { int err, inline_size; struct ext4_iloc iloc; void *inline_pos; unsigned int offset; struct ext4_dir_entry_2 *de; - int ret = 1; + bool ret = true; err = ext4_get_inode_loc(dir, &iloc); if (err) { EXT4_ERROR_INODE(dir, "error %d getting inode %lu block", err, dir->i_ino); - return 1; + return true; } down_read(&EXT4_I(dir)->xattr_sem); @@ -1766,7 +1766,7 @@ int empty_inline_dir(struct inode *dir, int *has_inline_data) ext4_warning(dir->i_sb, "bad inline directory (dir #%lu) - no `..'", dir->i_ino); - ret = 1; + ret = true; goto out; } @@ -1780,15 +1780,15 @@ int empty_inline_dir(struct inode *dir, int *has_inline_data) ext4_warning(dir->i_sb, "bad inline directory (dir #%lu) - " "inode %u, rec_len %u, name_len %d" - "inline size %d\n", + "inline size %d", dir->i_ino, le32_to_cpu(de->inode), le16_to_cpu(de->rec_len), de->name_len, inline_size); - ret = 1; + ret = true; goto out; } if (le32_to_cpu(de->inode)) { - ret = 0; + ret = false; goto out; } offset += ext4_rec_len_from_disk(de->rec_len, inline_size); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 79b298d397b4..3131747199e1 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -51,25 +51,31 @@ static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw, struct ext4_inode_info *ei) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - __u16 csum_lo; - __u16 csum_hi = 0; __u32 csum; + __u16 dummy_csum = 0; + int offset = offsetof(struct ext4_inode, i_checksum_lo); + unsigned int csum_size = sizeof(dummy_csum); - csum_lo = le16_to_cpu(raw->i_checksum_lo); - raw->i_checksum_lo = 0; - if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && - EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) { - csum_hi = le16_to_cpu(raw->i_checksum_hi); - raw->i_checksum_hi = 0; - } - - csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)raw, - EXT4_INODE_SIZE(inode->i_sb)); + csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)raw, offset); + csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, csum_size); + offset += csum_size; + csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset, + EXT4_GOOD_OLD_INODE_SIZE - offset); - raw->i_checksum_lo = cpu_to_le16(csum_lo); - if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && - EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) - raw->i_checksum_hi = cpu_to_le16(csum_hi); + if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { + offset = offsetof(struct ext4_inode, i_checksum_hi); + csum = ext4_chksum(sbi, csum, (__u8 *)raw + + EXT4_GOOD_OLD_INODE_SIZE, + offset - EXT4_GOOD_OLD_INODE_SIZE); + if (EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) { + csum = ext4_chksum(sbi, csum, (__u8 
*)&dummy_csum, + csum_size); + offset += csum_size; + csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset, + EXT4_INODE_SIZE(inode->i_sb) - + offset); + } + } return csum; } @@ -205,9 +211,9 @@ void ext4_evict_inode(struct inode *inode) * Note that directories do not have this problem because they * don't use page cache. */ - if (ext4_should_journal_data(inode) && - (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode)) && - inode->i_ino != EXT4_JOURNAL_INO) { + if (inode->i_ino != EXT4_JOURNAL_INO && + ext4_should_journal_data(inode) && + (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode))) { journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; tid_t commit_tid = EXT4_I(inode)->i_datasync_tid; @@ -386,7 +392,7 @@ int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk, int ret; if (ext4_encrypted_inode(inode)) - return ext4_encrypted_zeroout(inode, lblk, pblk, len); + return fscrypt_zeroout_range(inode, lblk, pblk, len); ret = sb_issue_zeroout(inode->i_sb, pblk, len, GFP_NOFS); if (ret > 0) @@ -684,6 +690,24 @@ out_sem: ret = check_block_validity(inode, map); if (ret != 0) return ret; + + /* + * Inodes with freshly allocated blocks where contents will be + * visible after transaction commit must be on transaction's + * ordered data list. + */ + if (map->m_flags & EXT4_MAP_NEW && + !(map->m_flags & EXT4_MAP_UNWRITTEN) && + !(flags & EXT4_GET_BLOCKS_ZERO) && + !IS_NOQUOTA(inode) && + ext4_should_order_data(inode)) { + if (flags & EXT4_GET_BLOCKS_IO_SUBMIT) + ret = ext4_jbd2_inode_add_wait(handle, inode); + else + ret = ext4_jbd2_inode_add_write(handle, inode); + if (ret) + return ret; + } } return retval; } @@ -963,7 +987,7 @@ struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, return bh; if (!bh || buffer_uptodate(bh)) return bh; - ll_rw_block(READ | REQ_META | REQ_PRIO, 1, &bh); + ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1, &bh); wait_on_buffer(bh); if (buffer_uptodate(bh)) return bh; @@ -1117,7 +1141,7 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len, if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh) && (block_start < from || block_end > to)) { - ll_rw_block(READ, 1, &bh); + ll_rw_block(REQ_OP_READ, 0, 1, &bh); *wait_bh++ = bh; decrypt = ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode); @@ -1134,7 +1158,7 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len, if (unlikely(err)) page_zero_new_buffers(page, from, to); else if (decrypt) - err = ext4_decrypt(page); + err = fscrypt_decrypt_page(page); return err; } #endif @@ -1289,15 +1313,6 @@ static int ext4_write_end(struct file *file, int i_size_changed = 0; trace_ext4_write_end(inode, pos, len, copied); - if (ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE)) { - ret = ext4_jbd2_file_inode(handle, inode); - if (ret) { - unlock_page(page); - put_page(page); - goto errout; - } - } - if (ext4_has_inline_data(inode)) { ret = ext4_write_inline_data_end(inode, pos, len, copied, page); @@ -2313,7 +2328,8 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) * the data was copied into the page cache. 
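The ext4_inode_csum() rewrite at the top of this inode.c hunk replaces the old save-zero-restore dance with a splice: checksum the bytes before the checksum field, then a zeroed dummy of the same width, then the bytes after, so the inode buffer itself is never written to. The technique in isolation, with zlib's running crc32() standing in for ext4_chksum() (the kernel uses crc32c, so only the shape matches, not the values):

#include <stddef.h>
#include <stdint.h>
#include <zlib.h>

/* Checksum buf[0..len) as if the hole_len bytes at hole_off were zero,
 * without modifying buf. hole_len must not exceed sizeof(zeros). */
static uint32_t csum_with_hole(const unsigned char *buf, size_t len,
                               size_t hole_off, size_t hole_len)
{
        static const unsigned char zeros[8];
        uint32_t c = crc32(0L, Z_NULL, 0);

        c = crc32(c, buf, hole_off);                    /* bytes before */
        c = crc32(c, zeros, hole_len);                  /* dummy zeroes */
        c = crc32(c, buf + hole_off + hole_len,         /* bytes after */
                  len - hole_off - hole_len);
        return c;
}

This keeps the checksum stable for concurrent readers of the raw inode, which is the point of the kernel change.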
*/ get_blocks_flags = EXT4_GET_BLOCKS_CREATE | - EXT4_GET_BLOCKS_METADATA_NOFAIL; + EXT4_GET_BLOCKS_METADATA_NOFAIL | + EXT4_GET_BLOCKS_IO_SUBMIT; dioread_nolock = ext4_should_dioread_nolock(inode); if (dioread_nolock) get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; @@ -2602,11 +2618,14 @@ static int ext4_writepages(struct address_space *mapping, struct blk_plug plug; bool give_up_on_write = false; + percpu_down_read(&sbi->s_journal_flag_rwsem); trace_ext4_writepages(inode, wbc); - if (dax_mapping(mapping)) - return dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev, - wbc); + if (dax_mapping(mapping)) { + ret = dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev, + wbc); + goto out_writepages; + } /* * No pages to write? This is mainly a kludge to avoid starting @@ -2735,13 +2754,36 @@ retry: done = true; } } - ext4_journal_stop(handle); + /* + * Caution: If the handle is synchronous, + * ext4_journal_stop() can wait for transaction commit + * to finish which may depend on writeback of pages to + * complete or on page lock to be released. In that + * case, we have to wait until after we have + * submitted all the IO, released page locks we hold, + * and dropped io_end reference (for extent conversion + * to be able to complete) before stopping the handle. + */ + if (!ext4_handle_valid(handle) || handle->h_sync == 0) { + ext4_journal_stop(handle); + handle = NULL; + } /* Submit prepared bio */ ext4_io_submit(&mpd.io_submit); /* Unlock pages we didn't use */ mpage_release_unused_pages(&mpd, give_up_on_write); - /* Drop our io_end reference we got from init */ - ext4_put_io_end(mpd.io_submit.io_end); + /* + * Drop our io_end reference we got from init. We have + * to be careful and use deferred io_end finishing if + * we are still holding the transaction as we can + * release the last reference to io_end which may end + * up doing unwritten extent conversion. + */ + if (handle) { + ext4_put_io_end_defer(mpd.io_submit.io_end); + ext4_journal_stop(handle); + } else + ext4_put_io_end(mpd.io_submit.io_end); if (ret == -ENOSPC && sbi->s_journal) { /* @@ -2776,6 +2818,7 @@ retry: out_writepages: trace_ext4_writepages_result(inode, wbc, ret, nr_to_write - wbc->nr_to_write); + percpu_up_read(&sbi->s_journal_flag_rwsem); return ret; } @@ -3215,75 +3258,52 @@ static int ext4_releasepage(struct page *page, gfp_t wait) } #ifdef CONFIG_FS_DAX -int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) +/* + * Get block function for DAX IO and mmap faults. It takes care of converting + * unwritten extents to written ones and initializes new / converted blocks + * to zeros.
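What ext4_dax_get_block() (the function this comment introduces) ultimately enables: a page fault on a mapping of a DAX file allocates, zeroes and maps the block synchronously, and subsequent loads and stores bypass the page cache entirely. From userspace it is ordinary mmap; the mount point below is illustrative, and the same code works on any file, DAX only changes what backs the mapping:

#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        int fd = open("/mnt/pmem/testfile", O_RDWR | O_CREAT, 0644);
        char *p;

        if (fd < 0 || ftruncate(fd, 4096) != 0)
                return 1;
        p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (p == MAP_FAILED)
                return 1;
        p[0] = 'x';     /* fault: get_block allocates + zeroes the block */
        munmap(p, 4096);
        return close(fd);
}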
+ */ +int ext4_dax_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) { - int ret, err; - int credits; - struct ext4_map_blocks map; - handle_t *handle = NULL; - int flags = 0; - - ext4_debug("ext4_dax_mmap_get_block: inode %lu, create flag %d\n", - inode->i_ino, create); - map.m_lblk = iblock; - map.m_len = bh_result->b_size >> inode->i_blkbits; - credits = ext4_chunk_trans_blocks(inode, map.m_len); - if (create) { - flags |= EXT4_GET_BLOCKS_PRE_IO | EXT4_GET_BLOCKS_CREATE_ZERO; - handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - return ret; - } - } + int ret; - ret = ext4_map_blocks(handle, inode, &map, flags); - if (create) { - err = ext4_journal_stop(handle); - if (ret >= 0 && err < 0) - ret = err; - } - if (ret <= 0) - goto out; - if (map.m_flags & EXT4_MAP_UNWRITTEN) { - int err2; + ext4_debug("inode %lu, create flag %d\n", inode->i_ino, create); + if (!create) + return _ext4_get_block(inode, iblock, bh_result, 0); - /* - * We are protected by i_mmap_sem so we know block cannot go - * away from under us even though we dropped i_data_sem. - * Convert extent to written and write zeros there. - * - * Note: We may get here even when create == 0. - */ - handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto out; - } + ret = ext4_get_block_trans(inode, iblock, bh_result, + EXT4_GET_BLOCKS_PRE_IO | + EXT4_GET_BLOCKS_CREATE_ZERO); + if (ret < 0) + return ret; - err = ext4_map_blocks(handle, inode, &map, - EXT4_GET_BLOCKS_CONVERT | EXT4_GET_BLOCKS_CREATE_ZERO); - if (err < 0) - ret = err; - err2 = ext4_journal_stop(handle); - if (err2 < 0 && ret > 0) - ret = err2; - } -out: - WARN_ON_ONCE(ret == 0 && create); - if (ret > 0) { - map_bh(bh_result, inode->i_sb, map.m_pblk); + if (buffer_unwritten(bh_result)) { /* - * At least for now we have to clear BH_New so that DAX code - * doesn't attempt to zero blocks again in a racy way. + * We are protected by i_mmap_sem or i_mutex so we know block + * cannot go away from under us even though we dropped + * i_data_sem. Convert extent to written and write zeros there. */ - map.m_flags &= ~EXT4_MAP_NEW; - ext4_update_bh_state(bh_result, map.m_flags); - bh_result->b_size = map.m_len << inode->i_blkbits; - ret = 0; + ret = ext4_get_block_trans(inode, iblock, bh_result, + EXT4_GET_BLOCKS_CONVERT | + EXT4_GET_BLOCKS_CREATE_ZERO); + if (ret < 0) + return ret; } - return ret; + /* + * At least for now we have to clear BH_New so that DAX code + * doesn't attempt to zero blocks again in a racy way. + */ + clear_buffer_new(bh_result); + return 0; +} +#else +/* Just define empty function, it will never get called. */ +int ext4_dax_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + BUG(); + return 0; } #endif @@ -3316,7 +3336,9 @@ static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset, } /* - * For ext4 extent files, ext4 will do direct-io write to holes, + * Handling of direct IO writes. + * + * For ext4 extent files, ext4 will do direct-io write even to holes, * preallocated extents, and those write extend the file, no need to * fall back to buffered IO. * @@ -3334,10 +3356,11 @@ static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset, * if the machine crashes during the write. 
* */ -static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter) +static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; + struct ext4_inode_info *ei = EXT4_I(inode); ssize_t ret; loff_t offset = iocb->ki_pos; size_t count = iov_iter_count(iter); @@ -3345,10 +3368,25 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter) get_block_t *get_block_func = NULL; int dio_flags = 0; loff_t final_size = offset + count; + int orphan = 0; + handle_t *handle; - /* Use the old path for reads and writes beyond i_size. */ - if (iov_iter_rw(iter) != WRITE || final_size > inode->i_size) - return ext4_ind_direct_IO(iocb, iter); + if (final_size > inode->i_size) { + /* Credits for sb + inode write */ + handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + ret = ext4_orphan_add(handle, inode); + if (ret) { + ext4_journal_stop(handle); + goto out; + } + orphan = 1; + ei->i_disksize = inode->i_size; + ext4_journal_stop(handle); + } BUG_ON(iocb->private == NULL); @@ -3357,8 +3395,7 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter) * conversion. This also disallows race between truncate() and * overwrite DIO as i_dio_count needs to be incremented under i_mutex. */ - if (iov_iter_rw(iter) == WRITE) - inode_dio_begin(inode); + inode_dio_begin(inode); /* If we do a overwrite dio, i_mutex locking can be released */ overwrite = *((int *)iocb->private); @@ -3367,7 +3404,7 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter) inode_unlock(inode); /* - * We could direct write to holes and fallocate. + * For extent mapped files we could direct write to holes and fallocate. * * Allocated blocks to fill the hole are marked as unwritten to prevent * parallel buffered read to expose the stale data before DIO complete @@ -3389,7 +3426,23 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter) iocb->private = NULL; if (overwrite) get_block_func = ext4_dio_get_block_overwrite; - else if (is_sync_kiocb(iocb)) { + else if (IS_DAX(inode)) { + /* + * We can avoid zeroing for aligned DAX writes beyond EOF. Other + * writes need zeroing either because they can race with page + * faults or because they use partial blocks. 
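The get_block selection that follows hinges on a small predicate: a DAX write may skip zeroing only when it starts at or past EOF after rounding down to a block boundary and is fully block-aligned. A self-contained model of that check, with made-up block size, offsets, and i_size values:

#include <stdio.h>
#include <stdint.h>

#define BLKBITS 12	/* example: 4096-byte blocks */

static uint64_t round_down_blk(uint64_t off)
{
	return off & ~((1ULL << BLKBITS) - 1);
}

static int can_skip_zeroing(uint64_t off, uint64_t count, uint64_t i_size)
{
	/* ext4_aligned_io() analogue: offset and length block-aligned */
	int aligned = !(off & ((1ULL << BLKBITS) - 1)) &&
		      !(count & ((1ULL << BLKBITS) - 1));
	return round_down_blk(off) >= i_size && aligned;
}

int main(void)
{
	/* aligned append beyond EOF: zeroing can be skipped */
	printf("%d\n", can_skip_zeroing(8192, 4096, 8000));
	/* partial block at EOF: must zero via ext4_dax_get_block() */
	printf("%d\n", can_skip_zeroing(8000, 100, 8000));
	return 0;
}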
+ */ + if (round_down(offset, 1<<inode->i_blkbits) >= inode->i_size && + ext4_aligned_io(inode, offset, count)) + get_block_func = ext4_dio_get_block; + else + get_block_func = ext4_dax_get_block; + dio_flags = DIO_LOCKING; + } else if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) || + round_down(offset, 1 << inode->i_blkbits) >= inode->i_size) { + get_block_func = ext4_dio_get_block; + dio_flags = DIO_LOCKING | DIO_SKIP_HOLES; + } else if (is_sync_kiocb(iocb)) { get_block_func = ext4_dio_get_block_unwritten_sync; dio_flags = DIO_LOCKING; } else { @@ -3399,10 +3452,10 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter) #ifdef CONFIG_EXT4_FS_ENCRYPTION BUG_ON(ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode)); #endif - if (IS_DAX(inode)) + if (IS_DAX(inode)) { ret = dax_do_io(iocb, inode, iter, get_block_func, ext4_end_io_dio, dio_flags); - else + } else ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter, get_block_func, @@ -3422,12 +3475,86 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter) ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); } - if (iov_iter_rw(iter) == WRITE) - inode_dio_end(inode); + inode_dio_end(inode); /* take i_mutex locking again if we do a overwrite dio */ if (overwrite) inode_lock(inode); + if (ret < 0 && final_size > inode->i_size) + ext4_truncate_failed_write(inode); + + /* Handle extending of i_size after direct IO write */ + if (orphan) { + int err; + + /* Credits for sb + inode write */ + handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); + if (IS_ERR(handle)) { + /* This is really bad luck. We've written the data + * but cannot extend i_size. Bail out and pretend + * the write failed... */ + ret = PTR_ERR(handle); + if (inode->i_nlink) + ext4_orphan_del(NULL, inode); + + goto out; + } + if (inode->i_nlink) + ext4_orphan_del(handle, inode); + if (ret > 0) { + loff_t end = offset + ret; + if (end > inode->i_size) { + ei->i_disksize = end; + i_size_write(inode, end); + /* + * We're going to return a positive `ret' + * here due to non-zero-length I/O, so there's + * no way of reporting error returns from + * ext4_mark_inode_dirty() to userspace. So + * ignore it. + */ + ext4_mark_inode_dirty(handle, inode); + } + } + err = ext4_journal_stop(handle); + if (ret == 0) + ret = err; + } +out: + return ret; +} + +static ssize_t ext4_direct_IO_read(struct kiocb *iocb, struct iov_iter *iter) +{ + int unlocked = 0; + struct inode *inode = iocb->ki_filp->f_mapping->host; + ssize_t ret; + + if (ext4_should_dioread_nolock(inode)) { + /* + * Nolock dioread optimization may be dynamically disabled + * via ext4_inode_block_unlocked_dio(). Check inode's state + * while holding extra i_dio_count ref. + */ + inode_dio_begin(inode); + smp_mb(); + if (unlikely(ext4_test_inode_state(inode, + EXT4_STATE_DIOREAD_LOCK))) + inode_dio_end(inode); + else + unlocked = 1; + } + if (IS_DAX(inode)) { + ret = dax_do_io(iocb, inode, iter, ext4_dio_get_block, + NULL, unlocked ? 0 : DIO_LOCKING); + } else { + ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, + iter, ext4_dio_get_block, + NULL, NULL, + unlocked ?
0 : DIO_LOCKING); + } + if (unlocked) + inode_dio_end(inode); return ret; } @@ -3455,10 +3582,10 @@ static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter) return 0; trace_ext4_direct_IO_enter(inode, offset, count, iov_iter_rw(iter)); - if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - ret = ext4_ext_direct_IO(iocb, iter); + if (iov_iter_rw(iter) == READ) + ret = ext4_direct_IO_read(iocb, iter); else - ret = ext4_ind_direct_IO(iocb, iter); + ret = ext4_direct_IO_write(iocb, iter); trace_ext4_direct_IO_exit(inode, offset, count, iov_iter_rw(iter), ret); return ret; } @@ -3534,10 +3661,7 @@ void ext4_set_aops(struct inode *inode) { switch (ext4_inode_journal_mode(inode)) { case EXT4_INODE_ORDERED_DATA_MODE: - ext4_set_inode_state(inode, EXT4_STATE_ORDERED_MODE); - break; case EXT4_INODE_WRITEBACK_DATA_MODE: - ext4_clear_inode_state(inode, EXT4_STATE_ORDERED_MODE); break; case EXT4_INODE_JOURNAL_DATA_MODE: inode->i_mapping->a_ops = &ext4_journalled_aops; @@ -3603,7 +3727,7 @@ static int __ext4_block_zero_page_range(handle_t *handle, if (!buffer_uptodate(bh)) { err = -EIO; - ll_rw_block(READ, 1, &bh); + ll_rw_block(REQ_OP_READ, 0, 1, &bh); wait_on_buffer(bh); /* Uhhuh. Read error. Complain and punt. */ if (!buffer_uptodate(bh)) @@ -3611,9 +3735,9 @@ static int __ext4_block_zero_page_range(handle_t *handle, if (S_ISREG(inode->i_mode) && ext4_encrypted_inode(inode)) { /* We expect the key to be set. */ - BUG_ON(!ext4_has_encryption_key(inode)); + BUG_ON(!fscrypt_has_encryption_key(inode)); BUG_ON(blocksize != PAGE_SIZE); - WARN_ON_ONCE(ext4_decrypt(page)); + WARN_ON_ONCE(fscrypt_decrypt_page(page)); } } if (ext4_should_journal_data(inode)) { @@ -3630,8 +3754,8 @@ static int __ext4_block_zero_page_range(handle_t *handle, } else { err = 0; mark_buffer_dirty(bh); - if (ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE)) - err = ext4_jbd2_file_inode(handle, inode); + if (ext4_should_order_data(inode)) + err = ext4_jbd2_inode_add_write(handle, inode); } unlock: @@ -4186,7 +4310,7 @@ make_io: trace_ext4_load_inode(inode); get_bh(bh); bh->b_end_io = end_buffer_read_sync; - submit_bh(READ | REQ_META | REQ_PRIO, bh); + submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO, bh); wait_on_buffer(bh); if (!buffer_uptodate(bh)) { EXT4_ERROR_INODE_BLOCK(inode, block, @@ -5429,6 +5553,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) journal_t *journal; handle_t *handle; int err; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); /* * We have to be very careful here: changing a data block's @@ -5445,22 +5570,30 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) return 0; if (is_journal_aborted(journal)) return -EROFS; - /* We have to allocate physical blocks for delalloc blocks - * before flushing journal. otherwise delalloc blocks can not - * be allocated any more. even more truncate on delalloc blocks - * could trigger BUG by flushing delalloc blocks in journal. - * There is no delalloc block in non-journal data mode. - */ - if (val && test_opt(inode->i_sb, DELALLOC)) { - err = ext4_alloc_da_blocks(inode); - if (err < 0) - return err; - } /* Wait for all existing dio workers */ ext4_inode_block_unlocked_dio(inode); inode_dio_wait(inode); + /* + * Before flushing the journal and switching inode's aops, we have + * to flush all dirty data the inode has. 
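The ext4_direct_IO_read() path above uses an optimistic pattern worth calling out: bump i_dio_count first, issue a barrier, then re-check whether unlocked DIO has been disabled, backing out if it has. A userspace analogue with C11 atomics; the names are invented, and seq_cst operations stand in for the smp_mb() in the patch.

#include <stdatomic.h>
#include <stdio.h>

static atomic_int dio_count;
static atomic_bool dio_locked;	/* EXT4_STATE_DIOREAD_LOCK analogue */

static int try_unlocked_dio(void)
{
	atomic_fetch_add(&dio_count, 1);	/* inode_dio_begin() */
	/* seq_cst atomics give the smp_mb() ordering from the patch */
	if (atomic_load(&dio_locked)) {
		atomic_fetch_sub(&dio_count, 1);	/* inode_dio_end() */
		return 0;	/* fall back to locked DIO */
	}
	return 1;	/* run the read without inode locking */
}

int main(void)
{
	printf("unlocked: %d\n", try_unlocked_dio());
	atomic_store(&dio_locked, 1);	/* ext4_inode_block_unlocked_dio() */
	printf("unlocked: %d\n", try_unlocked_dio());
	return 0;
}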
There can be outstanding + * delayed allocations, there can be unwritten extents created by + * fallocate or buffered writes in dioread_nolock mode covered by + * dirty data which can be converted only after flushing the dirty + * data (and journalled aops don't know how to handle these cases). + */ + if (val) { + down_write(&EXT4_I(inode)->i_mmap_sem); + err = filemap_write_and_wait(inode->i_mapping); + if (err < 0) { + up_write(&EXT4_I(inode)->i_mmap_sem); + ext4_inode_resume_unlocked_dio(inode); + return err; + } + } + + percpu_down_write(&sbi->s_journal_flag_rwsem); jbd2_journal_lock_updates(journal); /* @@ -5477,6 +5610,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) err = jbd2_journal_flush(journal); if (err < 0) { jbd2_journal_unlock_updates(journal); + percpu_up_write(&sbi->s_journal_flag_rwsem); ext4_inode_resume_unlocked_dio(inode); return err; } @@ -5485,6 +5619,10 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) ext4_set_aops(inode); jbd2_journal_unlock_updates(journal); + percpu_up_write(&sbi->s_journal_flag_rwsem); + + if (val) + up_write(&EXT4_I(inode)->i_mmap_sem); ext4_inode_resume_unlocked_dio(inode); /* Finally we can mark the inode as dirty. */ diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index eae5917c534e..10686fd67fb4 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -13,8 +13,8 @@ #include <linux/compat.h> #include <linux/mount.h> #include <linux/file.h> -#include <linux/random.h> #include <linux/quotaops.h> +#include <linux/uuid.h> #include <asm/uaccess.h> #include "ext4_jbd2.h" #include "ext4.h" @@ -308,6 +308,7 @@ static int ext4_ioctl_setproject(struct file *filp, __u32 projid) kprojid_t kprojid; struct ext4_iloc iloc; struct ext4_inode *raw_inode; + struct dquot *transfer_to[MAXQUOTAS] = { }; if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_PROJECT)) { @@ -361,17 +362,14 @@ static int ext4_ioctl_setproject(struct file *filp, __u32 projid) if (err) goto out_stop; - if (sb_has_quota_limits_enabled(sb, PRJQUOTA)) { - struct dquot *transfer_to[MAXQUOTAS] = { }; - - transfer_to[PRJQUOTA] = dqget(sb, make_kqid_projid(kprojid)); - if (transfer_to[PRJQUOTA]) { - err = __dquot_transfer(inode, transfer_to); - dqput(transfer_to[PRJQUOTA]); - if (err) - goto out_dirty; - } + transfer_to[PRJQUOTA] = dqget(sb, make_kqid_projid(kprojid)); + if (!IS_ERR(transfer_to[PRJQUOTA])) { + err = __dquot_transfer(inode, transfer_to); + dqput(transfer_to[PRJQUOTA]); + if (err) + goto out_dirty; } + EXT4_I(inode)->i_projid = kprojid; inode->i_ctime = ext4_current_time(inode); out_dirty: @@ -772,19 +770,13 @@ resizefs_out: return ext4_ext_precache(inode); case EXT4_IOC_SET_ENCRYPTION_POLICY: { #ifdef CONFIG_EXT4_FS_ENCRYPTION - struct ext4_encryption_policy policy; - int err = 0; + struct fscrypt_policy policy; if (copy_from_user(&policy, - (struct ext4_encryption_policy __user *)arg, - sizeof(policy))) { - err = -EFAULT; - goto encryption_policy_out; - } - - err = ext4_process_policy(&policy, inode); -encryption_policy_out: - return err; + (struct fscrypt_policy __user *)arg, + sizeof(policy))) + return -EFAULT; + return fscrypt_process_policy(inode, &policy); #else return -EOPNOTSUPP; #endif @@ -827,12 +819,12 @@ encryption_policy_out: } case EXT4_IOC_GET_ENCRYPTION_POLICY: { #ifdef CONFIG_EXT4_FS_ENCRYPTION - struct ext4_encryption_policy policy; + struct fscrypt_policy policy; int err = 0; if (!ext4_encrypted_inode(inode)) return -ENOENT; - err = ext4_get_policy(inode, &policy); + err = fscrypt_get_policy(inode, &policy); if 
(err) return err; if (copy_to_user((void __user *)arg, &policy, sizeof(policy))) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index eeeade76012e..f418f55c2bbe 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -1266,6 +1266,7 @@ static void ext4_mb_unload_buddy(struct ext4_buddy *e4b) static int mb_find_order_for_block(struct ext4_buddy *e4b, int block) { int order = 1; + int bb_incr = 1 << (e4b->bd_blkbits - 1); void *bb; BUG_ON(e4b->bd_bitmap == e4b->bd_buddy); @@ -1278,7 +1279,8 @@ static int mb_find_order_for_block(struct ext4_buddy *e4b, int block) /* this block is part of buddy of order 'order' */ return order; } - bb += 1 << (e4b->bd_blkbits - order); + bb += bb_incr; + bb_incr >>= 1; order++; } return 0; @@ -2348,7 +2350,6 @@ static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file) } const struct file_operations ext4_seq_mb_groups_fops = { - .owner = THIS_MODULE, .open = ext4_mb_seq_groups_open, .read = seq_read, .llseek = seq_lseek, @@ -2583,7 +2584,7 @@ int ext4_mb_init(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); unsigned i, j; - unsigned offset; + unsigned offset, offset_incr; unsigned max; int ret; @@ -2612,17 +2613,20 @@ int ext4_mb_init(struct super_block *sb) i = 1; offset = 0; + offset_incr = 1 << (sb->s_blocksize_bits - 1); max = sb->s_blocksize << 2; do { sbi->s_mb_offsets[i] = offset; sbi->s_mb_maxs[i] = max; - offset += 1 << (sb->s_blocksize_bits - i); + offset += offset_incr; + offset_incr = offset_incr >> 1; max = max >> 1; i++; } while (i <= sb->s_blocksize_bits + 1); spin_lock_init(&sbi->s_md_lock); spin_lock_init(&sbi->s_bal_lock); + sbi->s_mb_free_pending = 0; sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN; sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN; @@ -2810,6 +2814,9 @@ static void ext4_free_data_callback(struct super_block *sb, /* we expect to find existing buddy because it's pinned */ BUG_ON(err != 0); + spin_lock(&EXT4_SB(sb)->s_md_lock); + EXT4_SB(sb)->s_mb_free_pending -= entry->efd_count; + spin_unlock(&EXT4_SB(sb)->s_md_lock); db = e4b.bd_info; /* there are blocks to put in buddy to make them really free */ @@ -2935,7 +2942,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, ext4_error(sb, "Allocating blocks %llu-%llu which overlap " "fs metadata", block, block+len); /* File system mounted not to panic on error - * Fix the bitmap and repeat the block allocation + * Fix the bitmap and return EFSCORRUPTED * We leak some of the blocks here. 
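The mb_find_order_for_block() hunk above replaces a shift whose amount shrinks each iteration, and could eventually go negative, which is undefined behaviour in C, with an increment that is simply halved each round. The demo below shows the two forms compute the same buddy-bitmap offsets while the shift is still valid:

#include <stdio.h>

int main(void)
{
	int blkbits = 12;			/* example: 4k blocks */
	unsigned long off_shift = 0, off_incr = 0;
	unsigned long bb_incr = 1UL << (blkbits - 1);
	int order;

	for (order = 1; order < blkbits; order++) {
		off_shift += 1UL << (blkbits - order);	/* old form */
		off_incr += bb_incr;			/* patched form */
		bb_incr >>= 1;
		printf("order %2d: old %4lu new %4lu\n",
		       order, off_shift, off_incr);
	}
	/*
	 * The halving form also stays well-defined past order == blkbits,
	 * where (blkbits - order) would be a negative shift count.
	 */
	return 0;
}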
*/ ext4_lock_group(sb, ac->ac_b_ex.fe_group); @@ -2944,7 +2951,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, ext4_unlock_group(sb, ac->ac_b_ex.fe_group); err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); if (!err) - err = -EAGAIN; + err = -EFSCORRUPTED; goto out_err; } @@ -4509,18 +4516,7 @@ repeat: } if (likely(ac->ac_status == AC_STATUS_FOUND)) { *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs); - if (*errp == -EAGAIN) { - /* - * drop the reference that we took - * in ext4_mb_use_best_found - */ - ext4_mb_release_context(ac); - ac->ac_b_ex.fe_group = 0; - ac->ac_b_ex.fe_start = 0; - ac->ac_b_ex.fe_len = 0; - ac->ac_status = AC_STATUS_CONTINUE; - goto repeat; - } else if (*errp) { + if (*errp) { ext4_discard_allocated_blocks(ac); goto errout; } else { @@ -4579,6 +4575,7 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, { ext4_group_t group = e4b->bd_group; ext4_grpblk_t cluster; + ext4_grpblk_t clusters = new_entry->efd_count; struct ext4_free_data *entry; struct ext4_group_info *db = e4b->bd_info; struct super_block *sb = e4b->bd_sb; @@ -4645,8 +4642,11 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, } } /* Add the extent to transaction's private list */ - ext4_journal_callback_add(handle, ext4_free_data_callback, - &new_entry->efd_jce); + new_entry->efd_jce.jce_func = ext4_free_data_callback; + spin_lock(&sbi->s_md_lock); + _ext4_journal_callback_add(handle, &new_entry->efd_jce); + sbi->s_mb_free_pending += clusters; + spin_unlock(&sbi->s_md_lock); return 0; } @@ -4935,7 +4935,7 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, * boundary. */ if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) { - ext4_warning(sb, "too much blocks added to group %u\n", + ext4_warning(sb, "too much blocks added to group %u", block_group); err = -EINVAL; goto error_return; diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index 24445275d330..d89754ef1aab 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -52,7 +52,7 @@ static int write_mmp_block(struct super_block *sb, struct buffer_head *bh) lock_buffer(bh); bh->b_end_io = end_buffer_write_sync; get_bh(bh); - submit_bh(WRITE_SYNC | REQ_META | REQ_PRIO, bh); + submit_bh(REQ_OP_WRITE, WRITE_SYNC | REQ_META | REQ_PRIO, bh); wait_on_buffer(bh); sb_end_write(sb); if (unlikely(!buffer_uptodate(bh))) @@ -88,7 +88,7 @@ static int read_mmp_block(struct super_block *sb, struct buffer_head **bh, get_bh(*bh); lock_buffer(*bh); (*bh)->b_end_io = end_buffer_read_sync; - submit_bh(READ_SYNC | REQ_META | REQ_PRIO, *bh); + submit_bh(REQ_OP_READ, READ_SYNC | REQ_META | REQ_PRIO, *bh); wait_on_buffer(*bh); if (!buffer_uptodate(*bh)) { ret = -EIO; @@ -121,7 +121,7 @@ void __dump_mmp_msg(struct super_block *sb, struct mmp_struct *mmp, __ext4_warning(sb, function, line, "%s", msg); __ext4_warning(sb, function, line, "MMP failure info: last update time: %llu, last update " - "node: %s, last update device: %s\n", + "node: %s, last update device: %s", (long long unsigned int) le64_to_cpu(mmp->mmp_time), mmp->mmp_nodename, mmp->mmp_bdevname); } @@ -353,7 +353,7 @@ skip: * wait for MMP interval and check mmp_seq. 
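The new s_mb_free_pending counter above is updated under the same s_md_lock that protects the transaction's private list, so queuing freed clusters and accounting them as pending can never be observed out of step. A pthread-mutex analogue of that pairing; all names are made up (build with cc -pthread).

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t md_lock = PTHREAD_MUTEX_INITIALIZER;
static long free_pending;	/* clusters queued but not yet in buddy */

static void queue_free(long clusters)
{
	pthread_mutex_lock(&md_lock);
	/* list_add() of the free-data entry would go here */
	free_pending += clusters;
	pthread_mutex_unlock(&md_lock);
}

static void commit_callback(long clusters)
{
	pthread_mutex_lock(&md_lock);
	free_pending -= clusters;	/* clusters now really free */
	pthread_mutex_unlock(&md_lock);
}

int main(void)
{
	queue_free(8);
	printf("pending after queue: %ld\n", free_pending);
	commit_callback(8);
	printf("pending after commit: %ld\n", free_pending);
	return 0;
}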
*/ if (schedule_timeout_interruptible(HZ * wait_time) != 0) { - ext4_warning(sb, "MMP startup interrupted, failing mount\n"); + ext4_warning(sb, "MMP startup interrupted, failing mount"); goto failed; } diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 325cef48b39a..a920c5d29fac 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -400,7 +400,7 @@ data_copy: /* Even in case of data=writeback it is reasonable to pin * inode to transaction, to prevent unexpected data loss */ - *err = ext4_jbd2_file_inode(handle, orig_inode); + *err = ext4_jbd2_inode_add_write(handle, orig_inode); unlock_pages: unlock_page(pagep[0]); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 5611ec9348d7..34c0142caf6a 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -420,15 +420,14 @@ static __le32 ext4_dx_csum(struct inode *inode, struct ext4_dir_entry *dirent, struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); __u32 csum; - __le32 save_csum; int size; + __u32 dummy_csum = 0; + int offset = offsetof(struct dx_tail, dt_checksum); size = count_offset + (count * sizeof(struct dx_entry)); - save_csum = t->dt_checksum; - t->dt_checksum = 0; csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size); - csum = ext4_chksum(sbi, csum, (__u8 *)t, sizeof(struct dx_tail)); - t->dt_checksum = save_csum; + csum = ext4_chksum(sbi, csum, (__u8 *)t, offset); + csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, sizeof(dummy_csum)); return cpu_to_le32(csum); } @@ -446,14 +445,14 @@ static int ext4_dx_csum_verify(struct inode *inode, c = get_dx_countlimit(inode, dirent, &count_offset); if (!c) { EXT4_ERROR_INODE(inode, "dir seems corrupt? Run e2fsck -D."); - return 1; + return 0; } limit = le16_to_cpu(c->limit); count = le16_to_cpu(c->count); if (count_offset + (limit * sizeof(struct dx_entry)) > EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) { warn_no_space_for_csum(inode); - return 1; + return 0; } t = (struct dx_tail *)(((struct dx_entry *)c) + limit); @@ -612,19 +611,19 @@ static struct stats dx_show_leaf(struct inode *dir, #ifdef CONFIG_EXT4_FS_ENCRYPTION int len; char *name; - struct ext4_str fname_crypto_str - = {.name = NULL, .len = 0}; + struct fscrypt_str fname_crypto_str = + FSTR_INIT(NULL, 0); int res = 0; name = de->name; len = de->name_len; - if (ext4_encrypted_inode(inode)) - res = ext4_get_encryption_info(dir); + if (ext4_encrypted_inode(dir)) + res = fscrypt_get_encryption_info(dir); if (res) { printk(KERN_WARNING "Error setting up" " fname crypto: %d\n", res); } - if (ctx == NULL) { + if (!fscrypt_has_encryption_key(dir)) { /* Directory is not encrypted */ ext4fs_dirhash(de->name, de->name_len, &h); @@ -633,19 +632,21 @@ static struct stats dx_show_leaf(struct inode *dir, (unsigned) ((char *) de - base)); } else { + struct fscrypt_str de_name = + FSTR_INIT(name, len); + /* Directory is encrypted */ - res = ext4_fname_crypto_alloc_buffer( - ctx, de->name_len, + res = fscrypt_fname_alloc_buffer( + dir, len, &fname_crypto_str); - if (res < 0) { + if (res < 0) printk(KERN_WARNING "Error " "allocating crypto " "buffer--skipping " "crypto\n"); - ctx = NULL; - } - res = ext4_fname_disk_to_usr(ctx, NULL, de, - &fname_crypto_str); + res = fscrypt_fname_disk_to_usr(dir, + 0, 0, &de_name, + &fname_crypto_str); if (res < 0) { printk(KERN_WARNING "Error " "converting filename " @@ -662,8 +663,8 @@ static struct stats dx_show_leaf(struct inode *dir, printk("%*.s:(E)%x.%u ", len, name, h.hash, (unsigned) ((char *) de - base)); - 
ext4_fname_crypto_free_buffer( - &fname_crypto_str); + fscrypt_fname_free_buffer( + &fname_crypto_str); } #else int len = de->name_len; @@ -952,7 +953,7 @@ static int htree_dirblock_to_tree(struct file *dir_file, struct buffer_head *bh; struct ext4_dir_entry_2 *de, *top; int err = 0, count = 0; - struct ext4_str fname_crypto_str = {.name = NULL, .len = 0}, tmp_str; + struct fscrypt_str fname_crypto_str = FSTR_INIT(NULL, 0), tmp_str; dxtrace(printk(KERN_INFO "In htree dirblock_to_tree: block %lu\n", (unsigned long)block)); @@ -967,12 +968,12 @@ static int htree_dirblock_to_tree(struct file *dir_file, #ifdef CONFIG_EXT4_FS_ENCRYPTION /* Check if the directory is encrypted */ if (ext4_encrypted_inode(dir)) { - err = ext4_get_encryption_info(dir); + err = fscrypt_get_encryption_info(dir); if (err < 0) { brelse(bh); return err; } - err = ext4_fname_crypto_alloc_buffer(dir, EXT4_NAME_LEN, + err = fscrypt_fname_alloc_buffer(dir, EXT4_NAME_LEN, &fname_crypto_str); if (err < 0) { brelse(bh); @@ -1003,10 +1004,13 @@ static int htree_dirblock_to_tree(struct file *dir_file, &tmp_str); } else { int save_len = fname_crypto_str.len; + struct fscrypt_str de_name = FSTR_INIT(de->name, + de->name_len); /* Directory is encrypted */ - err = ext4_fname_disk_to_usr(dir, hinfo, de, - &fname_crypto_str); + err = fscrypt_fname_disk_to_usr(dir, hinfo->hash, + hinfo->minor_hash, &de_name, + &fname_crypto_str); if (err < 0) { count = err; goto errout; @@ -1025,7 +1029,7 @@ static int htree_dirblock_to_tree(struct file *dir_file, errout: brelse(bh); #ifdef CONFIG_EXT4_FS_ENCRYPTION - ext4_fname_crypto_free_buffer(&fname_crypto_str); + fscrypt_fname_free_buffer(&fname_crypto_str); #endif return count; } @@ -1050,7 +1054,7 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, int count = 0; int ret, err; __u32 hashval; - struct ext4_str tmp_str; + struct fscrypt_str tmp_str; dxtrace(printk(KERN_DEBUG "In htree_fill_tree, start hash: %x:%x\n", start_hash, start_minor_hash)); @@ -1107,6 +1111,11 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, } while (1) { + if (fatal_signal_pending(current)) { + err = -ERESTARTSYS; + goto errout; + } + cond_resched(); block = dx_get_block(frame->at); ret = htree_dirblock_to_tree(dir_file, dir, block, &hinfo, start_hash, start_minor_hash); @@ -1438,7 +1447,8 @@ restart: } bh_use[ra_max] = bh; if (bh) - ll_rw_block(READ | REQ_META | REQ_PRIO, + ll_rw_block(REQ_OP_READ, + REQ_META | REQ_PRIO, 1, &bh); } } @@ -1558,26 +1568,23 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi struct ext4_dir_entry_2 *de; struct buffer_head *bh; - if (ext4_encrypted_inode(dir)) { - int res = ext4_get_encryption_info(dir); + if (ext4_encrypted_inode(dir)) { + int res = fscrypt_get_encryption_info(dir); /* - * This should be a properly defined flag for - * dentry->d_flags when we uplift this to the VFS. - * d_fsdata is set to (void *) 1 if if the dentry is + * DCACHE_ENCRYPTED_WITH_KEY is set if the dentry is * created while the directory was encrypted and we - * don't have access to the key. + * have access to the key. 
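The ext4_htree_fill_tree() hunk above adds a fatal_signal_pending() check plus cond_resched() inside a potentially long loop, so a kill cannot hang on a huge directory. The userspace analogue of that pattern is a cancellation flag polled inside the loop; the flag and workload below are invented:

#include <signal.h>
#include <stdio.h>

static volatile sig_atomic_t stop_requested;

static void on_sigint(int sig)
{
	(void)sig;
	stop_requested = 1;	/* like fatal_signal_pending() going true */
}

int main(void)
{
	long i, done = 0;

	signal(SIGINT, on_sigint);
	for (i = 0; i < 100000000L; i++) {
		if (stop_requested) {
			puts("interrupted, returning early");
			return 1;	/* the kernel code returns -ERESTARTSYS */
		}
		done++;		/* one directory block of work */
	}
	printf("processed %ld blocks\n", done);
	return 0;
}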
*/ - dentry->d_fsdata = NULL; - if (ext4_encryption_info(dir)) - dentry->d_fsdata = (void *) 1; - d_set_d_op(dentry, &ext4_encrypted_d_ops); - if (res && res != -ENOKEY) - return ERR_PTR(res); - } + if (fscrypt_has_encryption_key(dir)) + fscrypt_set_encrypted_dentry(dentry); + fscrypt_set_d_op(dentry); + if (res && res != -ENOKEY) + return ERR_PTR(res); + } - if (dentry->d_name.len > EXT4_NAME_LEN) - return ERR_PTR(-ENAMETOOLONG); + if (dentry->d_name.len > EXT4_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); if (IS_ERR(bh)) @@ -1604,16 +1611,14 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi } if (!IS_ERR(inode) && ext4_encrypted_inode(dir) && (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) && - !ext4_is_child_context_consistent_with_parent(dir, - inode)) { + !fscrypt_has_permitted_context(dir, inode)) { int nokey = ext4_encrypted_inode(inode) && - !ext4_encryption_info(inode); - + !fscrypt_has_encryption_key(inode); iput(inode); if (nokey) return ERR_PTR(-ENOKEY); ext4_warning(inode->i_sb, - "Inconsistent encryption contexts: %lu/%lu\n", + "Inconsistent encryption contexts: %lu/%lu", (unsigned long) dir->i_ino, (unsigned long) inode->i_ino); return ERR_PTR(-EPERM); @@ -2685,30 +2690,30 @@ out_stop: /* * routine to check that the specified directory is empty (for rmdir) */ -int ext4_empty_dir(struct inode *inode) +bool ext4_empty_dir(struct inode *inode) { unsigned int offset; struct buffer_head *bh; struct ext4_dir_entry_2 *de, *de1; struct super_block *sb; - int err = 0; if (ext4_has_inline_data(inode)) { int has_inline_data = 1; + int ret; - err = empty_inline_dir(inode, &has_inline_data); + ret = empty_inline_dir(inode, &has_inline_data); if (has_inline_data) - return err; + return ret; } sb = inode->i_sb; if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2)) { EXT4_ERROR_INODE(inode, "invalid size"); - return 1; + return true; } bh = ext4_read_dirblock(inode, 0, EITHER); if (IS_ERR(bh)) - return 1; + return true; de = (struct ext4_dir_entry_2 *) bh->b_data; de1 = ext4_next_entry(de, sb->s_blocksize); @@ -2717,7 +2722,7 @@ int ext4_empty_dir(struct inode *inode) strcmp(".", de->name) || strcmp("..", de1->name)) { ext4_warning_inode(inode, "directory missing '.' and/or '..'"); brelse(bh); - return 1; + return true; } offset = ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize) + ext4_rec_len_from_disk(de1->rec_len, sb->s_blocksize); @@ -2725,12 +2730,11 @@ int ext4_empty_dir(struct inode *inode) while (offset < inode->i_size) { if ((void *) de >= (void *) (bh->b_data+sb->s_blocksize)) { unsigned int lblock; - err = 0; brelse(bh); lblock = offset >> EXT4_BLOCK_SIZE_BITS(sb); bh = ext4_read_dirblock(inode, lblock, EITHER); if (IS_ERR(bh)) - return 1; + return true; de = (struct ext4_dir_entry_2 *) bh->b_data; } if (ext4_check_dir_entry(inode, NULL, de, bh, @@ -2742,13 +2746,13 @@ int ext4_empty_dir(struct inode *inode) } if (le32_to_cpu(de->inode)) { brelse(bh); - return 0; + return false; } offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize); de = ext4_next_entry(de, sb->s_blocksize); } brelse(bh); - return 1; + return true; } /* @@ -2828,7 +2832,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) * list entries can cause panics at unmount time. 
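On the ext4_empty_dir() conversion above: the inline-dir branch keeps its int result in a separate variable because implicitly converting a negative errno to bool would turn any error into "true". Two lines demonstrate the conversion rule the patch has to respect:

#include <stdbool.h>
#include <stdio.h>

static bool as_bool(int err)
{
	return err;	/* any non-zero int, including -EIO, becomes true */
}

int main(void)
{
	printf("as_bool(0)  = %d\n", as_bool(0));	/* false */
	printf("as_bool(-5) = %d\n", as_bool(-5));	/* true(!) */
	return 0;
}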
*/ mutex_lock(&sbi->s_orphan_lock); - list_del(&EXT4_I(inode)->i_orphan); + list_del_init(&EXT4_I(inode)->i_orphan); mutex_unlock(&sbi->s_orphan_lock); } } @@ -3071,8 +3075,8 @@ static int ext4_symlink(struct inode *dir, int err, len = strlen(symname); int credits; bool encryption_required; - struct ext4_str disk_link; - struct ext4_encrypted_symlink_data *sd = NULL; + struct fscrypt_str disk_link; + struct fscrypt_symlink_data *sd = NULL; disk_link.len = len + 1; disk_link.name = (char *) symname; @@ -3080,13 +3084,13 @@ static int ext4_symlink(struct inode *dir, encryption_required = (ext4_encrypted_inode(dir) || DUMMY_ENCRYPTION_ENABLED(EXT4_SB(dir->i_sb))); if (encryption_required) { - err = ext4_get_encryption_info(dir); + err = fscrypt_get_encryption_info(dir); if (err) return err; - if (ext4_encryption_info(dir) == NULL) + if (!fscrypt_has_encryption_key(dir)) return -EPERM; - disk_link.len = (ext4_fname_encrypted_size(dir, len) + - sizeof(struct ext4_encrypted_symlink_data)); + disk_link.len = (fscrypt_fname_encrypted_size(dir, len) + + sizeof(struct fscrypt_symlink_data)); sd = kzalloc(disk_link.len, GFP_KERNEL); if (!sd) return -ENOMEM; @@ -3134,13 +3138,12 @@ static int ext4_symlink(struct inode *dir, if (encryption_required) { struct qstr istr; - struct ext4_str ostr; + struct fscrypt_str ostr = + FSTR_INIT(sd->encrypted_path, disk_link.len); istr.name = (const unsigned char *) symname; istr.len = len; - ostr.name = sd->encrypted_path; - ostr.len = disk_link.len; - err = ext4_fname_usr_to_disk(inode, &istr, &ostr); + err = fscrypt_fname_usr_to_disk(inode, &istr, &ostr); if (err < 0) goto err_drop_inode; sd->len = cpu_to_le16(ostr.len); @@ -3229,7 +3232,7 @@ static int ext4_link(struct dentry *old_dentry, if (inode->i_nlink >= EXT4_LINK_MAX) return -EMLINK; if (ext4_encrypted_inode(dir) && - !ext4_is_child_context_consistent_with_parent(dir, inode)) + !fscrypt_has_permitted_context(dir, inode)) return -EPERM; if ((ext4_test_inode_flag(dir, EXT4_INODE_PROJINHERIT)) && @@ -3552,8 +3555,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, if ((old.dir != new.dir) && ext4_encrypted_inode(new.dir) && - !ext4_is_child_context_consistent_with_parent(new.dir, - old.inode)) { + !fscrypt_has_permitted_context(new.dir, old.inode)) { retval = -EPERM; goto end_rename; } @@ -3725,10 +3727,8 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry, if ((ext4_encrypted_inode(old_dir) || ext4_encrypted_inode(new_dir)) && (old_dir != new_dir) && - (!ext4_is_child_context_consistent_with_parent(new_dir, - old.inode) || - !ext4_is_child_context_consistent_with_parent(old_dir, - new.inode))) + (!fscrypt_has_permitted_context(new_dir, old.inode) || + !fscrypt_has_permitted_context(old_dir, new.inode))) return -EPERM; if ((ext4_test_inode_flag(new_dir, EXT4_INODE_PROJINHERIT) && diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index e4fc8ea45d78..a6132a730967 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -24,6 +24,7 @@ #include <linux/slab.h> #include <linux/mm.h> #include <linux/backing-dev.h> +#include <linux/fscrypto.h> #include "ext4_jbd2.h" #include "xattr.h" @@ -67,7 +68,6 @@ static void ext4_finish_bio(struct bio *bio) struct page *page = bvec->bv_page; #ifdef CONFIG_EXT4_FS_ENCRYPTION struct page *data_page = NULL; - struct ext4_crypto_ctx *ctx = NULL; #endif struct buffer_head *bh, *head; unsigned bio_start = bvec->bv_offset; @@ -82,8 +82,7 @@ static void ext4_finish_bio(struct bio *bio) if (!page->mapping) { /* The bounce data 
pages are unmapped. */ data_page = page; - ctx = (struct ext4_crypto_ctx *)page_private(data_page); - page = ctx->w.control_page; + fscrypt_pullback_bio_page(&page, false); } #endif @@ -113,8 +112,8 @@ static void ext4_finish_bio(struct bio *bio) local_irq_restore(flags); if (!under_io) { #ifdef CONFIG_EXT4_FS_ENCRYPTION - if (ctx) - ext4_restore_control_page(data_page); + if (data_page) + fscrypt_restore_control_page(data_page); #endif end_page_writeback(page); } @@ -340,11 +339,10 @@ void ext4_io_submit(struct ext4_io_submit *io) struct bio *bio = io->io_bio; if (bio) { - int io_op = io->io_wbc->sync_mode == WB_SYNC_ALL ? - WRITE_SYNC : WRITE; - bio_get(io->io_bio); - submit_bio(io_op, io->io_bio); - bio_put(io->io_bio); + int io_op_flags = io->io_wbc->sync_mode == WB_SYNC_ALL ? + WRITE_SYNC : 0; + bio_set_op_attrs(io->io_bio, REQ_OP_WRITE, io_op_flags); + submit_bio(io->io_bio); } io->io_bio = NULL; } @@ -474,7 +472,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io, gfp_t gfp_flags = GFP_NOFS; retry_encrypt: - data_page = ext4_encrypt(inode, page, gfp_flags); + data_page = fscrypt_encrypt_page(inode, page, gfp_flags); if (IS_ERR(data_page)) { ret = PTR_ERR(data_page); if (ret == -ENOMEM && wbc->sync_mode == WB_SYNC_ALL) { @@ -512,7 +510,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io, if (ret) { out: if (data_page) - ext4_restore_control_page(data_page); + fscrypt_restore_control_page(data_page); printk_ratelimited(KERN_ERR "%s: ret = %d\n", __func__, ret); redirty_page_for_writepage(wbc, page); do { diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index dc54a4b60eba..a81b829d56de 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -46,37 +46,6 @@ #include "ext4.h" -/* - * Call ext4_decrypt on every single page, reusing the encryption - * context. 
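Several hunks in this commit (submit_bh(), ext4_io_submit(), the f2fs fio fields further down) split the old combined rw argument into an operation (REQ_OP_READ/REQ_OP_WRITE) plus separate flags. One way to model such a split is to pack the op into high bits of a single word, as sketched below; the bit layout here is invented for illustration, not the block layer's actual encoding.

#include <stdio.h>

#define OP_SHIFT	28
#define OP_READ		0u
#define OP_WRITE	1u
#define FLAG_META	(1u << 0)
#define FLAG_PRIO	(1u << 1)

/* bio_set_op_attrs() analogue: combine op and flags into one field */
static unsigned int set_op_attrs(unsigned int op, unsigned int flags)
{
	return (op << OP_SHIFT) | flags;
}

int main(void)
{
	unsigned int cmd = set_op_attrs(OP_WRITE, FLAG_META | FLAG_PRIO);

	printf("op=%u flags=%#x\n", cmd >> OP_SHIFT,
	       cmd & ((1u << OP_SHIFT) - 1));
	return 0;
}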
- */ -static void completion_pages(struct work_struct *work) -{ -#ifdef CONFIG_EXT4_FS_ENCRYPTION - struct ext4_crypto_ctx *ctx = - container_of(work, struct ext4_crypto_ctx, r.work); - struct bio *bio = ctx->r.bio; - struct bio_vec *bv; - int i; - - bio_for_each_segment_all(bv, bio, i) { - struct page *page = bv->bv_page; - - int ret = ext4_decrypt(page); - if (ret) { - WARN_ON_ONCE(1); - SetPageError(page); - } else - SetPageUptodate(page); - unlock_page(page); - } - ext4_release_crypto_ctx(ctx); - bio_put(bio); -#else - BUG(); -#endif -} - static inline bool ext4_bio_encrypted(struct bio *bio) { #ifdef CONFIG_EXT4_FS_ENCRYPTION @@ -104,14 +73,10 @@ static void mpage_end_io(struct bio *bio) int i; if (ext4_bio_encrypted(bio)) { - struct ext4_crypto_ctx *ctx = bio->bi_private; - if (bio->bi_error) { - ext4_release_crypto_ctx(ctx); + fscrypt_release_ctx(bio->bi_private); } else { - INIT_WORK(&ctx->r.work, completion_pages); - ctx->r.bio = bio; - queue_work(ext4_read_workqueue, &ctx->r.work); + fscrypt_decrypt_bio_pages(bio->bi_private, bio); return; } } @@ -135,7 +100,6 @@ int ext4_mpage_readpages(struct address_space *mapping, unsigned nr_pages) { struct bio *bio = NULL; - unsigned page_idx; sector_t last_block_in_bio = 0; struct inode *inode = mapping->host; @@ -157,7 +121,7 @@ int ext4_mpage_readpages(struct address_space *mapping, map.m_len = 0; map.m_flags = 0; - for (page_idx = 0; nr_pages; page_idx++, nr_pages--) { + for (; nr_pages; nr_pages--) { int fully_mapped = 1; unsigned first_hole = blocks_per_page; @@ -166,7 +130,7 @@ int ext4_mpage_readpages(struct address_space *mapping, page = list_entry(pages->prev, struct page, lru); list_del(&page->lru); if (add_to_page_cache_lru(page, mapping, page->index, - mapping_gfp_constraint(mapping, GFP_KERNEL))) + readahead_gfp_mask(mapping))) goto next_page; } @@ -271,15 +235,15 @@ int ext4_mpage_readpages(struct address_space *mapping, */ if (bio && (last_block_in_bio != blocks[0] - 1)) { submit_and_realloc: - submit_bio(READ, bio); + submit_bio(bio); bio = NULL; } if (bio == NULL) { - struct ext4_crypto_ctx *ctx = NULL; + struct fscrypt_ctx *ctx = NULL; if (ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode)) { - ctx = ext4_get_crypto_ctx(inode, GFP_NOFS); + ctx = fscrypt_get_ctx(inode, GFP_NOFS); if (IS_ERR(ctx)) goto set_error_page; } @@ -287,13 +251,14 @@ int ext4_mpage_readpages(struct address_space *mapping, min_t(int, nr_pages, BIO_MAX_PAGES)); if (!bio) { if (ctx) - ext4_release_crypto_ctx(ctx); + fscrypt_release_ctx(ctx); goto set_error_page; } bio->bi_bdev = bdev; bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9); bio->bi_end_io = mpage_end_io; bio->bi_private = ctx; + bio_set_op_attrs(bio, REQ_OP_READ, 0); } length = first_hole << blkbits; @@ -303,14 +268,14 @@ int ext4_mpage_readpages(struct address_space *mapping, if (((map.m_flags & EXT4_MAP_BOUNDARY) && (relative_block == map.m_len)) || (first_hole != blocks_per_page)) { - submit_bio(READ, bio); + submit_bio(bio); bio = NULL; } else last_block_in_bio = blocks[blocks_per_page - 1]; goto next_page; confused: if (bio) { - submit_bio(READ, bio); + submit_bio(bio); bio = NULL; } if (!PageUptodate(page)) @@ -323,6 +288,6 @@ int ext4_mpage_readpages(struct address_space *mapping, } BUG_ON(pages && !list_empty(pages)); if (bio) - submit_bio(READ, bio); + submit_bio(bio); return 0; } diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 34038e3598d5..cf681004b196 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -41,7 +41,7 @@ int ext4_resize_begin(struct super_block *sb) 
*/ if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { ext4_warning(sb, "There are errors in the filesystem, " - "so online resizing is not allowed\n"); + "so online resizing is not allowed"); return -EPERM; } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 304c712dbe12..1c593aa0218e 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -859,6 +859,7 @@ static void ext4_put_super(struct super_block *sb) percpu_counter_destroy(&sbi->s_freeinodes_counter); percpu_counter_destroy(&sbi->s_dirs_counter); percpu_counter_destroy(&sbi->s_dirtyclusters_counter); + percpu_free_rwsem(&sbi->s_journal_flag_rwsem); brelse(sbi->s_sbh); #ifdef CONFIG_QUOTA for (i = 0; i < EXT4_MAXQUOTAS; i++) @@ -944,9 +945,6 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) ei->i_datasync_tid = 0; atomic_set(&ei->i_unwritten, 0); INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work); -#ifdef CONFIG_EXT4_FS_ENCRYPTION - ei->i_crypt_info = NULL; -#endif return &ei->vfs_inode; } @@ -1025,8 +1023,7 @@ void ext4_clear_inode(struct inode *inode) EXT4_I(inode)->jinode = NULL; } #ifdef CONFIG_EXT4_FS_ENCRYPTION - if (EXT4_I(inode)->i_crypt_info) - ext4_free_encryption_info(inode, EXT4_I(inode)->i_crypt_info); + fscrypt_put_encryption_info(inode, NULL); #endif } @@ -1093,6 +1090,90 @@ static int bdev_try_to_free_page(struct super_block *sb, struct page *page, return try_to_free_buffers(page); } +#ifdef CONFIG_EXT4_FS_ENCRYPTION +static int ext4_get_context(struct inode *inode, void *ctx, size_t len) +{ + return ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION, + EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, ctx, len); +} + +static int ext4_key_prefix(struct inode *inode, u8 **key) +{ + *key = EXT4_SB(inode->i_sb)->key_prefix; + return EXT4_SB(inode->i_sb)->key_prefix_size; +} + +static int ext4_prepare_context(struct inode *inode) +{ + return ext4_convert_inline_data(inode); +} + +static int ext4_set_context(struct inode *inode, const void *ctx, size_t len, + void *fs_data) +{ + handle_t *handle; + int res, res2; + + /* fs_data is null when internally used. */ + if (fs_data) { + res = ext4_xattr_set(inode, EXT4_XATTR_INDEX_ENCRYPTION, + EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, ctx, + len, 0); + if (!res) { + ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT); + ext4_clear_inode_state(inode, + EXT4_STATE_MAY_INLINE_DATA); + } + return res; + } + + handle = ext4_journal_start(inode, EXT4_HT_MISC, + ext4_jbd2_credits_xattr(inode)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + res = ext4_xattr_set(inode, EXT4_XATTR_INDEX_ENCRYPTION, + EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, ctx, + len, 0); + if (!res) { + ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT); + res = ext4_mark_inode_dirty(handle, inode); + if (res) + EXT4_ERROR_INODE(inode, "Failed to mark inode dirty"); + } + res2 = ext4_journal_stop(handle); + if (!res) + res = res2; + return res; +} + +static int ext4_dummy_context(struct inode *inode) +{ + return DUMMY_ENCRYPTION_ENABLED(EXT4_SB(inode->i_sb)); +} + +static unsigned ext4_max_namelen(struct inode *inode) +{ + return S_ISLNK(inode->i_mode) ? 
inode->i_sb->s_blocksize : + EXT4_NAME_LEN; +} + +static struct fscrypt_operations ext4_cryptops = { + .get_context = ext4_get_context, + .key_prefix = ext4_key_prefix, + .prepare_context = ext4_prepare_context, + .set_context = ext4_set_context, + .dummy_context = ext4_dummy_context, + .is_encrypted = ext4_encrypted_inode, + .empty_dir = ext4_empty_dir, + .max_namelen = ext4_max_namelen, +}; +#else +static struct fscrypt_operations ext4_cryptops = { + .is_encrypted = ext4_encrypted_inode, +}; +#endif + #ifdef CONFIG_QUOTA static char *quotatypes[] = INITQFNAMES; #define QTYPE2NAME(t) (quotatypes[t]) @@ -2067,23 +2148,25 @@ failed: static __le16 ext4_group_desc_csum(struct super_block *sb, __u32 block_group, struct ext4_group_desc *gdp) { - int offset; + int offset = offsetof(struct ext4_group_desc, bg_checksum); __u16 crc = 0; __le32 le_group = cpu_to_le32(block_group); struct ext4_sb_info *sbi = EXT4_SB(sb); if (ext4_has_metadata_csum(sbi->s_sb)) { /* Use new metadata_csum algorithm */ - __le16 save_csum; __u32 csum32; + __u16 dummy_csum = 0; - save_csum = gdp->bg_checksum; - gdp->bg_checksum = 0; csum32 = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&le_group, sizeof(le_group)); - csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp, - sbi->s_desc_size); - gdp->bg_checksum = save_csum; + csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp, offset); + csum32 = ext4_chksum(sbi, csum32, (__u8 *)&dummy_csum, + sizeof(dummy_csum)); + offset += sizeof(dummy_csum); + if (offset < sbi->s_desc_size) + csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp + offset, + sbi->s_desc_size - offset); crc = csum32 & 0xFFFF; goto out; @@ -2093,8 +2176,6 @@ static __le16 ext4_group_desc_csum(struct super_block *sb, __u32 block_group, if (!ext4_has_feature_gdt_csum(sb)) return 0; - offset = offsetof(struct ext4_group_desc, bg_checksum); - crc = crc16(~0, sbi->s_es->s_uuid, sizeof(sbi->s_es->s_uuid)); crc = crc16(crc, (__u8 *)&le_group, sizeof(le_group)); crc = crc16(crc, (__u8 *)gdp, offset); @@ -2277,6 +2358,16 @@ static void ext4_orphan_cleanup(struct super_block *sb, while (es->s_last_orphan) { struct inode *inode; + /* + * We may have encountered an error during cleanup; if + * so, skip the rest. 
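The checksum rework above (ext4_group_desc_csum(), and the dx tail and xattr header variants elsewhere in this commit) feeds a zeroed dummy field into the hash instead of temporarily clearing the field in the shared buffer, so the buffer is never written while other readers may be checksumming it. A runnable sketch of the technique, with a toy hash standing in for ext4's crc32c:

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

struct desc {
	uint32_t a;
	uint16_t checksum;
	uint16_t b;
};

static uint32_t chksum(uint32_t seed, const void *p, size_t len)
{
	const unsigned char *c = p;

	while (len--)
		seed = seed * 31 + *c++;	/* toy hash, not crc32c */
	return seed;
}

static uint32_t desc_csum(const struct desc *d)
{
	size_t off = offsetof(struct desc, checksum);
	uint16_t dummy = 0;
	uint32_t csum;

	csum = chksum(~0u, d, off);			/* bytes before field */
	csum = chksum(csum, &dummy, sizeof(dummy));	/* field as zero */
	off += sizeof(dummy);
	csum = chksum(csum, (const char *)d + off,	/* bytes after field */
		      sizeof(*d) - off);
	return csum;	/* note: *d was never modified */
}

int main(void)
{
	struct desc d = { 1, 0xffff, 2 };	/* stored checksum is ignored */

	printf("csum=%#x\n", desc_csum(&d));
	return 0;
}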
+ */ + if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { + jbd_debug(1, "Skipping orphan recovery on fs with errors.\n"); + es->s_last_orphan = 0; + break; + } + inode = ext4_orphan_get(sb, le32_to_cpu(es->s_last_orphan)); if (IS_ERR(inode)) { es->s_last_orphan = 0; @@ -3415,17 +3506,17 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount; } + if (le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) > (blocksize / 4)) { + ext4_msg(sb, KERN_ERR, + "Number of reserved GDT blocks insanely large: %d", + le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks)); + goto failed_mount; + } + if (sbi->s_mount_opt & EXT4_MOUNT_DAX) { - if (blocksize != PAGE_SIZE) { - ext4_msg(sb, KERN_ERR, - "error: unsupported blocksize for dax"); - goto failed_mount; - } - if (!sb->s_bdev->bd_disk->fops->direct_access) { - ext4_msg(sb, KERN_ERR, - "error: device does not support dax"); + err = bdev_dax_supported(sb, blocksize); + if (err) goto failed_mount; - } } if (ext4_has_feature_encrypt(sb) && es->s_encryption_level) { @@ -3692,6 +3783,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sb->s_op = &ext4_sops; sb->s_export_op = &ext4_export_ops; sb->s_xattr = ext4_xattr_handlers; + sb->s_cop = &ext4_cryptops; #ifdef CONFIG_QUOTA sb->dq_op = &ext4_quota_operations; if (ext4_has_feature_quota(sb)) @@ -3930,6 +4022,9 @@ no_journal: if (!err) err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0, GFP_KERNEL); + if (!err) + err = percpu_init_rwsem(&sbi->s_journal_flag_rwsem); + if (err) { ext4_msg(sb, KERN_ERR, "insufficient memory"); goto failed_mount6; @@ -3999,6 +4094,11 @@ no_journal: ratelimit_state_init(&sbi->s_msg_ratelimit_state, 5 * HZ, 10); kfree(orig_data); +#ifdef CONFIG_EXT4_FS_ENCRYPTION + memcpy(sbi->key_prefix, EXT4_KEY_DESC_PREFIX, + EXT4_KEY_DESC_PREFIX_SIZE); + sbi->key_prefix_size = EXT4_KEY_DESC_PREFIX_SIZE; +#endif return 0; cantfind_ext4: @@ -4207,7 +4307,7 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb, goto out_bdev; } journal->j_private = sb; - ll_rw_block(READ | REQ_META | REQ_PRIO, 1, &journal->j_sb_buffer); + ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1, &journal->j_sb_buffer); wait_on_buffer(journal->j_sb_buffer); if (!buffer_uptodate(journal->j_sb_buffer)) { ext4_msg(sb, KERN_ERR, "I/O error on journal device"); @@ -4330,20 +4430,6 @@ static int ext4_commit_super(struct super_block *sb, int sync) if (!sbh || block_device_ejected(sb)) return error; - if (buffer_write_io_error(sbh)) { - /* - * Oh, dear. A previous attempt to write the - * superblock failed. This could happen because the - * USB device was yanked out. Or it could happen to - * be a transient write error and maybe the block will - * be remapped. Nothing we can do but to retry the - * write and hope for the best. - */ - ext4_msg(sb, KERN_ERR, "previous I/O error to " - "superblock detected"); - clear_buffer_write_io_error(sbh); - set_buffer_uptodate(sbh); - } /* * If the file system is mounted read-only, don't update the * superblock write time. This avoids updating the superblock @@ -4374,7 +4460,23 @@ static int ext4_commit_super(struct super_block *sb, int sync) &EXT4_SB(sb)->s_freeinodes_counter)); BUFFER_TRACE(sbh, "marking dirty"); ext4_superblock_csum_set(sb); + lock_buffer(sbh); + if (buffer_write_io_error(sbh)) { + /* + * Oh, dear. A previous attempt to write the + * superblock failed. This could happen because the + * USB device was yanked out. 
Or it could happen to + * be a transient write error and maybe the block will + * be remapped. Nothing we can do but to retry the + * write and hope for the best. + */ + ext4_msg(sb, KERN_ERR, "previous I/O error to " + "superblock detected"); + clear_buffer_write_io_error(sbh); + set_buffer_uptodate(sbh); + } mark_buffer_dirty(sbh); + unlock_buffer(sbh); if (sync) { error = __sync_dirty_buffer(sbh, test_opt(sb, BARRIER) ? WRITE_FUA : WRITE_SYNC); @@ -5425,7 +5527,6 @@ out5: static void __exit ext4_exit_fs(void) { - ext4_exit_crypto(); ext4_destroy_lazyinit_thread(); unregister_as_ext2(); unregister_as_ext3(); diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c index 75ed5c2f0c16..4d83d9e05f2e 100644 --- a/fs/ext4/symlink.c +++ b/fs/ext4/symlink.c @@ -22,23 +22,22 @@ #include "ext4.h" #include "xattr.h" -#ifdef CONFIG_EXT4_FS_ENCRYPTION static const char *ext4_encrypted_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { struct page *cpage = NULL; char *caddr, *paddr = NULL; - struct ext4_str cstr, pstr; - struct ext4_encrypted_symlink_data *sd; + struct fscrypt_str cstr, pstr; + struct fscrypt_symlink_data *sd; loff_t size = min_t(loff_t, i_size_read(inode), PAGE_SIZE - 1); int res; - u32 plen, max_size = inode->i_sb->s_blocksize; + u32 max_size = inode->i_sb->s_blocksize; if (!dentry) return ERR_PTR(-ECHILD); - res = ext4_get_encryption_info(inode); + res = fscrypt_get_encryption_info(inode); if (res) return ERR_PTR(res); @@ -54,30 +53,27 @@ static const char *ext4_encrypted_get_link(struct dentry *dentry, } /* Symlink is encrypted */ - sd = (struct ext4_encrypted_symlink_data *)caddr; + sd = (struct fscrypt_symlink_data *)caddr; cstr.name = sd->encrypted_path; cstr.len = le16_to_cpu(sd->len); - if ((cstr.len + - sizeof(struct ext4_encrypted_symlink_data) - 1) > - max_size) { + if ((cstr.len + sizeof(struct fscrypt_symlink_data) - 1) > max_size) { /* Symlink data on the disk is corrupted */ res = -EFSCORRUPTED; goto errout; } - plen = (cstr.len < EXT4_FNAME_CRYPTO_DIGEST_SIZE*2) ? 
- EXT4_FNAME_CRYPTO_DIGEST_SIZE*2 : cstr.len; - paddr = kmalloc(plen + 1, GFP_NOFS); - if (!paddr) { - res = -ENOMEM; + + res = fscrypt_fname_alloc_buffer(inode, cstr.len, &pstr); + if (res) goto errout; - } - pstr.name = paddr; - pstr.len = plen; - res = _ext4_fname_disk_to_usr(inode, NULL, &cstr, &pstr); + + res = fscrypt_fname_disk_to_usr(inode, 0, 0, &cstr, &pstr); if (res < 0) goto errout; + + paddr = pstr.name; + /* Null-terminate the name */ - if (res <= plen) + if (res <= pstr.len) paddr[res] = '\0'; if (cpage) put_page(cpage); @@ -99,7 +95,6 @@ const struct inode_operations ext4_encrypted_symlink_inode_operations = { .listxattr = ext4_listxattr, .removexattr = generic_removexattr, }; -#endif const struct inode_operations ext4_symlink_inode_operations = { .readlink = generic_readlink, diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index 1420a3c614af..73bcfd41f5f2 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -359,7 +359,6 @@ static int name##_open(struct inode *inode, struct file *file) \ } \ \ static const struct file_operations ext4_seq_##name##_fops = { \ - .owner = THIS_MODULE, \ .open = name##_open, \ .read = seq_read, \ .llseek = seq_lseek, \ diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index e79bd32b9b79..39e9cfb1b371 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -121,17 +121,18 @@ static __le32 ext4_xattr_block_csum(struct inode *inode, { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); __u32 csum; - __le32 save_csum; __le64 dsk_block_nr = cpu_to_le64(block_nr); + __u32 dummy_csum = 0; + int offset = offsetof(struct ext4_xattr_header, h_checksum); - save_csum = hdr->h_checksum; - hdr->h_checksum = 0; csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&dsk_block_nr, sizeof(dsk_block_nr)); - csum = ext4_chksum(sbi, csum, (__u8 *)hdr, - EXT4_BLOCK_SIZE(inode->i_sb)); + csum = ext4_chksum(sbi, csum, (__u8 *)hdr, offset); + csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, sizeof(dummy_csum)); + offset += sizeof(dummy_csum); + csum = ext4_chksum(sbi, csum, (__u8 *)hdr + offset, + EXT4_BLOCK_SIZE(inode->i_sb) - offset); - hdr->h_checksum = save_csum; return cpu_to_le32(csum); } diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c index 123a7d010efe..a8921112030d 100644 --- a/fs/ext4/xattr_security.c +++ b/fs/ext4/xattr_security.c @@ -22,10 +22,11 @@ ext4_xattr_security_get(const struct xattr_handler *handler, static int ext4_xattr_security_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { - return ext4_xattr_set(d_inode(dentry), EXT4_XATTR_INDEX_SECURITY, + return ext4_xattr_set(inode, EXT4_XATTR_INDEX_SECURITY, name, value, size, flags); } diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c index 60652fa24cbc..c7765c735714 100644 --- a/fs/ext4/xattr_trusted.c +++ b/fs/ext4/xattr_trusted.c @@ -29,10 +29,11 @@ ext4_xattr_trusted_get(const struct xattr_handler *handler, static int ext4_xattr_trusted_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { - return ext4_xattr_set(d_inode(dentry), EXT4_XATTR_INDEX_TRUSTED, + return ext4_xattr_set(inode, EXT4_XATTR_INDEX_TRUSTED, name, value, size, flags); } diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c index 
17a446ffecd3..ca20e423034b 100644 --- a/fs/ext4/xattr_user.c +++ b/fs/ext4/xattr_user.c @@ -30,12 +30,13 @@ ext4_xattr_user_get(const struct xattr_handler *handler, static int ext4_xattr_user_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { - if (!test_opt(dentry->d_sb, XATTR_USER)) + if (!test_opt(inode->i_sb, XATTR_USER)) return -EOPNOTSUPP; - return ext4_xattr_set(d_inode(dentry), EXT4_XATTR_INDEX_USER, + return ext4_xattr_set(inode, EXT4_XATTR_INDEX_USER, name, value, size, flags); } diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig index 1f8982a957f1..378c221d68a9 100644 --- a/fs/f2fs/Kconfig +++ b/fs/f2fs/Kconfig @@ -94,3 +94,11 @@ config F2FS_IO_TRACE information and block IO patterns in the filesystem level. If unsure, say N. + +config F2FS_FAULT_INJECTION + bool "F2FS fault injection facility" + depends on F2FS_FS + help + Test F2FS to inject faults such as ENOMEM, ENOSPC, and so on. + + If unsure, say N. diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index 6f1fdda977b3..4dcc9e28dc5c 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -115,7 +115,7 @@ static void *f2fs_acl_to_disk(const struct posix_acl *acl, size_t *size) struct f2fs_acl_entry *entry; int i; - f2fs_acl = kmalloc(sizeof(struct f2fs_acl_header) + acl->a_count * + f2fs_acl = f2fs_kmalloc(sizeof(struct f2fs_acl_header) + acl->a_count * sizeof(struct f2fs_acl_entry), GFP_NOFS); if (!f2fs_acl) return ERR_PTR(-ENOMEM); @@ -175,7 +175,7 @@ static struct posix_acl *__f2fs_get_acl(struct inode *inode, int type, retval = f2fs_getxattr(inode, name_index, "", NULL, 0, dpage); if (retval > 0) { - value = kmalloc(retval, GFP_F2FS_ZERO); + value = f2fs_kmalloc(retval, GFP_F2FS_ZERO); if (!value) return ERR_PTR(-ENOMEM); retval = f2fs_getxattr(inode, name_index, "", value, @@ -201,7 +201,6 @@ struct posix_acl *f2fs_get_acl(struct inode *inode, int type) static int __f2fs_set_acl(struct inode *inode, int type, struct posix_acl *acl, struct page *ipage) { - struct f2fs_inode_info *fi = F2FS_I(inode); int name_index; void *value = NULL; size_t size = 0; @@ -214,7 +213,7 @@ static int __f2fs_set_acl(struct inode *inode, int type, error = posix_acl_equiv_mode(acl, &inode->i_mode); if (error < 0) return error; - set_acl_inode(fi, inode->i_mode); + set_acl_inode(inode, inode->i_mode); if (error == 0) acl = NULL; } @@ -233,7 +232,7 @@ static int __f2fs_set_acl(struct inode *inode, int type, if (acl) { value = f2fs_acl_to_disk(acl, &size); if (IS_ERR(value)) { - clear_inode_flag(fi, FI_ACL_MODE); + clear_inode_flag(inode, FI_ACL_MODE); return (int)PTR_ERR(value); } } @@ -244,7 +243,7 @@ static int __f2fs_set_acl(struct inode *inode, int type, if (!error) set_cached_acl(inode, type, acl); - clear_inode_flag(fi, FI_ACL_MODE); + clear_inode_flag(inode, FI_ACL_MODE); return error; } @@ -385,6 +384,8 @@ int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage, if (error) return error; + f2fs_mark_inode_dirty_sync(inode); + if (default_acl) { error = __f2fs_set_acl(inode, ACL_TYPE_DEFAULT, default_acl, ipage); diff --git a/fs/f2fs/acl.h b/fs/f2fs/acl.h index 997ca8edb6cb..b2334d11dae8 100644 --- a/fs/f2fs/acl.h +++ b/fs/f2fs/acl.h @@ -37,7 +37,7 @@ struct f2fs_acl_header { #ifdef CONFIG_F2FS_FS_POSIX_ACL extern struct posix_acl *f2fs_get_acl(struct inode *, int); -extern int f2fs_set_acl(struct inode *inode, struct posix_acl *acl, int type); 
+extern int f2fs_set_acl(struct inode *, struct posix_acl *, int); extern int f2fs_init_acl(struct inode *, struct inode *, struct page *, struct page *); #else diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 0955312e5ca0..f94d01e7d001 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -26,6 +26,14 @@ static struct kmem_cache *ino_entry_slab; struct kmem_cache *inode_entry_slab; +void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io) +{ + set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG); + sbi->sb->s_flags |= MS_RDONLY; + if (!end_io) + f2fs_flush_merged_bios(sbi); +} + /* * We guarantee no failure on the returned page. */ @@ -34,13 +42,14 @@ struct page *grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index) struct address_space *mapping = META_MAPPING(sbi); struct page *page = NULL; repeat: - page = grab_cache_page(mapping, index); + page = f2fs_grab_cache_page(mapping, index, false); if (!page) { cond_resched(); goto repeat; } f2fs_wait_on_page_writeback(page, META, true); - SetPageUptodate(page); + if (!PageUptodate(page)) + SetPageUptodate(page); return page; } @@ -55,16 +64,17 @@ static struct page *__get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index, struct f2fs_io_info fio = { .sbi = sbi, .type = META, - .rw = READ_SYNC | REQ_META | REQ_PRIO, + .op = REQ_OP_READ, + .op_flags = READ_SYNC | REQ_META | REQ_PRIO, .old_blkaddr = index, .new_blkaddr = index, .encrypted_page = NULL, }; if (unlikely(!is_meta)) - fio.rw &= ~REQ_META; + fio.op_flags &= ~REQ_META; repeat: - page = grab_cache_page(mapping, index); + page = f2fs_grab_cache_page(mapping, index, false); if (!page) { cond_resched(); goto repeat; @@ -91,7 +101,7 @@ repeat: * meta page. */ if (unlikely(!PageUptodate(page))) - f2fs_stop_checkpoint(sbi); + f2fs_stop_checkpoint(sbi, false); out: return page; } @@ -149,13 +159,14 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, struct f2fs_io_info fio = { .sbi = sbi, .type = META, - .rw = sync ? (READ_SYNC | REQ_META | REQ_PRIO) : READA, + .op = REQ_OP_READ, + .op_flags = sync ? 
(READ_SYNC | REQ_META | REQ_PRIO) : REQ_RAHEAD, .encrypted_page = NULL, }; struct blk_plug plug; if (unlikely(type == META_POR)) - fio.rw &= ~REQ_META; + fio.op_flags &= ~REQ_META; blk_start_plug(&plug); for (; nrpages-- > 0; blkno++) { @@ -186,7 +197,8 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, BUG(); } - page = grab_cache_page(META_MAPPING(sbi), fio.new_blkaddr); + page = f2fs_grab_cache_page(META_MAPPING(sbi), + fio.new_blkaddr, false); if (!page) continue; if (PageUptodate(page)) { @@ -211,7 +223,7 @@ void ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index) bool readahead = false; page = find_get_page(META_MAPPING(sbi), index); - if (!page || (page && !PageUptodate(page))) + if (!page || !PageUptodate(page)) readahead = true; f2fs_put_page(page, 0); @@ -255,6 +267,7 @@ static int f2fs_write_meta_pages(struct address_space *mapping, struct writeback_control *wbc) { struct f2fs_sb_info *sbi = F2FS_M_SB(mapping); + struct blk_plug plug; long diff, written; /* collect a number of dirty meta pages and write together */ @@ -267,7 +280,9 @@ static int f2fs_write_meta_pages(struct address_space *mapping, /* if mounting is failed, skip writing node pages */ mutex_lock(&sbi->cp_mutex); diff = nr_pages_to_write(sbi, META, wbc); + blk_start_plug(&plug); written = sync_meta_pages(sbi, META, wbc->nr_to_write); + blk_finish_plug(&plug); mutex_unlock(&sbi->cp_mutex); wbc->nr_to_write = max((long)0, wbc->nr_to_write - written - diff); return 0; @@ -355,9 +370,10 @@ static int f2fs_set_meta_page_dirty(struct page *page) { trace_f2fs_set_page_dirty(page, META); - SetPageUptodate(page); + if (!PageUptodate(page)) + SetPageUptodate(page); if (!PageDirty(page)) { - __set_page_dirty_nobuffers(page); + f2fs_set_page_dirty_nobuffers(page); inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_META); SetPagePrivate(page); f2fs_trace_pid(page); @@ -448,12 +464,12 @@ bool exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode) return e ? true : false; } -void release_ino_entry(struct f2fs_sb_info *sbi) +void release_ino_entry(struct f2fs_sb_info *sbi, bool all) { struct ino_entry *e, *tmp; int i; - for (i = APPEND_INO; i <= UPDATE_INO; i++) { + for (i = all ? ORPHAN_INO: APPEND_INO; i <= UPDATE_INO; i++) { struct inode_management *im = &sbi->im[i]; spin_lock(&im->ino_lock); @@ -473,6 +489,13 @@ int acquire_orphan_inode(struct f2fs_sb_info *sbi) int err = 0; spin_lock(&im->ino_lock); + +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(FAULT_ORPHAN)) { + spin_unlock(&im->ino_lock); + return -ENOSPC; + } +#endif if (unlikely(im->ino_num >= sbi->max_orphans)) err = -ENOSPC; else @@ -492,10 +515,11 @@ void release_orphan_inode(struct f2fs_sb_info *sbi) spin_unlock(&im->ino_lock); } -void add_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) +void add_orphan_inode(struct inode *inode) { /* add new orphan ino entry into list */ - __add_ino_entry(sbi, ino, ORPHAN_INO); + __add_ino_entry(F2FS_I_SB(inode), inode->i_ino, ORPHAN_INO); + update_inode_page(inode); } void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) @@ -743,28 +767,25 @@ fail_no_cp: static void __add_dirty_inode(struct inode *inode, enum inode_type type) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct f2fs_inode_info *fi = F2FS_I(inode); int flag = (type == DIR_INODE) ? 
FI_DIRTY_DIR : FI_DIRTY_FILE; - if (is_inode_flag_set(fi, flag)) + if (is_inode_flag_set(inode, flag)) return; - set_inode_flag(fi, flag); - list_add_tail(&fi->dirty_list, &sbi->inode_list[type]); + set_inode_flag(inode, flag); + list_add_tail(&F2FS_I(inode)->dirty_list, &sbi->inode_list[type]); stat_inc_dirty_inode(sbi, type); } static void __remove_dirty_inode(struct inode *inode, enum inode_type type) { - struct f2fs_inode_info *fi = F2FS_I(inode); int flag = (type == DIR_INODE) ? FI_DIRTY_DIR : FI_DIRTY_FILE; - if (get_dirty_pages(inode) || - !is_inode_flag_set(F2FS_I(inode), flag)) + if (get_dirty_pages(inode) || !is_inode_flag_set(inode, flag)) return; - list_del_init(&fi->dirty_list); - clear_inode_flag(fi, flag); + list_del_init(&F2FS_I(inode)->dirty_list); + clear_inode_flag(inode, flag); stat_dec_dirty_inode(F2FS_I_SB(inode), type); } @@ -778,7 +799,8 @@ void update_dirty_page(struct inode *inode, struct page *page) return; spin_lock(&sbi->inode_lock[type]); - __add_dirty_inode(inode, type); + if (type != FILE_INODE || test_opt(sbi, DATA_FLUSH)) + __add_dirty_inode(inode, type); inode_inc_dirty_pages(inode); spin_unlock(&sbi->inode_lock[type]); @@ -786,34 +808,21 @@ void update_dirty_page(struct inode *inode, struct page *page) f2fs_trace_pid(page); } -void add_dirty_dir_inode(struct inode *inode) -{ - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - - spin_lock(&sbi->inode_lock[DIR_INODE]); - __add_dirty_inode(inode, DIR_INODE); - spin_unlock(&sbi->inode_lock[DIR_INODE]); -} - void remove_dirty_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct f2fs_inode_info *fi = F2FS_I(inode); enum inode_type type = S_ISDIR(inode->i_mode) ? DIR_INODE : FILE_INODE; if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode) && !S_ISLNK(inode->i_mode)) return; + if (type == FILE_INODE && !test_opt(sbi, DATA_FLUSH)) + return; + spin_lock(&sbi->inode_lock[type]); __remove_dirty_inode(inode, type); spin_unlock(&sbi->inode_lock[type]); - - /* Only from the recovery routine */ - if (is_inode_flag_set(fi, FI_DELAY_IPUT)) { - clear_inode_flag(fi, FI_DELAY_IPUT); - iput(inode); - } } int sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type) @@ -857,6 +866,34 @@ retry: goto retry; } +int f2fs_sync_inode_meta(struct f2fs_sb_info *sbi) +{ + struct list_head *head = &sbi->inode_list[DIRTY_META]; + struct inode *inode; + struct f2fs_inode_info *fi; + s64 total = get_pages(sbi, F2FS_DIRTY_IMETA); + + while (total--) { + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + + spin_lock(&sbi->inode_lock[DIRTY_META]); + if (list_empty(head)) { + spin_unlock(&sbi->inode_lock[DIRTY_META]); + return 0; + } + fi = list_entry(head->next, struct f2fs_inode_info, + gdirty_list); + inode = igrab(&fi->vfs_inode); + spin_unlock(&sbi->inode_lock[DIRTY_META]); + if (inode) { + update_inode_page(inode); + iput(inode); + } + }; + return 0; +} + /* * Freeze all the FS-operations for checkpoint. */ @@ -883,6 +920,14 @@ retry_flush_dents: goto retry_flush_dents; } + if (get_pages(sbi, F2FS_DIRTY_IMETA)) { + f2fs_unlock_all(sbi); + err = f2fs_sync_inode_meta(sbi); + if (err) + goto out; + goto retry_flush_dents; + } + /* * POR: we should ensure that there are no dirty node pages * until finishing nat/sit flush. 
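
The acquire_orphan_inode() hunk above is the first consumer of the new CONFIG_F2FS_FAULT_INJECTION facility: when time_to_inject(FAULT_ORPHAN) fires, the function drops im->ino_lock and returns -ENOSPC as if the orphan table were exhausted. Below is a minimal user-space sketch of that counter-based injector, assuming simplified rate semantics; the kernel version (added to f2fs.h later in this diff) additionally filters each fault class through a per-type enable bitmask.

/* Illustrative sketch only -- not part of this diff. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* shaped like struct f2fs_fault_info from f2fs.h below */
struct fault_info {
	atomic_int inject_ops;		/* ops seen since the last injected fault */
	int inject_rate;		/* inject one fault per inject_rate ops */
	unsigned int inject_type;	/* bitmask of enabled fault classes */
};

static struct fault_info fault = { .inject_rate = 4, .inject_type = ~0u };

/* counterpart of time_to_inject(): true when the caller should fail */
static bool time_to_inject(int type)
{
	if (!fault.inject_rate || !(fault.inject_type & (1u << type)))
		return false;
	if (atomic_fetch_add(&fault.inject_ops, 1) + 1 >= fault.inject_rate) {
		atomic_store(&fault.inject_ops, 0);
		return true;
	}
	return false;
}

int main(void)
{
	for (int i = 0; i < 10; i++)	/* every 4th guarded op fails */
		printf("op %d -> %s\n", i, time_to_inject(0) ? "fault" : "ok");
	return 0;
}

Every guarded call site pays one atomic increment; the fault triggers on each inject_rate-th operation and the counter resets, which keeps injected failures periodic and reproducible under load.
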
@@ -892,7 +937,7 @@ retry_flush_nodes: if (get_pages(sbi, F2FS_DIRTY_NODES)) { up_write(&sbi->node_write); - err = sync_node_pages(sbi, 0, &wbc); + err = sync_node_pages(sbi, &wbc); if (err) { f2fs_unlock_all(sbi); goto out; @@ -907,6 +952,8 @@ out: static void unblock_operations(struct f2fs_sb_info *sbi) { up_write(&sbi->node_write); + + build_free_nids(sbi); f2fs_unlock_all(sbi); } @@ -917,7 +964,7 @@ static void wait_on_all_pages_writeback(struct f2fs_sb_info *sbi) for (;;) { prepare_to_wait(&sbi->cp_wait, &wait, TASK_UNINTERRUPTIBLE); - if (!get_pages(sbi, F2FS_WRITEBACK)) + if (!atomic_read(&sbi->nr_wb_bios)) break; io_schedule_timeout(5*HZ); @@ -947,7 +994,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) * This avoids to conduct wrong roll-forward operations and uses * metapages, so should be called prior to sync_meta_pages below. */ - if (discard_next_dnode(sbi, discard_blk)) + if (!test_opt(sbi, LFS) && discard_next_dnode(sbi, discard_blk)) invalidate = true; /* Flush all the NAT/SIT pages */ @@ -1082,7 +1129,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* update user_block_counts */ sbi->last_valid_block_count = sbi->total_valid_block_count; - sbi->alloc_valid_block_count = 0; + percpu_counter_set(&sbi->alloc_valid_block_count, 0); /* Here, we only have one bio having CP pack */ sync_meta_pages(sbi, META_FLUSH, LONG_MAX); @@ -1098,7 +1145,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) invalidate_mapping_pages(META_MAPPING(sbi), discard_blk, discard_blk); - release_ino_entry(sbi); + release_ino_entry(sbi, false); if (unlikely(f2fs_cp_error(sbi))) return -EIO; diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index bb376c3bca62..e2624275d828 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -19,6 +19,8 @@ #include <linux/bio.h> #include <linux/prefetch.h> #include <linux/uio.h> +#include <linux/mm.h> +#include <linux/memcontrol.h> #include <linux/cleancache.h> #include "f2fs.h" @@ -45,7 +47,8 @@ static void f2fs_read_end_io(struct bio *bio) struct page *page = bvec->bv_page; if (!bio->bi_error) { - SetPageUptodate(page); + if (!PageUptodate(page)) + SetPageUptodate(page); } else { ClearPageUptodate(page); SetPageError(page); @@ -68,13 +71,12 @@ static void f2fs_write_end_io(struct bio *bio) if (unlikely(bio->bi_error)) { set_bit(AS_EIO, &page->mapping->flags); - f2fs_stop_checkpoint(sbi); + f2fs_stop_checkpoint(sbi, true); } end_page_writeback(page); - dec_page_count(sbi, F2FS_WRITEBACK); } - - if (!get_pages(sbi, F2FS_WRITEBACK) && wq_has_sleeper(&sbi->cp_wait)) + if (atomic_dec_and_test(&sbi->nr_wb_bios) && + wq_has_sleeper(&sbi->cp_wait)) wake_up(&sbi->cp_wait); bio_put(bio); @@ -98,6 +100,18 @@ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr, return bio; } +static inline void __submit_bio(struct f2fs_sb_info *sbi, + struct bio *bio, enum page_type type) +{ + if (!is_read_io(bio_op(bio))) { + atomic_inc(&sbi->nr_wb_bios); + if (f2fs_sb_mounted_hmsmr(sbi->sb) && + current->plug && (type == DATA || type == NODE)) + blk_finish_plug(current->plug); + } + submit_bio(bio); +} + static void __submit_merged_bio(struct f2fs_bio_info *io) { struct f2fs_io_info *fio = &io->fio; @@ -105,12 +119,14 @@ static void __submit_merged_bio(struct f2fs_bio_info *io) if (!io->bio) return; - if (is_read_io(fio->rw)) + if (is_read_io(fio->op)) trace_f2fs_submit_read_bio(io->sbi->sb, fio, io->bio); else trace_f2fs_submit_write_bio(io->sbi->sb, fio, io->bio); - submit_bio(fio->rw, 
io->bio); + bio_set_op_attrs(io->bio, fio->op, fio->op_flags); + + __submit_bio(io->sbi, io->bio, fio->type); io->bio = NULL; } @@ -176,10 +192,12 @@ static void __f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, /* change META to META_FLUSH in the checkpoint procedure */ if (type >= META_FLUSH) { io->fio.type = META_FLUSH; + io->fio.op = REQ_OP_WRITE; if (test_opt(sbi, NOBARRIER)) - io->fio.rw = WRITE_FLUSH | REQ_META | REQ_PRIO; + io->fio.op_flags = WRITE_FLUSH | REQ_META | REQ_PRIO; else - io->fio.rw = WRITE_FLUSH_FUA | REQ_META | REQ_PRIO; + io->fio.op_flags = WRITE_FLUSH_FUA | REQ_META | + REQ_PRIO; } __submit_merged_bio(io); out: @@ -221,14 +239,16 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) f2fs_trace_ios(fio, 0); /* Allocate a new bio */ - bio = __bio_alloc(fio->sbi, fio->new_blkaddr, 1, is_read_io(fio->rw)); + bio = __bio_alloc(fio->sbi, fio->new_blkaddr, 1, is_read_io(fio->op)); if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { bio_put(bio); return -EFAULT; } + bio->bi_rw = fio->op_flags; + bio_set_op_attrs(bio, fio->op, fio->op_flags); - submit_bio(fio->rw, bio); + __submit_bio(fio->sbi, bio, fio->type); return 0; } @@ -237,7 +257,7 @@ void f2fs_submit_page_mbio(struct f2fs_io_info *fio) struct f2fs_sb_info *sbi = fio->sbi; enum page_type btype = PAGE_TYPE_OF_BIO(fio->type); struct f2fs_bio_info *io; - bool is_read = is_read_io(fio->rw); + bool is_read = is_read_io(fio->op); struct page *bio_page; io = is_read ? &sbi->read_io : &sbi->write_io[btype]; @@ -248,11 +268,8 @@ void f2fs_submit_page_mbio(struct f2fs_io_info *fio) down_write(&io->io_rwsem); - if (!is_read) - inc_page_count(sbi, F2FS_WRITEBACK); - if (io->bio && (io->last_block_in_bio != fio->new_blkaddr - 1 || - io->fio.rw != fio->rw)) + (io->fio.op != fio->op || io->fio.op_flags != fio->op_flags))) __submit_merged_bio(io); alloc_new: if (io->bio == NULL) { @@ -278,6 +295,16 @@ alloc_new: trace_f2fs_submit_page_mbio(fio->page, fio); } +static void __set_data_blkaddr(struct dnode_of_data *dn) +{ + struct f2fs_node *rn = F2FS_NODE(dn->node_page); + __le32 *addr_array; + + /* Get physical address of data block */ + addr_array = blkaddr_in_node(rn); + addr_array[dn->ofs_in_node] = cpu_to_le32(dn->data_blkaddr); +} + /* * Lock ordering for the change of data block address: * ->data_page @@ -286,19 +313,9 @@ alloc_new: */ void set_data_blkaddr(struct dnode_of_data *dn) { - struct f2fs_node *rn; - __le32 *addr_array; - struct page *node_page = dn->node_page; - unsigned int ofs_in_node = dn->ofs_in_node; - - f2fs_wait_on_page_writeback(node_page, NODE, true); - - rn = F2FS_NODE(node_page); - - /* Get physical address of data block */ - addr_array = blkaddr_in_node(rn); - addr_array[ofs_in_node] = cpu_to_le32(dn->data_blkaddr); - if (set_page_dirty(node_page)) + f2fs_wait_on_page_writeback(dn->node_page, NODE, true); + __set_data_blkaddr(dn); + if (set_page_dirty(dn->node_page)) dn->node_changed = true; } @@ -309,24 +326,50 @@ void f2fs_update_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr) f2fs_update_extent_cache(dn); } -int reserve_new_block(struct dnode_of_data *dn) +/* dn->ofs_in_node will be returned with up-to-date last block pointer */ +int reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count) { struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); - if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC))) + if (!count) + return 0; + + if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC))) return -EPERM; - if (unlikely(!inc_valid_block_count(sbi, dn->inode, 1))) + if 
(unlikely(!inc_valid_block_count(sbi, dn->inode, &count))) return -ENOSPC; - trace_f2fs_reserve_new_block(dn->inode, dn->nid, dn->ofs_in_node); + trace_f2fs_reserve_new_blocks(dn->inode, dn->nid, + dn->ofs_in_node, count); - dn->data_blkaddr = NEW_ADDR; - set_data_blkaddr(dn); - mark_inode_dirty(dn->inode); - sync_inode_page(dn); + f2fs_wait_on_page_writeback(dn->node_page, NODE, true); + + for (; count > 0; dn->ofs_in_node++) { + block_t blkaddr = + datablock_addr(dn->node_page, dn->ofs_in_node); + if (blkaddr == NULL_ADDR) { + dn->data_blkaddr = NEW_ADDR; + __set_data_blkaddr(dn); + count--; + } + } + + if (set_page_dirty(dn->node_page)) + dn->node_changed = true; return 0; } +/* Should keep dn->ofs_in_node unchanged */ +int reserve_new_block(struct dnode_of_data *dn) +{ + unsigned int ofs_in_node = dn->ofs_in_node; + int ret; + + ret = reserve_new_blocks(dn, 1); + dn->ofs_in_node = ofs_in_node; + return ret; +} + int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index) { bool need_put = dn->inode_page ? false : true; @@ -357,7 +400,7 @@ int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index) } struct page *get_read_data_page(struct inode *inode, pgoff_t index, - int rw, bool for_write) + int op_flags, bool for_write) { struct address_space *mapping = inode->i_mapping; struct dnode_of_data dn; @@ -367,7 +410,8 @@ struct page *get_read_data_page(struct inode *inode, pgoff_t index, struct f2fs_io_info fio = { .sbi = F2FS_I_SB(inode), .type = DATA, - .rw = rw, + .op = REQ_OP_READ, + .op_flags = op_flags, .encrypted_page = NULL, }; @@ -407,7 +451,8 @@ got_it: */ if (dn.data_blkaddr == NEW_ADDR) { zero_user_segment(page, 0, PAGE_SIZE); - SetPageUptodate(page); + if (!PageUptodate(page)) + SetPageUptodate(page); unlock_page(page); return page; } @@ -466,14 +511,14 @@ repeat: /* wait for read completion */ lock_page(page); - if (unlikely(!PageUptodate(page))) { - f2fs_put_page(page, 1); - return ERR_PTR(-EIO); - } if (unlikely(page->mapping != mapping)) { f2fs_put_page(page, 1); goto repeat; } + if (unlikely(!PageUptodate(page))) { + f2fs_put_page(page, 1); + return ERR_PTR(-EIO); + } return page; } @@ -518,7 +563,8 @@ struct page *get_new_data_page(struct inode *inode, if (dn.data_blkaddr == NEW_ADDR) { zero_user_segment(page, 0, PAGE_SIZE); - SetPageUptodate(page); + if (!PageUptodate(page)) + SetPageUptodate(page); } else { f2fs_put_page(page, 1); @@ -530,11 +576,8 @@ struct page *get_new_data_page(struct inode *inode, } got_it: if (new_i_size && i_size_read(inode) < - ((loff_t)(index + 1) << PAGE_SHIFT)) { - i_size_write(inode, ((loff_t)(index + 1) << PAGE_SHIFT)); - /* Only the directory inode sets new_i_size */ - set_inode_flag(F2FS_I(inode), FI_UPDATE_DIR); - } + ((loff_t)(index + 1) << PAGE_SHIFT)) + f2fs_i_size_write(inode, ((loff_t)(index + 1) << PAGE_SHIFT)); return page; } @@ -545,15 +588,16 @@ static int __allocate_data_block(struct dnode_of_data *dn) struct node_info ni; int seg = CURSEG_WARM_DATA; pgoff_t fofs; + blkcnt_t count = 1; - if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC))) + if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC))) return -EPERM; dn->data_blkaddr = datablock_addr(dn->node_page, dn->ofs_in_node); if (dn->data_blkaddr == NEW_ADDR) goto alloc; - if (unlikely(!inc_valid_block_count(sbi, dn->inode, 1))) + if (unlikely(!inc_valid_block_count(sbi, dn->inode, &count))) return -ENOSPC; alloc: @@ -571,7 +615,7 @@ alloc: fofs = start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) + dn->ofs_in_node; if (i_size_read(dn->inode) < 
((loff_t)(fofs + 1) << PAGE_SHIFT)) - i_size_write(dn->inode, + f2fs_i_size_write(dn->inode, ((loff_t)(fofs + 1) << PAGE_SHIFT)); return 0; } @@ -582,8 +626,8 @@ ssize_t f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) struct f2fs_map_blocks map; ssize_t ret = 0; - map.m_lblk = F2FS_BYTES_TO_BLK(iocb->ki_pos); - map.m_len = F2FS_BLK_ALIGN(iov_iter_count(from)); + map.m_lblk = F2FS_BLK_ALIGN(iocb->ki_pos); + map.m_len = F2FS_BYTES_TO_BLK(iov_iter_count(from)); map.m_next_pgofs = NULL; if (f2fs_encrypted_inode(inode)) @@ -620,9 +664,11 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, unsigned int maxblocks = map->m_len; struct dnode_of_data dn; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - int mode = create ? ALLOC_NODE : LOOKUP_NODE_RA; - pgoff_t pgofs, end_offset; + int mode = create ? ALLOC_NODE : LOOKUP_NODE; + pgoff_t pgofs, end_offset, end; int err = 0, ofs = 1; + unsigned int ofs_in_node, last_ofs_in_node; + blkcnt_t prealloc; struct extent_info ei; bool allocated = false; block_t blkaddr; @@ -632,6 +678,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, /* it only supports block size == page size */ pgofs = (pgoff_t)map->m_lblk; + end = pgofs + maxblocks; if (!create && f2fs_lookup_extent_cache(inode, pgofs, &ei)) { map->m_pblk = ei.blk + pgofs - ei.fofs; @@ -648,6 +695,8 @@ next_dnode: set_new_dnode(&dn, inode, NULL, NULL, 0); err = get_dnode_of_data(&dn, pgofs, mode); if (err) { + if (flag == F2FS_GET_BLOCK_BMAP) + map->m_pblk = 0; if (err == -ENOENT) { err = 0; if (map->m_next_pgofs) @@ -657,6 +706,8 @@ next_dnode: goto unlock_out; } + prealloc = 0; + ofs_in_node = dn.ofs_in_node; end_offset = ADDRS_PER_PAGE(dn.node_page, inode); next_block: @@ -669,31 +720,40 @@ next_block: goto sync_out; } if (flag == F2FS_GET_BLOCK_PRE_AIO) { - if (blkaddr == NULL_ADDR) - err = reserve_new_block(&dn); + if (blkaddr == NULL_ADDR) { + prealloc++; + last_ofs_in_node = dn.ofs_in_node; + } } else { err = __allocate_data_block(&dn); + if (!err) { + set_inode_flag(inode, FI_APPEND_WRITE); + allocated = true; + } } if (err) goto sync_out; - allocated = true; map->m_flags = F2FS_MAP_NEW; blkaddr = dn.data_blkaddr; } else { + if (flag == F2FS_GET_BLOCK_BMAP) { + map->m_pblk = 0; + goto sync_out; + } if (flag == F2FS_GET_BLOCK_FIEMAP && blkaddr == NULL_ADDR) { if (map->m_next_pgofs) *map->m_next_pgofs = pgofs + 1; } if (flag != F2FS_GET_BLOCK_FIEMAP || - blkaddr != NEW_ADDR) { - if (flag == F2FS_GET_BLOCK_BMAP) - err = -ENOENT; + blkaddr != NEW_ADDR) goto sync_out; - } } } + if (flag == F2FS_GET_BLOCK_PRE_AIO) + goto skip; + if (map->m_len == 0) { /* preallocated unwritten block should be mapped for fiemap. 
*/ if (blkaddr == NEW_ADDR) @@ -705,36 +765,49 @@ next_block: } else if ((map->m_pblk != NEW_ADDR && blkaddr == (map->m_pblk + ofs)) || (map->m_pblk == NEW_ADDR && blkaddr == NEW_ADDR) || - flag == F2FS_GET_BLOCK_PRE_DIO || - flag == F2FS_GET_BLOCK_PRE_AIO) { + flag == F2FS_GET_BLOCK_PRE_DIO) { ofs++; map->m_len++; } else { goto sync_out; } +skip: dn.ofs_in_node++; pgofs++; - if (map->m_len < maxblocks) { - if (dn.ofs_in_node < end_offset) - goto next_block; + /* preallocate blocks in batch for one dnode page */ + if (flag == F2FS_GET_BLOCK_PRE_AIO && + (pgofs == end || dn.ofs_in_node == end_offset)) { - if (allocated) - sync_inode_page(&dn); - f2fs_put_dnode(&dn); + dn.ofs_in_node = ofs_in_node; + err = reserve_new_blocks(&dn, prealloc); + if (err) + goto sync_out; - if (create) { - f2fs_unlock_op(sbi); - f2fs_balance_fs(sbi, allocated); + map->m_len += dn.ofs_in_node - ofs_in_node; + if (prealloc && dn.ofs_in_node != last_ofs_in_node + 1) { + err = -ENOSPC; + goto sync_out; } - allocated = false; - goto next_dnode; + dn.ofs_in_node = end_offset; + } + + if (pgofs >= end) + goto sync_out; + else if (dn.ofs_in_node < end_offset) + goto next_block; + + f2fs_put_dnode(&dn); + + if (create) { + f2fs_unlock_op(sbi); + f2fs_balance_fs(sbi, allocated); } + allocated = false; + goto next_dnode; sync_out: - if (allocated) - sync_inode_page(&dn); f2fs_put_dnode(&dn); unlock_out: if (create) { @@ -894,6 +967,37 @@ out: return ret; } +struct bio *f2fs_grab_bio(struct inode *inode, block_t blkaddr, + unsigned nr_pages) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct fscrypt_ctx *ctx = NULL; + struct block_device *bdev = sbi->sb->s_bdev; + struct bio *bio; + + if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) { + ctx = fscrypt_get_ctx(inode, GFP_NOFS); + if (IS_ERR(ctx)) + return ERR_CAST(ctx); + + /* wait the page to be moved by cleaning */ + f2fs_wait_on_encrypted_page_writeback(sbi, blkaddr); + } + + bio = bio_alloc(GFP_KERNEL, min_t(int, nr_pages, BIO_MAX_PAGES)); + if (!bio) { + if (ctx) + fscrypt_release_ctx(ctx); + return ERR_PTR(-ENOMEM); + } + bio->bi_bdev = bdev; + bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blkaddr); + bio->bi_end_io = f2fs_read_end_io; + bio->bi_private = ctx; + + return bio; +} + /* * This function was originally taken from fs/mpage.c, and customized for f2fs. * Major change was from block_size == page_size in f2fs by default. 
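
The reworked f2fs_map_blocks() above no longer reserves each hole as it walks a dnode page: under F2FS_GET_BLOCK_PRE_AIO it only counts NULL_ADDR slots (prealloc++, remembering last_ofs_in_node) and issues a single reserve_new_blocks() call once it reaches the end of the request or of the dnode page, so the valid-block quota is charged once per batch rather than once per block. A rough user-space sketch of that two-pass pattern follows; the helper name mirrors the patch, while the quota check and error code are stand-ins.

/* Illustrative sketch only -- not part of this diff. */
#include <stdio.h>

#define NULL_ADDR	0u		/* hole: no block assigned yet */
#define NEW_ADDR	~0u		/* reserved but not yet written */

/* one quota charge for the whole batch, like inc_valid_block_count() */
static int reserve_new_blocks(unsigned int *addrs, int nslots, int count)
{
	for (int i = 0; i < nslots && count > 0; i++) {
		if (addrs[i] == NULL_ADDR) {
			addrs[i] = NEW_ADDR;
			count--;
		}
	}
	return count ? -1 : 0;		/* -1 stands in for -ENOSPC */
}

int main(void)
{
	/* one "dnode page" worth of block-address slots */
	unsigned int addrs[8] = { 5, NULL_ADDR, NULL_ADDR, 9,
				  NULL_ADDR, 7, 8, NULL_ADDR };
	int prealloc = 0;

	for (int i = 0; i < 8; i++)	/* pass 1: just count the holes */
		if (addrs[i] == NULL_ADDR)
			prealloc++;

	if (reserve_new_blocks(addrs, 8, prealloc))	/* pass 2: one call */
		return 1;
	for (int i = 0; i < 8; i++)
		printf("slot %d: %#x\n", i, addrs[i]);
	return 0;
}

Batching also lets the caller detect a short reservation: the kernel hunk compares dn.ofs_in_node against last_ofs_in_node + 1 afterwards and fails the whole range with -ENOSPC if fewer blocks were reserved than counted.
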
@@ -912,7 +1016,6 @@ static int f2fs_mpage_readpages(struct address_space *mapping, sector_t last_block; sector_t last_block_in_file; sector_t block_nr; - struct block_device *bdev = inode->i_sb->s_bdev; struct f2fs_map_blocks map; map.m_pblk = 0; @@ -928,7 +1031,8 @@ static int f2fs_mpage_readpages(struct address_space *mapping, page = list_entry(pages->prev, struct page, lru); list_del(&page->lru); if (add_to_page_cache_lru(page, mapping, - page->index, GFP_KERNEL)) + page->index, + readahead_gfp_mask(mapping))) goto next_page; } @@ -972,7 +1076,8 @@ got_it: } } else { zero_user_segment(page, 0, PAGE_SIZE); - SetPageUptodate(page); + if (!PageUptodate(page)) + SetPageUptodate(page); unlock_page(page); goto next_page; } @@ -983,35 +1088,16 @@ got_it: */ if (bio && (last_block_in_bio != block_nr - 1)) { submit_and_realloc: - submit_bio(READ, bio); + __submit_bio(F2FS_I_SB(inode), bio, DATA); bio = NULL; } if (bio == NULL) { - struct fscrypt_ctx *ctx = NULL; - - if (f2fs_encrypted_inode(inode) && - S_ISREG(inode->i_mode)) { - - ctx = fscrypt_get_ctx(inode, GFP_NOFS); - if (IS_ERR(ctx)) - goto set_error_page; - - /* wait the page to be moved by cleaning */ - f2fs_wait_on_encrypted_page_writeback( - F2FS_I_SB(inode), block_nr); - } - - bio = bio_alloc(GFP_KERNEL, - min_t(int, nr_pages, BIO_MAX_PAGES)); - if (!bio) { - if (ctx) - fscrypt_release_ctx(ctx); + bio = f2fs_grab_bio(inode, block_nr, nr_pages); + if (IS_ERR(bio)) { + bio = NULL; goto set_error_page; } - bio->bi_bdev = bdev; - bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(block_nr); - bio->bi_end_io = f2fs_read_end_io; - bio->bi_private = ctx; + bio_set_op_attrs(bio, REQ_OP_READ, 0); } if (bio_add_page(bio, page, blocksize, 0) < blocksize) @@ -1026,7 +1112,7 @@ set_error_page: goto next_page; confused: if (bio) { - submit_bio(READ, bio); + __submit_bio(F2FS_I_SB(inode), bio, DATA); bio = NULL; } unlock_page(page); @@ -1036,7 +1122,7 @@ next_page: } BUG_ON(pages && !list_empty(pages)); if (bio) - submit_bio(READ, bio); + __submit_bio(F2FS_I_SB(inode), bio, DATA); return 0; } @@ -1125,14 +1211,14 @@ retry_encrypt: !IS_ATOMIC_WRITTEN_PAGE(page) && need_inplace_update(inode))) { rewrite_data_page(fio); - set_inode_flag(F2FS_I(inode), FI_UPDATE_WRITE); + set_inode_flag(inode, FI_UPDATE_WRITE); trace_f2fs_do_write_data_page(page, IPU); } else { write_data_page(&dn, fio); trace_f2fs_do_write_data_page(page, OPU); - set_inode_flag(F2FS_I(inode), FI_APPEND_WRITE); + set_inode_flag(inode, FI_APPEND_WRITE); if (page->index == 0) - set_inode_flag(F2FS_I(inode), FI_FIRST_BLOCK_WRITTEN); + set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN); } out_writepage: f2fs_put_dnode(&dn); @@ -1147,13 +1233,15 @@ static int f2fs_write_data_page(struct page *page, loff_t i_size = i_size_read(inode); const pgoff_t end_index = ((unsigned long long) i_size) >> PAGE_SHIFT; + loff_t psize = (page->index + 1) << PAGE_SHIFT; unsigned offset = 0; bool need_balance_fs = false; int err = 0; struct f2fs_io_info fio = { .sbi = sbi, .type = DATA, - .rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE, + .op = REQ_OP_WRITE, + .op_flags = (wbc->sync_mode == WB_SYNC_ALL) ? 
WRITE_SYNC : 0, .page = page, .encrypted_page = NULL, }; @@ -1177,24 +1265,24 @@ write: goto redirty_out; if (f2fs_is_drop_cache(inode)) goto out; - if (f2fs_is_volatile_file(inode) && !wbc->for_reclaim && - available_free_memory(sbi, BASE_CHECK)) + /* we should not write 0'th page having journal header */ + if (f2fs_is_volatile_file(inode) && (!page->index || + (!wbc->for_reclaim && + available_free_memory(sbi, BASE_CHECK)))) goto redirty_out; + /* we should bypass data pages to proceed the kworkder jobs */ + if (unlikely(f2fs_cp_error(sbi))) { + mapping_set_error(page->mapping, -EIO); + goto out; + } + /* Dentry blocks are controlled by checkpoint */ if (S_ISDIR(inode->i_mode)) { - if (unlikely(f2fs_cp_error(sbi))) - goto redirty_out; err = do_write_data_page(&fio); goto done; } - /* we should bypass data pages to proceed the kworkder jobs */ - if (unlikely(f2fs_cp_error(sbi))) { - SetPageError(page); - goto out; - } - if (!wbc->for_reclaim) need_balance_fs = true; else if (has_not_enough_free_secs(sbi, 0)) @@ -1206,6 +1294,8 @@ write: err = f2fs_write_inline_data(inode, page); if (err == -EAGAIN) err = do_write_data_page(&fio); + if (F2FS_I(inode)->last_disk_size < psize) + F2FS_I(inode)->last_disk_size = psize; f2fs_unlock_op(sbi); done: if (err && err != -ENOENT) @@ -1232,16 +1322,8 @@ out: redirty_out: redirty_page_for_writepage(wbc, page); - return AOP_WRITEPAGE_ACTIVATE; -} - -static int __f2fs_writepage(struct page *page, struct writeback_control *wbc, - void *data) -{ - struct address_space *mapping = data; - int ret = mapping->a_ops->writepage(page, wbc); - mapping_set_error(mapping, ret); - return ret; + unlock_page(page); + return err; } /* @@ -1250,8 +1332,7 @@ static int __f2fs_writepage(struct page *page, struct writeback_control *wbc, * warm/hot data page. 
*/ static int f2fs_write_cache_pages(struct address_space *mapping, - struct writeback_control *wbc, writepage_t writepage, - void *data) + struct writeback_control *wbc) { int ret = 0; int done = 0; @@ -1264,10 +1345,9 @@ static int f2fs_write_cache_pages(struct address_space *mapping, int cycled; int range_whole = 0; int tag; - int step = 0; pagevec_init(&pvec, 0); -next: + if (wbc->range_cyclic) { writeback_index = mapping->writeback_index; /* prev offset */ index = writeback_index; @@ -1322,9 +1402,6 @@ continue_unlock: goto continue_unlock; } - if (step == is_cold_data(page)) - goto continue_unlock; - if (PageWriteback(page)) { if (wbc->sync_mode != WB_SYNC_NONE) f2fs_wait_on_page_writeback(page, @@ -1337,16 +1414,11 @@ continue_unlock: if (!clear_page_dirty_for_io(page)) goto continue_unlock; - ret = (*writepage)(page, wbc, data); + ret = mapping->a_ops->writepage(page, wbc); if (unlikely(ret)) { - if (ret == AOP_WRITEPAGE_ACTIVATE) { - unlock_page(page); - ret = 0; - } else { - done_index = page->index + 1; - done = 1; - break; - } + done_index = page->index + 1; + done = 1; + break; } if (--wbc->nr_to_write <= 0 && @@ -1359,11 +1431,6 @@ continue_unlock: cond_resched(); } - if (step < 1) { - step++; - goto next; - } - if (!cycled && !done) { cycled = 1; index = 0; @@ -1381,9 +1448,8 @@ static int f2fs_write_data_pages(struct address_space *mapping, { struct inode *inode = mapping->host; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - bool locked = false; + struct blk_plug plug; int ret; - long diff; /* deal with chardevs and other special file */ if (!mapping->a_ops->writepage) @@ -1399,7 +1465,7 @@ static int f2fs_write_data_pages(struct address_space *mapping, goto skip_write; /* skip writing during file defragment */ - if (is_inode_flag_set(F2FS_I(inode), FI_DO_DEFRAG)) + if (is_inode_flag_set(inode, FI_DO_DEFRAG)) goto skip_write; /* during POR, we don't need to trigger writepage at all. */ @@ -1408,20 +1474,16 @@ static int f2fs_write_data_pages(struct address_space *mapping, trace_f2fs_writepages(mapping->host, wbc, DATA); - diff = nr_pages_to_write(sbi, DATA, wbc); - - if (!S_ISDIR(inode->i_mode) && wbc->sync_mode == WB_SYNC_ALL) { - mutex_lock(&sbi->writepages); - locked = true; - } - ret = f2fs_write_cache_pages(mapping, wbc, __f2fs_writepage, mapping); - f2fs_submit_merged_bio_cond(sbi, inode, NULL, 0, DATA, WRITE); - if (locked) - mutex_unlock(&sbi->writepages); + blk_start_plug(&plug); + ret = f2fs_write_cache_pages(mapping, wbc); + blk_finish_plug(&plug); + /* + * if some pages were truncated, we cannot guarantee its mapping->host + * to detect pending bios. 
+ */ + f2fs_submit_merged_bio(sbi, DATA, WRITE); remove_dirty_inode(inode); - - wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff); return ret; skip_write: @@ -1479,8 +1541,9 @@ restart: if (f2fs_has_inline_data(inode)) { if (pos + len <= MAX_INLINE_DATA) { read_inline_data(page, ipage); - set_inode_flag(F2FS_I(inode), FI_DATA_EXIST); - set_inline_node(ipage); + set_inode_flag(inode, FI_DATA_EXIST); + if (inode->i_nlink) + set_inline_node(ipage); } else { err = f2fs_convert_inline_page(&dn, page); if (err) @@ -1496,7 +1559,7 @@ restart: } else { /* hole case */ err = get_dnode_of_data(&dn, index, LOOKUP_NODE); - if (err || (!err && dn.data_blkaddr == NULL_ADDR)) { + if (err || dn.data_blkaddr == NULL_ADDR) { f2fs_put_dnode(&dn); f2fs_lock_op(sbi); locked = true; @@ -1588,38 +1651,35 @@ repeat: if (blkaddr == NEW_ADDR) { zero_user_segment(page, 0, PAGE_SIZE); } else { - struct f2fs_io_info fio = { - .sbi = sbi, - .type = DATA, - .rw = READ_SYNC, - .old_blkaddr = blkaddr, - .new_blkaddr = blkaddr, - .page = page, - .encrypted_page = NULL, - }; - err = f2fs_submit_page_bio(&fio); - if (err) - goto fail; + struct bio *bio; - lock_page(page); - if (unlikely(!PageUptodate(page))) { - err = -EIO; + bio = f2fs_grab_bio(inode, blkaddr, 1); + if (IS_ERR(bio)) { + err = PTR_ERR(bio); + goto fail; + } + bio_set_op_attrs(bio, REQ_OP_READ, READ_SYNC); + if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { + bio_put(bio); + err = -EFAULT; goto fail; } + + __submit_bio(sbi, bio, DATA); + + lock_page(page); if (unlikely(page->mapping != mapping)) { f2fs_put_page(page, 1); goto repeat; } - - /* avoid symlink page */ - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) { - err = fscrypt_decrypt_page(page); - if (err) - goto fail; + if (unlikely(!PageUptodate(page))) { + err = -EIO; + goto fail; } } out_update: - SetPageUptodate(page); + if (!PageUptodate(page)) + SetPageUptodate(page); out_clear: clear_cold_data(page); return 0; @@ -1640,13 +1700,11 @@ static int f2fs_write_end(struct file *file, trace_f2fs_write_end(inode, pos, len, copied); set_page_dirty(page); + f2fs_put_page(page, 1); - if (pos + copied > i_size_read(inode)) { - i_size_write(inode, pos + copied); - mark_inode_dirty(inode); - } + if (pos + copied > i_size_read(inode)) + f2fs_i_size_write(inode, pos + copied); - f2fs_put_page(page, 1); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); return copied; } @@ -1671,6 +1729,7 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) struct inode *inode = mapping->host; size_t count = iov_iter_count(iter); loff_t offset = iocb->ki_pos; + int rw = iov_iter_rw(iter); int err; err = check_direct_IO(inode, iter, offset); @@ -1679,14 +1738,23 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) return 0; + if (test_opt(F2FS_I_SB(inode), LFS)) + return 0; - trace_f2fs_direct_IO_enter(inode, offset, count, iov_iter_rw(iter)); + trace_f2fs_direct_IO_enter(inode, offset, count, rw); + down_read(&F2FS_I(inode)->dio_rwsem[rw]); err = blockdev_direct_IO(iocb, inode, iter, get_data_block_dio); - if (err < 0 && iov_iter_rw(iter) == WRITE) - f2fs_write_failed(mapping, offset + count); + up_read(&F2FS_I(inode)->dio_rwsem[rw]); + + if (rw == WRITE) { + if (err > 0) + set_inode_flag(inode, FI_UPDATE_WRITE); + else if (err < 0) + f2fs_write_failed(mapping, offset + count); + } - trace_f2fs_direct_IO_exit(inode, offset, count, iov_iter_rw(iter), err); + trace_f2fs_direct_IO_exit(inode, offset, count, 
rw, err); return err; } @@ -1714,6 +1782,7 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset, if (IS_ATOMIC_WRITTEN_PAGE(page)) return; + set_page_private(page, 0); ClearPagePrivate(page); } @@ -1727,10 +1796,40 @@ int f2fs_release_page(struct page *page, gfp_t wait) if (IS_ATOMIC_WRITTEN_PAGE(page)) return 0; + set_page_private(page, 0); ClearPagePrivate(page); return 1; } +/* + * This was copied from __set_page_dirty_buffers which gives higher performance + * in very high speed storages. (e.g., pmem) + */ +void f2fs_set_page_dirty_nobuffers(struct page *page) +{ + struct address_space *mapping = page->mapping; + unsigned long flags; + + if (unlikely(!mapping)) + return; + + spin_lock(&mapping->private_lock); + lock_page_memcg(page); + SetPageDirty(page); + spin_unlock(&mapping->private_lock); + + spin_lock_irqsave(&mapping->tree_lock, flags); + WARN_ON_ONCE(!PageUptodate(page)); + account_page_dirtied(page, mapping); + radix_tree_tag_set(&mapping->page_tree, + page_index(page), PAGECACHE_TAG_DIRTY); + spin_unlock_irqrestore(&mapping->tree_lock, flags); + unlock_page_memcg(page); + + __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); + return; +} + static int f2fs_set_data_page_dirty(struct page *page) { struct address_space *mapping = page->mapping; @@ -1738,7 +1837,8 @@ static int f2fs_set_data_page_dirty(struct page *page) trace_f2fs_set_page_dirty(page, DATA); - SetPageUptodate(page); + if (!PageUptodate(page)) + SetPageUptodate(page); if (f2fs_is_atomic_file(inode)) { if (!IS_ATOMIC_WRITTEN_PAGE(page)) { @@ -1753,7 +1853,7 @@ static int f2fs_set_data_page_dirty(struct page *page) } if (!PageDirty(page)) { - __set_page_dirty_nobuffers(page); + f2fs_set_page_dirty_nobuffers(page); update_dirty_page(inode, page); return 1; } diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index f4a61a5ff79f..badd407bb622 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -47,8 +47,9 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->ndirty_data = get_pages(sbi, F2FS_DIRTY_DATA); si->ndirty_dirs = sbi->ndirty_inode[DIR_INODE]; si->ndirty_files = sbi->ndirty_inode[FILE_INODE]; + si->ndirty_all = sbi->ndirty_inode[DIRTY_META]; si->inmem_pages = get_pages(sbi, F2FS_INMEM_PAGES); - si->wb_pages = get_pages(sbi, F2FS_WRITEBACK); + si->wb_bios = atomic_read(&sbi->nr_wb_bios); si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg; si->rsvd_segs = reserved_segments(sbi); si->overp_segs = overprovision_segments(sbi); @@ -58,6 +59,7 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->inline_xattr = atomic_read(&sbi->inline_xattr); si->inline_inode = atomic_read(&sbi->inline_inode); si->inline_dir = atomic_read(&sbi->inline_dir); + si->orphans = sbi->im[ORPHAN_INO].ino_num; si->utilization = utilization(sbi); si->free_segs = free_segments(sbi); @@ -143,6 +145,7 @@ static void update_mem_info(struct f2fs_sb_info *sbi) si->base_mem = sizeof(struct f2fs_sb_info) + sbi->sb->s_blocksize; si->base_mem += 2 * sizeof(struct f2fs_inode_info); si->base_mem += sizeof(*sbi->ckpt); + si->base_mem += sizeof(struct percpu_counter) * NR_COUNT_TYPE; /* build sm */ si->base_mem += sizeof(struct f2fs_sm_info); @@ -192,7 +195,7 @@ get_cache: si->cache_mem += NM_I(sbi)->dirty_nat_cnt * sizeof(struct nat_entry_set); si->cache_mem += si->inmem_pages * sizeof(struct inmem_pages); - for (i = 0; i <= UPDATE_INO; i++) + for (i = 0; i <= ORPHAN_INO; i++) si->cache_mem += sbi->im[i].ino_num * sizeof(struct ino_entry); si->cache_mem += atomic_read(&sbi->total_ext_tree) * 
sizeof(struct extent_tree); @@ -216,8 +219,9 @@ static int stat_show(struct seq_file *s, void *v) list_for_each_entry(si, &f2fs_stat_list, stat_list) { update_general_status(si->sbi); - seq_printf(s, "\n=====[ partition info(%pg). #%d ]=====\n", - si->sbi->sb->s_bdev, i++); + seq_printf(s, "\n=====[ partition info(%pg). #%d, %s]=====\n", + si->sbi->sb->s_bdev, i++, + f2fs_readonly(si->sbi->sb) ? "RO": "RW"); seq_printf(s, "[SB: 1] [CP: 2] [SIT: %d] [NAT: %d] ", si->sit_area_segs, si->nat_area_segs); seq_printf(s, "[SSA: %d] [MAIN: %d", @@ -237,6 +241,8 @@ static int stat_show(struct seq_file *s, void *v) si->inline_inode); seq_printf(s, " - Inline_dentry Inode: %u\n", si->inline_dir); + seq_printf(s, " - Orphan Inode: %u\n", + si->orphans); seq_printf(s, "\nMain area: %d segs, %d secs %d zones\n", si->main_area_segs, si->main_area_sections, si->main_area_zones); @@ -295,15 +301,15 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n", si->ext_tree, si->zombie_tree, si->ext_node); seq_puts(s, "\nBalancing F2FS Async:\n"); - seq_printf(s, " - inmem: %4d, wb: %4d\n", - si->inmem_pages, si->wb_pages); - seq_printf(s, " - nodes: %4d in %4d\n", + seq_printf(s, " - inmem: %4lld, wb_bios: %4d\n", + si->inmem_pages, si->wb_bios); + seq_printf(s, " - nodes: %4lld in %4d\n", si->ndirty_node, si->node_pages); - seq_printf(s, " - dents: %4d in dirs:%4d\n", - si->ndirty_dent, si->ndirty_dirs); - seq_printf(s, " - datas: %4d in files:%4d\n", + seq_printf(s, " - dents: %4lld in dirs:%4d (%4d)\n", + si->ndirty_dent, si->ndirty_dirs, si->ndirty_all); + seq_printf(s, " - datas: %4lld in files:%4d\n", si->ndirty_data, si->ndirty_files); - seq_printf(s, " - meta: %4d in %4d\n", + seq_printf(s, " - meta: %4lld in %4d\n", si->ndirty_meta, si->meta_pages); seq_printf(s, " - NATs: %9d/%9d\n - SITs: %9d/%9d\n", si->dirty_nats, si->nats, si->dirty_sits, si->sits); diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 9e4615146d13..a485f68a76b1 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -48,7 +48,6 @@ unsigned char f2fs_filetype_table[F2FS_FT_MAX] = { [F2FS_FT_SYMLINK] = DT_LNK, }; -#define S_SHIFT 12 static unsigned char f2fs_type_by_mode[S_IFMT >> S_SHIFT] = { [S_IFREG >> S_SHIFT] = F2FS_FT_REG_FILE, [S_IFDIR >> S_SHIFT] = F2FS_FT_DIR, @@ -64,6 +63,13 @@ void set_de_type(struct f2fs_dir_entry *de, umode_t mode) de->file_type = f2fs_type_by_mode[(mode & S_IFMT) >> S_SHIFT]; } +unsigned char get_de_type(struct f2fs_dir_entry *de) +{ + if (de->file_type < F2FS_FT_MAX) + return f2fs_filetype_table[de->file_type]; + return DT_UNKNOWN; +} + static unsigned long dir_block_index(unsigned int level, int dir_level, unsigned int idx) { @@ -95,11 +101,6 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page, else kunmap(dentry_page); - /* - * For the most part, it should be a bug when name_len is zero. - * We stop here for figuring out where the bugs has occurred. 
- */ - f2fs_bug_on(F2FS_P_SB(dentry_page), d.max < 0); return de; } @@ -124,6 +125,11 @@ struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *fname, de = &d->dentry[bit_pos]; + if (unlikely(!de->name_len)) { + bit_pos++; + continue; + } + /* encrypted case */ de_name.name = d->filename[bit_pos]; de_name.len = le16_to_cpu(de->name_len); @@ -141,10 +147,6 @@ struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *fname, *max_slots = max_len; max_len = 0; - /* remain bug on condition */ - if (unlikely(!de->name_len)) - d->max = -1; - bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); } @@ -183,8 +185,13 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, /* no need to allocate new dentry pages to all the indices */ dentry_page = find_data_page(dir, bidx); if (IS_ERR(dentry_page)) { - room = true; - continue; + if (PTR_ERR(dentry_page) == -ENOENT) { + room = true; + continue; + } else { + *res_page = dentry_page; + break; + } } de = find_in_block(dentry_page, fname, namehash, &max_slots, @@ -221,19 +228,22 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, struct fscrypt_name fname; int err; - *res_page = NULL; - err = fscrypt_setup_filename(dir, child, 1, &fname); - if (err) + if (err) { + *res_page = ERR_PTR(err); return NULL; + } if (f2fs_has_inline_dentry(dir)) { + *res_page = NULL; de = find_in_inline_dir(dir, &fname, res_page); goto out; } - if (npages == 0) + if (npages == 0) { + *res_page = NULL; goto out; + } max_depth = F2FS_I(dir)->i_current_depth; if (unlikely(max_depth > MAX_DIR_HASH_DEPTH)) { @@ -241,13 +251,13 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, "Corrupted max_depth of %lu: %u", dir->i_ino, max_depth); max_depth = MAX_DIR_HASH_DEPTH; - F2FS_I(dir)->i_current_depth = max_depth; - mark_inode_dirty(dir); + f2fs_i_depth_write(dir, max_depth); } for (level = 0; level < max_depth; level++) { + *res_page = NULL; de = find_in_level(dir, level, &fname, res_page); - if (de) + if (de || IS_ERR(*res_page)) break; } out: @@ -257,35 +267,22 @@ out: struct f2fs_dir_entry *f2fs_parent_dir(struct inode *dir, struct page **p) { - struct page *page; - struct f2fs_dir_entry *de; - struct f2fs_dentry_block *dentry_blk; - - if (f2fs_has_inline_dentry(dir)) - return f2fs_parent_inline_dir(dir, p); - - page = get_lock_data_page(dir, 0, false); - if (IS_ERR(page)) - return NULL; + struct qstr dotdot = QSTR_INIT("..", 2); - dentry_blk = kmap(page); - de = &dentry_blk->dentry[1]; - *p = page; - unlock_page(page); - return de; + return f2fs_find_entry(dir, &dotdot, p); } -ino_t f2fs_inode_by_name(struct inode *dir, struct qstr *qstr) +ino_t f2fs_inode_by_name(struct inode *dir, struct qstr *qstr, + struct page **page) { ino_t res = 0; struct f2fs_dir_entry *de; - struct page *page; - de = f2fs_find_entry(dir, qstr, &page); + de = f2fs_find_entry(dir, qstr, page); if (de) { res = le32_to_cpu(de->ino); - f2fs_dentry_kunmap(dir, page); - f2fs_put_page(page, 0); + f2fs_dentry_kunmap(dir, *page); + f2fs_put_page(*page, 0); } return res; @@ -301,9 +298,9 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, set_de_type(de, inode->i_mode); f2fs_dentry_kunmap(dir, page); set_page_dirty(page); - dir->i_mtime = dir->i_ctime = CURRENT_TIME; - mark_inode_dirty(dir); + dir->i_mtime = dir->i_ctime = CURRENT_TIME; + f2fs_mark_inode_dirty_sync(dir); f2fs_put_page(page, 1); } @@ -383,15 +380,20 @@ struct page *init_inode_metadata(struct inode *inode, struct inode *dir, struct page *page; int err; - if (is_inode_flag_set(F2FS_I(inode), 
FI_NEW_INODE)) { + if (is_inode_flag_set(inode, FI_NEW_INODE)) { page = new_inode_page(inode); if (IS_ERR(page)) return page; if (S_ISDIR(inode->i_mode)) { + /* in order to handle error case */ + get_page(page); err = make_empty_dir(inode, dir, page); - if (err) - goto error; + if (err) { + lock_page(page); + goto put_error; + } + put_page(page); } err = f2fs_init_acl(inode, dir, page, dpage); @@ -422,7 +424,7 @@ struct page *init_inode_metadata(struct inode *inode, struct inode *dir, * This file should be checkpointed during fsync. * We lost i_pino from now on. */ - if (is_inode_flag_set(F2FS_I(inode), FI_INC_LINK)) { + if (is_inode_flag_set(inode, FI_INC_LINK)) { file_lost_pino(inode); /* * If link the tmpfile to alias through linkat path, @@ -430,41 +432,33 @@ struct page *init_inode_metadata(struct inode *inode, struct inode *dir, */ if (inode->i_nlink == 0) remove_orphan_inode(F2FS_I_SB(dir), inode->i_ino); - inc_nlink(inode); + f2fs_i_links_write(inode, true); } return page; put_error: + clear_nlink(inode); + update_inode(inode, page); f2fs_put_page(page, 1); -error: - /* once the failed inode becomes a bad inode, i_mode is S_IFREG */ - truncate_inode_pages(&inode->i_data, 0); - truncate_blocks(inode, 0, false); - remove_dirty_inode(inode); - remove_inode_page(inode); return ERR_PTR(err); } void update_parent_metadata(struct inode *dir, struct inode *inode, unsigned int current_depth) { - if (inode && is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) { - if (S_ISDIR(inode->i_mode)) { - inc_nlink(dir); - set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); - } - clear_inode_flag(F2FS_I(inode), FI_NEW_INODE); + if (inode && is_inode_flag_set(inode, FI_NEW_INODE)) { + if (S_ISDIR(inode->i_mode)) + f2fs_i_links_write(dir, true); + clear_inode_flag(inode, FI_NEW_INODE); } dir->i_mtime = dir->i_ctime = CURRENT_TIME; - mark_inode_dirty(dir); + f2fs_mark_inode_dirty_sync(dir); - if (F2FS_I(dir)->i_current_depth != current_depth) { - F2FS_I(dir)->i_current_depth = current_depth; - set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); - } + if (F2FS_I(dir)->i_current_depth != current_depth) + f2fs_i_depth_write(dir, current_depth); - if (inode && is_inode_flag_set(F2FS_I(inode), FI_INC_LINK)) - clear_inode_flag(F2FS_I(inode), FI_INC_LINK); + if (inode && is_inode_flag_set(inode, FI_INC_LINK)) + clear_inode_flag(inode, FI_INC_LINK); } int room_for_filename(const void *bitmap, int slots, int max_slots) @@ -509,11 +503,7 @@ void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *d, } } -/* - * Caller should grab and release a rwsem by calling f2fs_lock_op() and - * f2fs_unlock_op(). 
- */ -int __f2fs_add_link(struct inode *dir, const struct qstr *name, +int f2fs_add_regular_entry(struct inode *dir, const struct qstr *new_name, struct inode *inode, nid_t ino, umode_t mode) { unsigned int bit_pos; @@ -526,28 +516,11 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name, struct f2fs_dentry_block *dentry_blk = NULL; struct f2fs_dentry_ptr d; struct page *page = NULL; - struct fscrypt_name fname; - struct qstr new_name; - int slots, err; - - err = fscrypt_setup_filename(dir, name, 0, &fname); - if (err) - return err; - - new_name.name = fname_name(&fname); - new_name.len = fname_len(&fname); - - if (f2fs_has_inline_dentry(dir)) { - err = f2fs_add_inline_entry(dir, &new_name, inode, ino, mode); - if (!err || err != -EAGAIN) - goto out; - else - err = 0; - } + int slots, err = 0; level = 0; - slots = GET_DENTRY_SLOTS(new_name.len); - dentry_hash = f2fs_dentry_hash(&new_name); + slots = GET_DENTRY_SLOTS(new_name->len); + dentry_hash = f2fs_dentry_hash(new_name); current_depth = F2FS_I(dir)->i_current_depth; if (F2FS_I(dir)->chash == dentry_hash) { @@ -556,10 +529,12 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name, } start: - if (unlikely(current_depth == MAX_DIR_HASH_DEPTH)) { - err = -ENOSPC; - goto out; - } +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(FAULT_DIR_DEPTH)) + return -ENOSPC; +#endif + if (unlikely(current_depth == MAX_DIR_HASH_DEPTH)) + return -ENOSPC; /* Increase the depth, if required */ if (level == current_depth) @@ -573,10 +548,8 @@ start: for (block = bidx; block <= (bidx + nblock - 1); block++) { dentry_page = get_new_data_page(dir, NULL, block, true); - if (IS_ERR(dentry_page)) { - err = PTR_ERR(dentry_page); - goto out; - } + if (IS_ERR(dentry_page)) + return PTR_ERR(dentry_page); dentry_blk = kmap(dentry_page); bit_pos = room_for_filename(&dentry_blk->dentry_bitmap, @@ -596,7 +569,7 @@ add_dentry: if (inode) { down_write(&F2FS_I(inode)->i_sem); - page = init_inode_metadata(inode, dir, &new_name, NULL); + page = init_inode_metadata(inode, dir, new_name, NULL); if (IS_ERR(page)) { err = PTR_ERR(page); goto fail; @@ -606,14 +579,12 @@ add_dentry: } make_dentry_ptr(NULL, &d, (void *)dentry_blk, 1); - f2fs_update_dentry(ino, mode, &d, &new_name, dentry_hash, bit_pos); + f2fs_update_dentry(ino, mode, &d, new_name, dentry_hash, bit_pos); set_page_dirty(dentry_page); if (inode) { - /* we don't need to mark_inode_dirty now */ - F2FS_I(inode)->i_pino = dir->i_ino; - update_inode(inode, page); + f2fs_i_pino_write(inode, dir->i_ino); f2fs_put_page(page, 1); } @@ -622,13 +593,36 @@ fail: if (inode) up_write(&F2FS_I(inode)->i_sem); - if (is_inode_flag_set(F2FS_I(dir), FI_UPDATE_DIR)) { - update_inode_page(dir); - clear_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); - } kunmap(dentry_page); f2fs_put_page(dentry_page, 1); -out: + + return err; +} + +/* + * Caller should grab and release a rwsem by calling f2fs_lock_op() and + * f2fs_unlock_op(). 
+ */ +int __f2fs_add_link(struct inode *dir, const struct qstr *name, + struct inode *inode, nid_t ino, umode_t mode) +{ + struct fscrypt_name fname; + struct qstr new_name; + int err; + + err = fscrypt_setup_filename(dir, name, 0, &fname); + if (err) + return err; + + new_name.name = fname_name(&fname); + new_name.len = fname_len(&fname); + + err = -EAGAIN; + if (f2fs_has_inline_dentry(dir)) + err = f2fs_add_inline_entry(dir, &new_name, inode, ino, mode); + if (err == -EAGAIN) + err = f2fs_add_regular_entry(dir, &new_name, inode, ino, mode); + fscrypt_free_filename(&fname); f2fs_update_time(F2FS_I_SB(dir), REQ_TIME); return err; @@ -645,42 +639,34 @@ int f2fs_do_tmpfile(struct inode *inode, struct inode *dir) err = PTR_ERR(page); goto fail; } - /* we don't need to mark_inode_dirty now */ - update_inode(inode, page); f2fs_put_page(page, 1); - clear_inode_flag(F2FS_I(inode), FI_NEW_INODE); + clear_inode_flag(inode, FI_NEW_INODE); fail: up_write(&F2FS_I(inode)->i_sem); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); return err; } -void f2fs_drop_nlink(struct inode *dir, struct inode *inode, struct page *page) +void f2fs_drop_nlink(struct inode *dir, struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); down_write(&F2FS_I(inode)->i_sem); - if (S_ISDIR(inode->i_mode)) { - drop_nlink(dir); - if (page) - update_inode(dir, page); - else - update_inode_page(dir); - } + if (S_ISDIR(inode->i_mode)) + f2fs_i_links_write(dir, false); inode->i_ctime = CURRENT_TIME; - drop_nlink(inode); + f2fs_i_links_write(inode, false); if (S_ISDIR(inode->i_mode)) { - drop_nlink(inode); - i_size_write(inode, 0); + f2fs_i_links_write(inode, false); + f2fs_i_size_write(inode, 0); } up_write(&F2FS_I(inode)->i_sem); - update_inode_page(inode); if (inode->i_nlink == 0) - add_orphan_inode(sbi, inode->i_ino); + add_orphan_inode(inode); else release_orphan_inode(sbi); } @@ -718,9 +704,10 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, set_page_dirty(page); dir->i_ctime = dir->i_mtime = CURRENT_TIME; + f2fs_mark_inode_dirty_sync(dir); if (inode) - f2fs_drop_nlink(dir, inode, NULL); + f2fs_drop_nlink(dir, inode); if (bit_pos == NR_DENTRY_IN_BLOCK && !truncate_hole(dir, page->index, page->index + 1)) { @@ -792,10 +779,7 @@ bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, continue; } - if (de->file_type < F2FS_FT_MAX) - d_type = f2fs_filetype_table[de->file_type]; - else - d_type = DT_UNKNOWN; + d_type = get_de_type(de); de_name.name = d->filename[bit_pos]; de_name.len = le16_to_cpu(de->name_len); @@ -804,7 +788,7 @@ bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, int save_len = fstr->len; int ret; - de_name.name = kmalloc(de_name.len, GFP_NOFS); + de_name.name = f2fs_kmalloc(de_name.len, GFP_NOFS); if (!de_name.name) return false; @@ -887,6 +871,7 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) kunmap(dentry_page); f2fs_put_page(dentry_page, 1); } + err = 0; out: fscrypt_fname_free_buffer(&fstr); return err; diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index c859bb044728..2b06d4fcd954 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -170,8 +170,10 @@ static void __drop_largest_extent(struct inode *inode, { struct extent_info *largest = &F2FS_I(inode)->extent_tree->largest; - if (fofs < largest->fofs + largest->len && fofs + len > largest->fofs) + if (fofs < largest->fofs + largest->len && fofs + len > largest->fofs) { largest->len = 0; + f2fs_mark_inode_dirty_sync(inode); + } } /* 
return true, if inode page is changed */ @@ -196,8 +198,7 @@ bool f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext) if (!i_ext || !i_ext->len) return false; - set_extent_info(&ei, le32_to_cpu(i_ext->fofs), - le32_to_cpu(i_ext->blk), le32_to_cpu(i_ext->len)); + get_extent_info(&ei, i_ext); write_lock(&et->lock); if (atomic_read(&et->node_cnt)) @@ -336,11 +337,12 @@ lookup_neighbors: return en; } -static struct extent_node *__try_merge_extent_node(struct f2fs_sb_info *sbi, +static struct extent_node *__try_merge_extent_node(struct inode *inode, struct extent_tree *et, struct extent_info *ei, struct extent_node *prev_ex, struct extent_node *next_ex) { + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct extent_node *en = NULL; if (prev_ex && __is_back_mergeable(ei, &prev_ex->ei)) { @@ -361,7 +363,7 @@ static struct extent_node *__try_merge_extent_node(struct f2fs_sb_info *sbi, if (!en) return NULL; - __try_update_largest_extent(et, en); + __try_update_largest_extent(inode, et, en); spin_lock(&sbi->extent_lock); if (!list_empty(&en->list)) { @@ -372,11 +374,12 @@ static struct extent_node *__try_merge_extent_node(struct f2fs_sb_info *sbi, return en; } -static struct extent_node *__insert_extent_tree(struct f2fs_sb_info *sbi, +static struct extent_node *__insert_extent_tree(struct inode *inode, struct extent_tree *et, struct extent_info *ei, struct rb_node **insert_p, struct rb_node *insert_parent) { + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct rb_node **p = &et->root.rb_node; struct rb_node *parent = NULL; struct extent_node *en = NULL; @@ -403,7 +406,7 @@ do_insert: if (!en) return NULL; - __try_update_largest_extent(et, en); + __try_update_largest_extent(inode, et, en); /* update in global extent list */ spin_lock(&sbi->extent_lock); @@ -432,7 +435,7 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode, write_lock(&et->lock); - if (is_inode_flag_set(F2FS_I(inode), FI_NO_EXTENT)) { + if (is_inode_flag_set(inode, FI_NO_EXTENT)) { write_unlock(&et->lock); return false; } @@ -474,7 +477,7 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode, set_extent_info(&ei, end, end - dei.fofs + dei.blk, org_end - end); - en1 = __insert_extent_tree(sbi, et, &ei, + en1 = __insert_extent_tree(inode, et, &ei, NULL, NULL); next_en = en1; } else { @@ -495,7 +498,7 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode, } if (parts) - __try_update_largest_extent(et, en); + __try_update_largest_extent(inode, et, en); else __release_extent_node(sbi, et, en); @@ -515,20 +518,20 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode, if (blkaddr) { set_extent_info(&ei, fofs, blkaddr, len); - if (!__try_merge_extent_node(sbi, et, &ei, prev_en, next_en)) - __insert_extent_tree(sbi, et, &ei, + if (!__try_merge_extent_node(inode, et, &ei, prev_en, next_en)) + __insert_extent_tree(inode, et, &ei, insert_p, insert_parent); /* give up extent_cache, if split and small updates happen */ if (dei.len >= 1 && prev.len < F2FS_MIN_EXTENT_LEN && et->largest.len < F2FS_MIN_EXTENT_LEN) { - et->largest.len = 0; - set_inode_flag(F2FS_I(inode), FI_NO_EXTENT); + __drop_largest_extent(inode, 0, UINT_MAX); + set_inode_flag(inode, FI_NO_EXTENT); } } - if (is_inode_flag_set(F2FS_I(inode), FI_NO_EXTENT)) + if (is_inode_flag_set(inode, FI_NO_EXTENT)) __free_extent_tree(sbi, et); write_unlock(&et->lock); @@ -628,6 +631,19 @@ unsigned int f2fs_destroy_extent_node(struct inode *inode) return node_cnt; } +void f2fs_drop_extent_tree(struct 
inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_tree *et = F2FS_I(inode)->extent_tree; + + set_inode_flag(inode, FI_NO_EXTENT); + + write_lock(&et->lock); + __free_extent_tree(sbi, et); + __drop_largest_extent(inode, 0, UINT_MAX); + write_unlock(&et->lock); +} + void f2fs_destroy_extent_tree(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -686,9 +702,7 @@ void f2fs_update_extent_cache(struct dnode_of_data *dn) fofs = start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) + dn->ofs_in_node; - - if (f2fs_update_extent_tree_range(dn->inode, fofs, blkaddr, 1)) - sync_inode_page(dn); + f2fs_update_extent_tree_range(dn->inode, fofs, blkaddr, 1); } void f2fs_update_extent_cache_range(struct dnode_of_data *dn, @@ -698,8 +712,7 @@ void f2fs_update_extent_cache_range(struct dnode_of_data *dn, if (!f2fs_may_extent_tree(dn->inode)) return; - if (f2fs_update_extent_tree_range(dn->inode, fofs, blkaddr, len)) - sync_inode_page(dn); + f2fs_update_extent_tree_range(dn->inode, fofs, blkaddr, len); } void init_extent_cache_info(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 7a4558d17f36..7890e9071499 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -37,6 +37,60 @@ } while (0) #endif +#ifdef CONFIG_F2FS_FAULT_INJECTION +enum { + FAULT_KMALLOC, + FAULT_PAGE_ALLOC, + FAULT_ALLOC_NID, + FAULT_ORPHAN, + FAULT_BLOCK, + FAULT_DIR_DEPTH, + FAULT_EVICT_INODE, + FAULT_MAX, +}; + +struct f2fs_fault_info { + atomic_t inject_ops; + unsigned int inject_rate; + unsigned int inject_type; +}; + +extern struct f2fs_fault_info f2fs_fault; +extern char *fault_name[FAULT_MAX]; +#define IS_FAULT_SET(type) (f2fs_fault.inject_type & (1 << (type))) + +static inline bool time_to_inject(int type) +{ + if (!f2fs_fault.inject_rate) + return false; + if (type == FAULT_KMALLOC && !IS_FAULT_SET(type)) + return false; + else if (type == FAULT_PAGE_ALLOC && !IS_FAULT_SET(type)) + return false; + else if (type == FAULT_ALLOC_NID && !IS_FAULT_SET(type)) + return false; + else if (type == FAULT_ORPHAN && !IS_FAULT_SET(type)) + return false; + else if (type == FAULT_BLOCK && !IS_FAULT_SET(type)) + return false; + else if (type == FAULT_DIR_DEPTH && !IS_FAULT_SET(type)) + return false; + else if (type == FAULT_EVICT_INODE && !IS_FAULT_SET(type)) + return false; + + atomic_inc(&f2fs_fault.inject_ops); + if (atomic_read(&f2fs_fault.inject_ops) >= f2fs_fault.inject_rate) { + atomic_set(&f2fs_fault.inject_ops, 0); + printk("%sF2FS-fs : inject %s in %pF\n", + KERN_INFO, + fault_name[type], + __builtin_return_address(0)); + return true; + } + return false; +} +#endif + /* * For mount options */ @@ -56,6 +110,9 @@ #define F2FS_MOUNT_EXTENT_CACHE 0x00002000 #define F2FS_MOUNT_FORCE_FG_GC 0x00004000 #define F2FS_MOUNT_DATA_FLUSH 0x00008000 +#define F2FS_MOUNT_FAULT_INJECTION 0x00010000 +#define F2FS_MOUNT_ADAPTIVE 0x00020000 +#define F2FS_MOUNT_LFS 0x00040000 #define clear_opt(sbi, option) (sbi->mount_opt.opt &= ~F2FS_MOUNT_##option) #define set_opt(sbi, option) (sbi->mount_opt.opt |= F2FS_MOUNT_##option) @@ -76,6 +133,7 @@ struct f2fs_mount_info { }; #define F2FS_FEATURE_ENCRYPT 0x0001 +#define F2FS_FEATURE_HMSMR 0x0002 #define F2FS_HAS_FEATURE(sb, mask) \ ((F2FS_SB(sb)->raw_super->feature & cpu_to_le32(mask)) != 0) @@ -106,7 +164,7 @@ enum { #define BATCHED_TRIM_BLOCKS(sbi) \ (BATCHED_TRIM_SEGMENTS(sbi) << (sbi)->log_blocks_per_seg) #define DEF_CP_INTERVAL 60 /* 60 secs */ -#define DEF_IDLE_INTERVAL 120 /* 2 mins */ +#define DEF_IDLE_INTERVAL 5 /* 5 secs */ struct 
cp_control { int reason; @@ -159,7 +217,6 @@ struct fsync_inode_entry { struct inode *inode; /* vfs inode pointer */ block_t blkaddr; /* block address locating the last fsync */ block_t last_dentry; /* block address locating the last dentry */ - block_t last_inode; /* block address locating the last inode */ }; #define nats_in_cursum(jnl) (le16_to_cpu(jnl->n_nats)) @@ -211,6 +268,8 @@ static inline bool __has_cursum_space(struct f2fs_journal *journal, #define F2FS_IOC_GARBAGE_COLLECT _IO(F2FS_IOCTL_MAGIC, 6) #define F2FS_IOC_WRITE_CHECKPOINT _IO(F2FS_IOCTL_MAGIC, 7) #define F2FS_IOC_DEFRAGMENT _IO(F2FS_IOCTL_MAGIC, 8) +#define F2FS_IOC_MOVE_RANGE _IOWR(F2FS_IOCTL_MAGIC, 9, \ + struct f2fs_move_range) #define F2FS_IOC_SET_ENCRYPTION_POLICY FS_IOC_SET_ENCRYPTION_POLICY #define F2FS_IOC_GET_ENCRYPTION_POLICY FS_IOC_GET_ENCRYPTION_POLICY @@ -240,6 +299,13 @@ struct f2fs_defragment { u64 len; }; +struct f2fs_move_range { + u32 dst_fd; /* destination fd */ + u64 pos_in; /* start position in src_fd */ + u64 pos_out; /* start position in dst_fd */ + u64 len; /* size to move */ +}; + /* * For INODE and NODE manager */ @@ -385,24 +451,27 @@ struct f2fs_inode_info { /* Use below internally in f2fs*/ unsigned long flags; /* use to pass per-file flags */ struct rw_semaphore i_sem; /* protect fi info */ - atomic_t dirty_pages; /* # of dirty pages */ + struct percpu_counter dirty_pages; /* # of dirty pages */ f2fs_hash_t chash; /* hash value of given file name */ unsigned int clevel; /* maximum level of given file name */ nid_t i_xattr_nid; /* node id that contains xattrs */ unsigned long long xattr_ver; /* cp version of xattr modification */ + loff_t last_disk_size; /* lastly written file size */ - struct list_head dirty_list; /* linked in global dirty list */ + struct list_head dirty_list; /* dirty list for dirs and files */ + struct list_head gdirty_list; /* linked in global dirty list */ struct list_head inmem_pages; /* inmemory pages managed by f2fs */ struct mutex inmem_lock; /* lock for inmemory pages */ struct extent_tree *extent_tree; /* cached extent_tree entry */ + struct rw_semaphore dio_rwsem[2];/* avoid racing between dio and gc */ }; static inline void get_extent_info(struct extent_info *ext, - struct f2fs_extent i_ext) + struct f2fs_extent *i_ext) { - ext->fofs = le32_to_cpu(i_ext.fofs); - ext->blk = le32_to_cpu(i_ext.blk); - ext->len = le32_to_cpu(i_ext.len); + ext->fofs = le32_to_cpu(i_ext->fofs); + ext->blk = le32_to_cpu(i_ext->blk); + ext->len = le32_to_cpu(i_ext->len); } static inline void set_raw_extent(struct extent_info *ext, @@ -447,11 +516,14 @@ static inline bool __is_front_mergeable(struct extent_info *cur, return __is_extent_mergeable(cur, front); } -static inline void __try_update_largest_extent(struct extent_tree *et, - struct extent_node *en) +extern void f2fs_mark_inode_dirty_sync(struct inode *); +static inline void __try_update_largest_extent(struct inode *inode, + struct extent_tree *et, struct extent_node *en) { - if (en->ei.len > et->largest.len) + if (en->ei.len > et->largest.len) { et->largest = en->ei; + f2fs_mark_inode_dirty_sync(inode); + } } struct f2fs_nm_info { @@ -466,7 +538,7 @@ struct f2fs_nm_info { /* NAT cache management */ struct radix_tree_root nat_root;/* root of the nat entry cache */ struct radix_tree_root nat_set_root;/* root of the nat set cache */ - struct rw_semaphore nat_tree_lock; /* protect nat_tree_lock */ + struct percpu_rw_semaphore nat_tree_lock; /* protect nat_tree_lock */ struct list_head nat_entries; /* cached nat entry list (clean) */ 
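/*
 * [Editorial aside — illustrative sketch, not part of the diff itself.]
 * The hunk above converts nat_tree_lock from a plain rw_semaphore to a
 * percpu_rw_semaphore: readers take a cheap per-CPU fast path, and only
 * the rare writer pays for synchronizing all CPUs. A minimal sketch of
 * the reader side under that scheme; demo_lookup_nat() is a
 * hypothetical helper, but nat_root and the lock are the fields shown
 * above.
 */
static struct nat_entry *demo_lookup_nat(struct f2fs_nm_info *nm_i, nid_t nid)
{
	struct nat_entry *ne;

	percpu_down_read(&nm_i->nat_tree_lock);	/* per-CPU fast path */
	ne = radix_tree_lookup(&nm_i->nat_root, nid);
	percpu_up_read(&nm_i->nat_tree_lock);

	return ne;
}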
unsigned int nat_cnt; /* the # of cached nat entries */ unsigned int dirty_nat_cnt; /* total num of nat entries in set */ @@ -548,6 +620,7 @@ struct flush_cmd { struct flush_cmd_control { struct task_struct *f2fs_issue_flush; /* flush thread */ wait_queue_head_t flush_wait_queue; /* waiting queue for wake-up */ + atomic_t submit_flush; /* # of issued flushes */ struct llist_head issue_list; /* list for command issue */ struct llist_node *dispatch_list; /* list for command dispatch */ }; @@ -599,12 +672,12 @@ struct f2fs_sm_info { * dirty dentry blocks, dirty node blocks, and dirty meta blocks. */ enum count_type { - F2FS_WRITEBACK, F2FS_DIRTY_DENTS, F2FS_DIRTY_DATA, F2FS_DIRTY_NODES, F2FS_DIRTY_META, F2FS_INMEM_PAGES, + F2FS_DIRTY_IMETA, NR_COUNT_TYPE, }; @@ -636,14 +709,15 @@ enum page_type { struct f2fs_io_info { struct f2fs_sb_info *sbi; /* f2fs_sb_info pointer */ enum page_type type; /* contains DATA/NODE/META/META_FLUSH */ - int rw; /* contains R/RS/W/WS with REQ_META/REQ_PRIO */ + int op; /* contains REQ_OP_ */ + int op_flags; /* rq_flag_bits */ block_t new_blkaddr; /* new block address to be written */ block_t old_blkaddr; /* old block address before Cow */ struct page *page; /* page to be written */ struct page *encrypted_page; /* encrypted page */ }; -#define is_read_io(rw) (((rw) & 1) == READ) +#define is_read_io(rw) (rw == READ) struct f2fs_bio_info { struct f2fs_sb_info *sbi; /* f2fs superblock */ struct bio *bio; /* bios to merge */ @@ -655,6 +729,7 @@ struct f2fs_bio_info { enum inode_type { DIR_INODE, /* for dirty dir inode */ FILE_INODE, /* for dirty regular/symlink inode */ + DIRTY_META, /* for all dirtied inode metadata */ NR_INODE_TYPE, }; @@ -672,6 +747,7 @@ enum { SBI_IS_CLOSE, /* specify unmounting */ SBI_NEED_FSCK, /* need fsck.f2fs to fix */ SBI_POR_DOING, /* recovery is doing or not */ + SBI_NEED_SB_WRITE, /* need to recover superblock */ }; enum { @@ -680,6 +756,10 @@ enum { MAX_TIME, }; +#ifdef CONFIG_F2FS_FS_ENCRYPTION +#define F2FS_KEY_DESC_PREFIX "f2fs:" +#define F2FS_KEY_DESC_PREFIX_SIZE 5 +#endif struct f2fs_sb_info { struct super_block *sb; /* pointer to VFS super block */ struct proc_dir_entry *s_proc; /* proc entry */ @@ -687,6 +767,10 @@ struct f2fs_sb_info { int valid_super_block; /* valid super block no */ int s_flag; /* flags for sbi */ +#ifdef CONFIG_F2FS_FS_ENCRYPTION + u8 key_prefix[F2FS_KEY_DESC_PREFIX_SIZE]; + u8 key_prefix_size; +#endif /* for node-related operations */ struct f2fs_nm_info *nm_info; /* node manager */ struct inode *node_inode; /* cache node blocks */ @@ -697,14 +781,14 @@ struct f2fs_sb_info { /* for bio operations */ struct f2fs_bio_info read_io; /* for read bios */ struct f2fs_bio_info write_io[NR_PAGE_TYPE]; /* for write bios */ + struct mutex wio_mutex[NODE + 1]; /* bio ordering for NODE/DATA */ /* for checkpoint */ struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */ struct inode *meta_inode; /* cache meta blocks */ struct mutex cp_mutex; /* checkpoint procedure lock */ - struct rw_semaphore cp_rwsem; /* blocking FS operations */ + struct percpu_rw_semaphore cp_rwsem; /* blocking FS operations */ struct rw_semaphore node_write; /* locking node writes */ - struct mutex writepages; /* mutex for writepages() */ wait_queue_head_t cp_wait; unsigned long last_time[MAX_TIME]; /* to store time in jiffies */ long interval_time[MAX_TIME]; /* to store thresholds */ @@ -742,18 +826,24 @@ struct f2fs_sb_info { unsigned int total_sections; /* total section count */ unsigned int total_node_count; /* total node block count */ 
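/*
 * [Editorial aside — illustrative sketch, not part of the diff itself.]
 * The f2fs_io_info hunk above follows the block layer's split of the
 * old "rw" word into a pure REQ_OP_* operation plus separate
 * rq_flag_bits, which is also why is_read_io() becomes a direct
 * comparison instead of masking bit 0. A hedged sketch of filling the
 * two fields for a synchronous data write; demo_fill_fio() and the
 * WRITE_SYNC flag choice are assumptions for illustration only.
 */
static void demo_fill_fio(struct f2fs_sb_info *sbi, struct page *page,
				struct f2fs_io_info *fio)
{
	fio->sbi = sbi;
	fio->type = DATA;
	fio->op = REQ_OP_WRITE;		/* what to do: one REQ_OP_ value */
	fio->op_flags = WRITE_SYNC;	/* how to do it: rq_flag_bits */
	fio->page = page;
	fio->encrypted_page = NULL;	/* plain-text I/O in this sketch */
}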
unsigned int total_valid_node_count; /* valid node block count */ - unsigned int total_valid_inode_count; /* valid inode count */ loff_t max_file_blocks; /* max block index of file */ int active_logs; /* # of active logs */ int dir_level; /* directory level */ block_t user_block_count; /* # of user blocks */ block_t total_valid_block_count; /* # of valid blocks */ - block_t alloc_valid_block_count; /* # of allocated blocks */ block_t discard_blks; /* discard command candidates */ block_t last_valid_block_count; /* for recovery */ u32 s_next_generation; /* for NFS support */ - atomic_t nr_pages[NR_COUNT_TYPE]; /* # of pages, see count_type */ + atomic_t nr_wb_bios; /* # of writeback bios */ + + /* # of pages, see count_type */ + struct percpu_counter nr_pages[NR_COUNT_TYPE]; + /* # of allocated blocks */ + struct percpu_counter alloc_valid_block_count; + + /* valid inode count */ + struct percpu_counter total_valid_inode_count; struct f2fs_mount_info mount_opt; /* mount options */ @@ -984,22 +1074,22 @@ static inline void clear_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) static inline void f2fs_lock_op(struct f2fs_sb_info *sbi) { - down_read(&sbi->cp_rwsem); + percpu_down_read(&sbi->cp_rwsem); } static inline void f2fs_unlock_op(struct f2fs_sb_info *sbi) { - up_read(&sbi->cp_rwsem); + percpu_up_read(&sbi->cp_rwsem); } static inline void f2fs_lock_all(struct f2fs_sb_info *sbi) { - down_write(&sbi->cp_rwsem); + percpu_down_write(&sbi->cp_rwsem); } static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi) { - up_write(&sbi->cp_rwsem); + percpu_up_write(&sbi->cp_rwsem); } static inline int __get_cp_reason(struct f2fs_sb_info *sbi) @@ -1054,22 +1144,37 @@ static inline bool f2fs_has_xattr_block(unsigned int ofs) return ofs == XATTR_NODE_OFFSET; } +static inline void f2fs_i_blocks_write(struct inode *, blkcnt_t, bool); static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi, - struct inode *inode, blkcnt_t count) + struct inode *inode, blkcnt_t *count) { - block_t valid_block_count; + blkcnt_t diff; - spin_lock(&sbi->stat_lock); - valid_block_count = - sbi->total_valid_block_count + (block_t)count; - if (unlikely(valid_block_count > sbi->user_block_count)) { - spin_unlock(&sbi->stat_lock); +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(FAULT_BLOCK)) return false; +#endif + /* + * let's increase this prior to actual block count change in order + * for f2fs_sync_file to avoid data races when deciding checkpoint. 
+ */ + percpu_counter_add(&sbi->alloc_valid_block_count, (*count)); + + spin_lock(&sbi->stat_lock); + sbi->total_valid_block_count += (block_t)(*count); + if (unlikely(sbi->total_valid_block_count > sbi->user_block_count)) { + diff = sbi->total_valid_block_count - sbi->user_block_count; + *count -= diff; + sbi->total_valid_block_count = sbi->user_block_count; + if (!*count) { + spin_unlock(&sbi->stat_lock); + percpu_counter_sub(&sbi->alloc_valid_block_count, diff); + return false; + } } - inode->i_blocks += count; - sbi->total_valid_block_count = valid_block_count; - sbi->alloc_valid_block_count += (block_t)count; spin_unlock(&sbi->stat_lock); + + f2fs_i_blocks_write(inode, *count, true); return true; } @@ -1080,27 +1185,27 @@ static inline void dec_valid_block_count(struct f2fs_sb_info *sbi, spin_lock(&sbi->stat_lock); f2fs_bug_on(sbi, sbi->total_valid_block_count < (block_t) count); f2fs_bug_on(sbi, inode->i_blocks < count); - inode->i_blocks -= count; sbi->total_valid_block_count -= (block_t)count; spin_unlock(&sbi->stat_lock); + f2fs_i_blocks_write(inode, count, false); } static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type) { - atomic_inc(&sbi->nr_pages[count_type]); + percpu_counter_inc(&sbi->nr_pages[count_type]); set_sbi_flag(sbi, SBI_IS_DIRTY); } static inline void inode_inc_dirty_pages(struct inode *inode) { - atomic_inc(&F2FS_I(inode)->dirty_pages); + percpu_counter_inc(&F2FS_I(inode)->dirty_pages); inc_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ? F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA); } static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type) { - atomic_dec(&sbi->nr_pages[count_type]); + percpu_counter_dec(&sbi->nr_pages[count_type]); } static inline void inode_dec_dirty_pages(struct inode *inode) @@ -1109,26 +1214,28 @@ static inline void inode_dec_dirty_pages(struct inode *inode) !S_ISLNK(inode->i_mode)) return; - atomic_dec(&F2FS_I(inode)->dirty_pages); + percpu_counter_dec(&F2FS_I(inode)->dirty_pages); dec_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ? 
F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA); } -static inline int get_pages(struct f2fs_sb_info *sbi, int count_type) +static inline s64 get_pages(struct f2fs_sb_info *sbi, int count_type) { - return atomic_read(&sbi->nr_pages[count_type]); + return percpu_counter_sum_positive(&sbi->nr_pages[count_type]); } -static inline int get_dirty_pages(struct inode *inode) +static inline s64 get_dirty_pages(struct inode *inode) { - return atomic_read(&F2FS_I(inode)->dirty_pages); + return percpu_counter_sum_positive(&F2FS_I(inode)->dirty_pages); } static inline int get_blocktype_secs(struct f2fs_sb_info *sbi, int block_type) { unsigned int pages_per_sec = sbi->segs_per_sec * sbi->blocks_per_seg; - return ((get_pages(sbi, block_type) + pages_per_sec - 1) - >> sbi->log_blocks_per_seg) / sbi->segs_per_sec; + unsigned int segs = (get_pages(sbi, block_type) + pages_per_sec - 1) >> + sbi->log_blocks_per_seg; + + return segs / sbi->segs_per_sec; } static inline block_t valid_user_blocks(struct f2fs_sb_info *sbi) @@ -1215,13 +1322,13 @@ static inline bool inc_valid_node_count(struct f2fs_sb_info *sbi, } if (inode) - inode->i_blocks++; + f2fs_i_blocks_write(inode, 1, true); - sbi->alloc_valid_block_count++; sbi->total_valid_node_count++; sbi->total_valid_block_count++; spin_unlock(&sbi->stat_lock); + percpu_counter_inc(&sbi->alloc_valid_block_count); return true; } @@ -1234,7 +1341,7 @@ static inline void dec_valid_node_count(struct f2fs_sb_info *sbi, f2fs_bug_on(sbi, !sbi->total_valid_node_count); f2fs_bug_on(sbi, !inode->i_blocks); - inode->i_blocks--; + f2fs_i_blocks_write(inode, 1, false); sbi->total_valid_node_count--; sbi->total_valid_block_count--; @@ -1248,28 +1355,30 @@ static inline unsigned int valid_node_count(struct f2fs_sb_info *sbi) static inline void inc_valid_inode_count(struct f2fs_sb_info *sbi) { - spin_lock(&sbi->stat_lock); - f2fs_bug_on(sbi, sbi->total_valid_inode_count == sbi->total_node_count); - sbi->total_valid_inode_count++; - spin_unlock(&sbi->stat_lock); + percpu_counter_inc(&sbi->total_valid_inode_count); } static inline void dec_valid_inode_count(struct f2fs_sb_info *sbi) { - spin_lock(&sbi->stat_lock); - f2fs_bug_on(sbi, !sbi->total_valid_inode_count); - sbi->total_valid_inode_count--; - spin_unlock(&sbi->stat_lock); + percpu_counter_dec(&sbi->total_valid_inode_count); } -static inline unsigned int valid_inode_count(struct f2fs_sb_info *sbi) +static inline s64 valid_inode_count(struct f2fs_sb_info *sbi) { - return sbi->total_valid_inode_count; + return percpu_counter_sum_positive(&sbi->total_valid_inode_count); } static inline struct page *f2fs_grab_cache_page(struct address_space *mapping, pgoff_t index, bool for_write) { +#ifdef CONFIG_F2FS_FAULT_INJECTION + struct page *page = find_lock_page(mapping, index); + if (page) + return page; + + if (time_to_inject(FAULT_PAGE_ALLOC)) + return NULL; +#endif if (!for_write) return grab_cache_page(mapping, index); return grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS); @@ -1429,13 +1538,12 @@ static inline void f2fs_change_bit(unsigned int nr, char *addr) enum { FI_NEW_INODE, /* indicate newly allocated inode */ FI_DIRTY_INODE, /* indicate inode is dirty or not */ + FI_AUTO_RECOVER, /* indicate inode is recoverable */ FI_DIRTY_DIR, /* indicate directory has dirty pages */ FI_INC_LINK, /* need to increment i_nlink */ FI_ACL_MODE, /* indicate acl mode */ FI_NO_ALLOC, /* should not allocate any blocks */ FI_FREE_NID, /* free allocated nide */ - FI_UPDATE_DIR, /* should update inode block for consistency */ - FI_DELAY_IPUT, /* used for 
the recovery */ FI_NO_EXTENT, /* not to use the extent cache */ FI_INLINE_XATTR, /* used for inline xattr */ FI_INLINE_DATA, /* used for inline data*/ @@ -1453,64 +1561,143 @@ enum { FI_DIRTY_FILE, /* indicate regular/symlink has dirty pages */ }; -static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag) +static inline void __mark_inode_dirty_flag(struct inode *inode, + int flag, bool set) +{ + switch (flag) { + case FI_INLINE_XATTR: + case FI_INLINE_DATA: + case FI_INLINE_DENTRY: + if (set) + return; + case FI_DATA_EXIST: + case FI_INLINE_DOTS: + f2fs_mark_inode_dirty_sync(inode); + } +} + +static inline void set_inode_flag(struct inode *inode, int flag) +{ + if (!test_bit(flag, &F2FS_I(inode)->flags)) + set_bit(flag, &F2FS_I(inode)->flags); + __mark_inode_dirty_flag(inode, flag, true); +} + +static inline int is_inode_flag_set(struct inode *inode, int flag) +{ + return test_bit(flag, &F2FS_I(inode)->flags); +} + +static inline void clear_inode_flag(struct inode *inode, int flag) +{ + if (test_bit(flag, &F2FS_I(inode)->flags)) + clear_bit(flag, &F2FS_I(inode)->flags); + __mark_inode_dirty_flag(inode, flag, false); +} + +static inline void set_acl_inode(struct inode *inode, umode_t mode) +{ + F2FS_I(inode)->i_acl_mode = mode; + set_inode_flag(inode, FI_ACL_MODE); + f2fs_mark_inode_dirty_sync(inode); +} + +static inline void f2fs_i_links_write(struct inode *inode, bool inc) +{ + if (inc) + inc_nlink(inode); + else + drop_nlink(inode); + f2fs_mark_inode_dirty_sync(inode); +} + +static inline void f2fs_i_blocks_write(struct inode *inode, + blkcnt_t diff, bool add) +{ + bool clean = !is_inode_flag_set(inode, FI_DIRTY_INODE); + bool recover = is_inode_flag_set(inode, FI_AUTO_RECOVER); + + inode->i_blocks = add ? inode->i_blocks + diff : + inode->i_blocks - diff; + f2fs_mark_inode_dirty_sync(inode); + if (clean || recover) + set_inode_flag(inode, FI_AUTO_RECOVER); +} + +static inline void f2fs_i_size_write(struct inode *inode, loff_t i_size) { - if (!test_bit(flag, &fi->flags)) - set_bit(flag, &fi->flags); + bool clean = !is_inode_flag_set(inode, FI_DIRTY_INODE); + bool recover = is_inode_flag_set(inode, FI_AUTO_RECOVER); + + if (i_size_read(inode) == i_size) + return; + + i_size_write(inode, i_size); + f2fs_mark_inode_dirty_sync(inode); + if (clean || recover) + set_inode_flag(inode, FI_AUTO_RECOVER); +} + +static inline bool f2fs_skip_inode_update(struct inode *inode) +{ + if (!is_inode_flag_set(inode, FI_AUTO_RECOVER)) + return false; + return F2FS_I(inode)->last_disk_size == i_size_read(inode); } -static inline int is_inode_flag_set(struct f2fs_inode_info *fi, int flag) +static inline void f2fs_i_depth_write(struct inode *inode, unsigned int depth) { - return test_bit(flag, &fi->flags); + F2FS_I(inode)->i_current_depth = depth; + f2fs_mark_inode_dirty_sync(inode); } -static inline void clear_inode_flag(struct f2fs_inode_info *fi, int flag) +static inline void f2fs_i_xnid_write(struct inode *inode, nid_t xnid) { - if (test_bit(flag, &fi->flags)) - clear_bit(flag, &fi->flags); + F2FS_I(inode)->i_xattr_nid = xnid; + f2fs_mark_inode_dirty_sync(inode); } -static inline void set_acl_inode(struct f2fs_inode_info *fi, umode_t mode) +static inline void f2fs_i_pino_write(struct inode *inode, nid_t pino) { - fi->i_acl_mode = mode; - set_inode_flag(fi, FI_ACL_MODE); + F2FS_I(inode)->i_pino = pino; + f2fs_mark_inode_dirty_sync(inode); } -static inline void get_inline_info(struct f2fs_inode_info *fi, - struct f2fs_inode *ri) +static inline void get_inline_info(struct inode *inode, struct 
f2fs_inode *ri) { + struct f2fs_inode_info *fi = F2FS_I(inode); + if (ri->i_inline & F2FS_INLINE_XATTR) - set_inode_flag(fi, FI_INLINE_XATTR); + set_bit(FI_INLINE_XATTR, &fi->flags); if (ri->i_inline & F2FS_INLINE_DATA) - set_inode_flag(fi, FI_INLINE_DATA); + set_bit(FI_INLINE_DATA, &fi->flags); if (ri->i_inline & F2FS_INLINE_DENTRY) - set_inode_flag(fi, FI_INLINE_DENTRY); + set_bit(FI_INLINE_DENTRY, &fi->flags); if (ri->i_inline & F2FS_DATA_EXIST) - set_inode_flag(fi, FI_DATA_EXIST); + set_bit(FI_DATA_EXIST, &fi->flags); if (ri->i_inline & F2FS_INLINE_DOTS) - set_inode_flag(fi, FI_INLINE_DOTS); + set_bit(FI_INLINE_DOTS, &fi->flags); } -static inline void set_raw_inline(struct f2fs_inode_info *fi, - struct f2fs_inode *ri) +static inline void set_raw_inline(struct inode *inode, struct f2fs_inode *ri) { ri->i_inline = 0; - if (is_inode_flag_set(fi, FI_INLINE_XATTR)) + if (is_inode_flag_set(inode, FI_INLINE_XATTR)) ri->i_inline |= F2FS_INLINE_XATTR; - if (is_inode_flag_set(fi, FI_INLINE_DATA)) + if (is_inode_flag_set(inode, FI_INLINE_DATA)) ri->i_inline |= F2FS_INLINE_DATA; - if (is_inode_flag_set(fi, FI_INLINE_DENTRY)) + if (is_inode_flag_set(inode, FI_INLINE_DENTRY)) ri->i_inline |= F2FS_INLINE_DENTRY; - if (is_inode_flag_set(fi, FI_DATA_EXIST)) + if (is_inode_flag_set(inode, FI_DATA_EXIST)) ri->i_inline |= F2FS_DATA_EXIST; - if (is_inode_flag_set(fi, FI_INLINE_DOTS)) + if (is_inode_flag_set(inode, FI_INLINE_DOTS)) ri->i_inline |= F2FS_INLINE_DOTS; } static inline int f2fs_has_inline_xattr(struct inode *inode) { - return is_inode_flag_set(F2FS_I(inode), FI_INLINE_XATTR); + return is_inode_flag_set(inode, FI_INLINE_XATTR); } static inline unsigned int addrs_per_inode(struct inode *inode) @@ -1537,43 +1724,43 @@ static inline int inline_xattr_size(struct inode *inode) static inline int f2fs_has_inline_data(struct inode *inode) { - return is_inode_flag_set(F2FS_I(inode), FI_INLINE_DATA); + return is_inode_flag_set(inode, FI_INLINE_DATA); } static inline void f2fs_clear_inline_inode(struct inode *inode) { - clear_inode_flag(F2FS_I(inode), FI_INLINE_DATA); - clear_inode_flag(F2FS_I(inode), FI_DATA_EXIST); + clear_inode_flag(inode, FI_INLINE_DATA); + clear_inode_flag(inode, FI_DATA_EXIST); } static inline int f2fs_exist_data(struct inode *inode) { - return is_inode_flag_set(F2FS_I(inode), FI_DATA_EXIST); + return is_inode_flag_set(inode, FI_DATA_EXIST); } static inline int f2fs_has_inline_dots(struct inode *inode) { - return is_inode_flag_set(F2FS_I(inode), FI_INLINE_DOTS); + return is_inode_flag_set(inode, FI_INLINE_DOTS); } static inline bool f2fs_is_atomic_file(struct inode *inode) { - return is_inode_flag_set(F2FS_I(inode), FI_ATOMIC_FILE); + return is_inode_flag_set(inode, FI_ATOMIC_FILE); } static inline bool f2fs_is_volatile_file(struct inode *inode) { - return is_inode_flag_set(F2FS_I(inode), FI_VOLATILE_FILE); + return is_inode_flag_set(inode, FI_VOLATILE_FILE); } static inline bool f2fs_is_first_block_written(struct inode *inode) { - return is_inode_flag_set(F2FS_I(inode), FI_FIRST_BLOCK_WRITTEN); + return is_inode_flag_set(inode, FI_FIRST_BLOCK_WRITTEN); } static inline bool f2fs_is_drop_cache(struct inode *inode) { - return is_inode_flag_set(F2FS_I(inode), FI_DROP_CACHE); + return is_inode_flag_set(inode, FI_DROP_CACHE); } static inline void *inline_data_addr(struct page *page) @@ -1584,7 +1771,7 @@ static inline void *inline_data_addr(struct page *page) static inline int f2fs_has_inline_dentry(struct inode *inode) { - return is_inode_flag_set(F2FS_I(inode), FI_INLINE_DENTRY); + 
return is_inode_flag_set(inode, FI_INLINE_DENTRY); } static inline void f2fs_dentry_kunmap(struct inode *dir, struct page *page) @@ -1601,11 +1788,13 @@ static inline int is_file(struct inode *inode, int type) static inline void set_file(struct inode *inode, int type) { F2FS_I(inode)->i_advise |= type; + f2fs_mark_inode_dirty_sync(inode); } static inline void clear_file(struct inode *inode, int type) { F2FS_I(inode)->i_advise &= ~type; + f2fs_mark_inode_dirty_sync(inode); } static inline int f2fs_readonly(struct super_block *sb) @@ -1618,12 +1807,6 @@ static inline bool f2fs_cp_error(struct f2fs_sb_info *sbi) return is_set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG); } -static inline void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi) -{ - set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG); - sbi->sb->s_flags |= MS_RDONLY; -} - static inline bool is_dot_dotdot(const struct qstr *str) { if (str->len == 1 && str->name[0] == '.') @@ -1638,12 +1821,21 @@ static inline bool is_dot_dotdot(const struct qstr *str) static inline bool f2fs_may_extent_tree(struct inode *inode) { if (!test_opt(F2FS_I_SB(inode), EXTENT_CACHE) || - is_inode_flag_set(F2FS_I(inode), FI_NO_EXTENT)) + is_inode_flag_set(inode, FI_NO_EXTENT)) return false; return S_ISREG(inode->i_mode); } +static inline void *f2fs_kmalloc(size_t size, gfp_t flags) +{ +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(FAULT_KMALLOC)) + return NULL; +#endif + return kmalloc(size, flags); +} + static inline void *f2fs_kvmalloc(size_t size, gfp_t flags) { void *ret; @@ -1665,7 +1857,7 @@ static inline void *f2fs_kvzalloc(size_t size, gfp_t flags) } #define get_inode_mode(i) \ - ((is_inode_flag_set(F2FS_I(i), FI_ACL_MODE)) ? \ + ((is_inode_flag_set(i, FI_ACL_MODE)) ? \ (F2FS_I(i)->i_acl_mode) : ((i)->i_mode)) /* get offset of first page in next direct node */ @@ -1680,7 +1872,7 @@ static inline void *f2fs_kvzalloc(size_t size, gfp_t flags) int f2fs_sync_file(struct file *, loff_t, loff_t, int); void truncate_data_blocks(struct dnode_of_data *); int truncate_blocks(struct inode *, u64, bool); -int f2fs_truncate(struct inode *, bool); +int f2fs_truncate(struct inode *); int f2fs_getattr(struct vfsmount *, struct dentry *, struct kstat *); int f2fs_setattr(struct dentry *, struct iattr *); int truncate_hole(struct inode *, pgoff_t, pgoff_t); @@ -1710,7 +1902,7 @@ struct dentry *f2fs_get_parent(struct dentry *child); */ extern unsigned char f2fs_filetype_table[F2FS_FT_MAX]; void set_de_type(struct f2fs_dir_entry *, umode_t); - +unsigned char get_de_type(struct f2fs_dir_entry *); struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *, f2fs_hash_t, int *, struct f2fs_dentry_ptr *); bool f2fs_fill_dentries(struct dir_context *, struct f2fs_dentry_ptr *, @@ -1721,16 +1913,18 @@ struct page *init_inode_metadata(struct inode *, struct inode *, const struct qstr *, struct page *); void update_parent_metadata(struct inode *, struct inode *, unsigned int); int room_for_filename(const void *, int, int); -void f2fs_drop_nlink(struct inode *, struct inode *, struct page *); +void f2fs_drop_nlink(struct inode *, struct inode *); struct f2fs_dir_entry *f2fs_find_entry(struct inode *, struct qstr *, struct page **); struct f2fs_dir_entry *f2fs_parent_dir(struct inode *, struct page **); -ino_t f2fs_inode_by_name(struct inode *, struct qstr *); +ino_t f2fs_inode_by_name(struct inode *, struct qstr *, struct page **); void f2fs_set_link(struct inode *, struct f2fs_dir_entry *, struct page *, struct inode *); int update_dent_inode(struct inode *, struct inode *, const 
struct qstr *); void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *, const struct qstr *, f2fs_hash_t , unsigned int); +int f2fs_add_regular_entry(struct inode *, const struct qstr *, + struct inode *, nid_t, umode_t); int __f2fs_add_link(struct inode *, const struct qstr *, struct inode *, nid_t, umode_t); void f2fs_delete_entry(struct f2fs_dir_entry *, struct page *, struct inode *, @@ -1747,6 +1941,8 @@ static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode) /* * super.c */ +int f2fs_inode_dirtied(struct inode *); +void f2fs_inode_synced(struct inode *); int f2fs_commit_super(struct f2fs_sb_info *, bool); int f2fs_sync_fs(struct super_block *, int); extern __printf(3, 4) @@ -1780,8 +1976,11 @@ struct page *new_node_page(struct dnode_of_data *, unsigned int, struct page *); void ra_node_page(struct f2fs_sb_info *, nid_t); struct page *get_node_page(struct f2fs_sb_info *, pgoff_t); struct page *get_node_page_ra(struct page *, int); -void sync_inode_page(struct dnode_of_data *); -int sync_node_pages(struct f2fs_sb_info *, nid_t, struct writeback_control *); +void move_node_page(struct page *, int); +int fsync_node_pages(struct f2fs_sb_info *, struct inode *, + struct writeback_control *, bool); +int sync_node_pages(struct f2fs_sb_info *, struct writeback_control *); +void build_free_nids(struct f2fs_sb_info *); bool alloc_nid(struct f2fs_sb_info *, nid_t *); void alloc_nid_done(struct f2fs_sb_info *, nid_t); void alloc_nid_failed(struct f2fs_sb_info *, nid_t); @@ -1843,6 +2042,7 @@ void destroy_segment_manager_caches(void); /* * checkpoint.c */ +void f2fs_stop_checkpoint(struct f2fs_sb_info *, bool); struct page *grab_meta_page(struct f2fs_sb_info *, pgoff_t); struct page *get_meta_page(struct f2fs_sb_info *, pgoff_t); struct page *get_tmp_page(struct f2fs_sb_info *, pgoff_t); @@ -1852,16 +2052,16 @@ void ra_meta_pages_cond(struct f2fs_sb_info *, pgoff_t); long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long); void add_ino_entry(struct f2fs_sb_info *, nid_t, int type); void remove_ino_entry(struct f2fs_sb_info *, nid_t, int type); -void release_ino_entry(struct f2fs_sb_info *); +void release_ino_entry(struct f2fs_sb_info *, bool); bool exist_written_data(struct f2fs_sb_info *, nid_t, int); +int f2fs_sync_inode_meta(struct f2fs_sb_info *); int acquire_orphan_inode(struct f2fs_sb_info *); void release_orphan_inode(struct f2fs_sb_info *); -void add_orphan_inode(struct f2fs_sb_info *, nid_t); +void add_orphan_inode(struct inode *); void remove_orphan_inode(struct f2fs_sb_info *, nid_t); int recover_orphan_inodes(struct f2fs_sb_info *); int get_valid_checkpoint(struct f2fs_sb_info *); void update_dirty_page(struct inode *, struct page *); -void add_dirty_dir_inode(struct inode *); void remove_dirty_inode(struct inode *); int sync_dirty_inodes(struct f2fs_sb_info *, enum inode_type); int write_checkpoint(struct f2fs_sb_info *, struct cp_control *); @@ -1880,6 +2080,7 @@ int f2fs_submit_page_bio(struct f2fs_io_info *); void f2fs_submit_page_mbio(struct f2fs_io_info *); void set_data_blkaddr(struct dnode_of_data *); void f2fs_update_data_blkaddr(struct dnode_of_data *, block_t); +int reserve_new_blocks(struct dnode_of_data *, blkcnt_t); int reserve_new_block(struct dnode_of_data *); int f2fs_get_block(struct dnode_of_data *, pgoff_t); ssize_t f2fs_preallocate_blocks(struct kiocb *, struct iov_iter *); @@ -1891,6 +2092,7 @@ struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool); int do_write_data_page(struct f2fs_io_info 
*); int f2fs_map_blocks(struct inode *, struct f2fs_map_blocks *, int, int); int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *, u64, u64); +void f2fs_set_page_dirty_nobuffers(struct page *); void f2fs_invalidate_page(struct page *, unsigned int, unsigned int); int f2fs_release_page(struct page *, gfp_t); @@ -1906,7 +2108,7 @@ void build_gc_manager(struct f2fs_sb_info *); /* * recovery.c */ -int recover_fsync_data(struct f2fs_sb_info *); +int recover_fsync_data(struct f2fs_sb_info *, bool); bool space_for_roll_forward(struct f2fs_sb_info *); /* @@ -1921,12 +2123,12 @@ struct f2fs_stat_info { unsigned long long hit_largest, hit_cached, hit_rbtree; unsigned long long hit_total, total_ext; int ext_tree, zombie_tree, ext_node; - int ndirty_node, ndirty_meta; - int ndirty_dent, ndirty_dirs, ndirty_data, ndirty_files; + s64 ndirty_node, ndirty_dent, ndirty_meta, ndirty_data, inmem_pages; + unsigned int ndirty_dirs, ndirty_files, ndirty_all; int nats, dirty_nats, sits, dirty_sits, fnids; int total_count, utilization; - int bg_gc, inmem_pages, wb_pages; - int inline_xattr, inline_inode, inline_dir; + int bg_gc, wb_bios; + int inline_xattr, inline_inode, inline_dir, orphans; unsigned int valid_count, valid_node_count, valid_inode_count; unsigned int bimodal, avg_vblocks; int util_free, util_valid, util_invalid; @@ -2091,7 +2293,6 @@ int f2fs_write_inline_data(struct inode *, struct page *); bool recover_inline_data(struct inode *, struct page *); struct f2fs_dir_entry *find_in_inline_dir(struct inode *, struct fscrypt_name *, struct page **); -struct f2fs_dir_entry *f2fs_parent_inline_dir(struct inode *, struct page **); int make_empty_inline_dir(struct inode *inode, struct inode *, struct page *); int f2fs_add_inline_entry(struct inode *, const struct qstr *, struct inode *, nid_t, umode_t); @@ -2116,6 +2317,7 @@ void f2fs_leave_shrinker(struct f2fs_sb_info *); */ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *, int); bool f2fs_init_extent_tree(struct inode *, struct f2fs_extent *); +void f2fs_drop_extent_tree(struct inode *); unsigned int f2fs_destroy_extent_node(struct inode *); void f2fs_destroy_extent_tree(struct inode *); bool f2fs_lookup_extent_cache(struct inode *, pgoff_t, struct extent_info *); @@ -2151,6 +2353,26 @@ static inline int f2fs_sb_has_crypto(struct super_block *sb) return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_ENCRYPT); } +static inline int f2fs_sb_mounted_hmsmr(struct super_block *sb) +{ + return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_HMSMR); +} + +static inline void set_opt_mode(struct f2fs_sb_info *sbi, unsigned int mt) +{ + clear_opt(sbi, ADAPTIVE); + clear_opt(sbi, LFS); + + switch (mt) { + case F2FS_MOUNT_ADAPTIVE: + set_opt(sbi, ADAPTIVE); + break; + case F2FS_MOUNT_LFS: + set_opt(sbi, LFS); + break; + } +} + static inline bool f2fs_may_encrypt(struct inode *inode) { #ifdef CONFIG_F2FS_FS_ENCRYPTION diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index eb9d027e5981..0e493f63ea41 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -20,7 +20,8 @@ #include <linux/uaccess.h> #include <linux/mount.h> #include <linux/pagevec.h> -#include <linux/random.h> +#include <linux/uuid.h> +#include <linux/file.h> #include "f2fs.h" #include "node.h" @@ -81,7 +82,8 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, zero_user_segment(page, offset, PAGE_SIZE); } set_page_dirty(page); - SetPageUptodate(page); + if (!PageUptodate(page)) + SetPageUptodate(page); trace_f2fs_vm_page_mkwrite(page, DATA); mapped: @@ -171,21 +173,16 @@ static void try_to_fix_pino(struct 
inode *inode) fi->xattr_ver = 0; if (file_wrong_pino(inode) && inode->i_nlink == 1 && get_parent_ino(inode, &pino)) { - fi->i_pino = pino; + f2fs_i_pino_write(inode, pino); file_got_pino(inode); - up_write(&fi->i_sem); - - mark_inode_dirty_sync(inode); - f2fs_write_inode(inode, NULL); - } else { - up_write(&fi->i_sem); } + up_write(&fi->i_sem); } -int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) +static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, + int datasync, bool atomic) { struct inode *inode = file->f_mapping->host; - struct f2fs_inode_info *fi = F2FS_I(inode); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); nid_t ino = inode->i_ino; int ret = 0; @@ -203,9 +200,9 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) /* if fdatasync is triggered, let's do in-place-update */ if (datasync || get_dirty_pages(inode) <= SM_I(sbi)->min_fsync_blocks) - set_inode_flag(fi, FI_NEED_IPU); + set_inode_flag(inode, FI_NEED_IPU); ret = filemap_write_and_wait_range(inode->i_mapping, start, end); - clear_inode_flag(fi, FI_NEED_IPU); + clear_inode_flag(inode, FI_NEED_IPU); if (ret) { trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret); @@ -213,7 +210,7 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) } /* if the inode is dirty, let's recover all the time */ - if (!datasync) { + if (!datasync && !f2fs_skip_inode_update(inode)) { f2fs_write_inode(inode, NULL); goto go_write; } @@ -221,14 +218,14 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) /* * if there is no written data, don't waste time writing recovery info. */ - if (!is_inode_flag_set(fi, FI_APPEND_WRITE) && + if (!is_inode_flag_set(inode, FI_APPEND_WRITE) && !exist_written_data(sbi, ino, APPEND_INO)) { /* it may call write_inode just prior to fsync */ if (need_inode_page_update(sbi, ino)) goto go_write; - if (is_inode_flag_set(fi, FI_UPDATE_WRITE) || + if (is_inode_flag_set(inode, FI_UPDATE_WRITE) || exist_written_data(sbi, ino, UPDATE_INO)) goto flush_out; goto out; @@ -238,9 +235,9 @@ go_write: * Both fdatasync() and fsync() can be recovered from a * sudden power-off. */ - down_read(&fi->i_sem); + down_read(&F2FS_I(inode)->i_sem); need_cp = need_do_checkpoint(inode); - up_read(&fi->i_sem); + up_read(&F2FS_I(inode)->i_sem); if (need_cp) { /* all the dirty node pages should be flushed for POR */ @@ -251,12 +248,14 @@ go_write: * will be used only for fsynced inodes after checkpoint. */ try_to_fix_pino(inode); - clear_inode_flag(fi, FI_APPEND_WRITE); - clear_inode_flag(fi, FI_UPDATE_WRITE); + clear_inode_flag(inode, FI_APPEND_WRITE); + clear_inode_flag(inode, FI_UPDATE_WRITE); goto out; } sync_nodes: - sync_node_pages(sbi, ino, &wbc); + ret = fsync_node_pages(sbi, inode, &wbc, atomic); + if (ret) + goto out; /* if cp_error was enabled, we should avoid infinite loop */ if (unlikely(f2fs_cp_error(sbi))) { @@ -265,7 +264,7 @@ sync_nodes: } if (need_inode_block_update(sbi, ino)) { - mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode); f2fs_write_inode(inode, NULL); goto sync_nodes; } @@ -276,10 +275,10 @@ sync_nodes: /* once recovery info is written, don't need to track this */ remove_ino_entry(sbi, ino, APPEND_INO); - clear_inode_flag(fi, FI_APPEND_WRITE); + clear_inode_flag(inode, FI_APPEND_WRITE); flush_out: remove_ino_entry(sbi, ino, UPDATE_INO); - clear_inode_flag(fi, FI_UPDATE_WRITE); + clear_inode_flag(inode, FI_UPDATE_WRITE); ret = f2fs_issue_flush(sbi); f2fs_update_time(sbi, REQ_TIME); out: @@ -288,6 +287,11 @@ out: return ret; } +int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) +{ + return f2fs_do_sync_file(file, start, end, datasync, false); +} + static pgoff_t __get_first_dirty_index(struct address_space *mapping, pgoff_t pgofs, int whence) { @@ -352,7 +356,7 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) for (; data_ofs < isize; data_ofs = (loff_t)pgofs << PAGE_SHIFT) { set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, pgofs, LOOKUP_NODE_RA); + err = get_dnode_of_data(&dn, pgofs, LOOKUP_NODE); if (err && err != -ENOENT) { goto fail; } else if (err == -ENOENT) { @@ -479,8 +483,7 @@ int truncate_data_blocks_range(struct dnode_of_data *dn, int count) set_data_blkaddr(dn); invalidate_blocks(sbi, blkaddr); if (dn->ofs_in_node == 0 && IS_INODE(dn->node_page)) - clear_inode_flag(F2FS_I(dn->inode), - FI_FIRST_BLOCK_WRITTEN); + clear_inode_flag(dn->inode, FI_FIRST_BLOCK_WRITTEN); nr_free++; } @@ -494,7 +497,6 @@ int truncate_data_blocks_range(struct dnode_of_data *dn, int count) dn->inode) + ofs; f2fs_update_extent_cache_range(dn, fofs, 0, len); dec_valid_block_count(sbi, dn->inode, nr_free); - sync_inode_page(dn); } dn->ofs_in_node = ofs; @@ -555,6 +557,9 @@ int truncate_blocks(struct inode *inode, u64 from, bool lock) free_from = (pgoff_t)F2FS_BYTES_TO_BLK(from + blocksize - 1); + if (free_from >= sbi->max_file_blocks) + goto free_partial; + if (lock) f2fs_lock_op(sbi); @@ -573,7 +578,7 @@ int truncate_blocks(struct inode *inode, u64 from, bool lock) } set_new_dnode(&dn, inode, ipage, NULL, 0); - err = get_dnode_of_data(&dn, free_from, LOOKUP_NODE); + err = get_dnode_of_data(&dn, free_from, LOOKUP_NODE_RA); if (err) { if (err == -ENOENT) goto free_next; @@ -596,7 +601,7 @@ free_next: out: if (lock) f2fs_unlock_op(sbi); - +free_partial: /* lastly zero out the first data page */ if (!err) err = truncate_partial_data_page(inode, from, truncate_page); @@ -605,7 +610,7 @@ out: return err; } -int f2fs_truncate(struct inode *inode, bool lock) +int f2fs_truncate(struct inode *inode) { int err; @@ -622,12 +627,12 @@ int f2fs_truncate(struct inode *inode, bool lock) return err; } - err = truncate_blocks(inode, i_size_read(inode), lock); + err = truncate_blocks(inode, i_size_read(inode), true); if (err) return err; inode->i_mtime = inode->i_ctime = CURRENT_TIME; - mark_inode_dirty(inode); + f2fs_mark_inode_dirty_sync(inode); return 0; } @@ -643,7 +648,6 @@ int f2fs_getattr(struct 
vfsmount *mnt, #ifdef CONFIG_F2FS_FS_POSIX_ACL static void __setattr_copy(struct inode *inode, const struct iattr *attr) { - struct f2fs_inode_info *fi = F2FS_I(inode); unsigned int ia_valid = attr->ia_valid; if (ia_valid & ATTR_UID) @@ -664,7 +668,7 @@ static void __setattr_copy(struct inode *inode, const struct iattr *attr) if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) mode &= ~S_ISGID; - set_acl_inode(fi, mode); + set_acl_inode(inode, mode); } } #else @@ -674,7 +678,6 @@ static void __setattr_copy(struct inode *inode, const struct iattr *attr) int f2fs_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); - struct f2fs_inode_info *fi = F2FS_I(inode); int err; err = inode_change_ok(inode, attr); @@ -688,7 +691,7 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) if (attr->ia_size <= i_size_read(inode)) { truncate_setsize(inode, attr->ia_size); - err = f2fs_truncate(inode, true); + err = f2fs_truncate(inode); if (err) return err; f2fs_balance_fs(F2FS_I_SB(inode), true); @@ -713,13 +716,13 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) if (attr->ia_valid & ATTR_MODE) { err = posix_acl_chmod(inode, get_inode_mode(inode)); - if (err || is_inode_flag_set(fi, FI_ACL_MODE)) { - inode->i_mode = fi->i_acl_mode; - clear_inode_flag(fi, FI_ACL_MODE); + if (err || is_inode_flag_set(inode, FI_ACL_MODE)) { + inode->i_mode = F2FS_I(inode)->i_acl_mode; + clear_inode_flag(inode, FI_ACL_MODE); } } - mark_inode_dirty(inode); + f2fs_mark_inode_dirty_sync(inode); return err; } @@ -848,79 +851,199 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) return ret; } -static int __exchange_data_block(struct inode *inode, pgoff_t src, - pgoff_t dst, bool full) +static int __read_out_blkaddrs(struct inode *inode, block_t *blkaddr, + int *do_replace, pgoff_t off, pgoff_t len) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct dnode_of_data dn; - block_t new_addr; - bool do_replace = false; - int ret; + int ret, done, i; +next_dnode: set_new_dnode(&dn, inode, NULL, NULL, 0); - ret = get_dnode_of_data(&dn, src, LOOKUP_NODE_RA); + ret = get_dnode_of_data(&dn, off, LOOKUP_NODE_RA); if (ret && ret != -ENOENT) { return ret; } else if (ret == -ENOENT) { - new_addr = NULL_ADDR; - } else { - new_addr = dn.data_blkaddr; - if (!is_checkpointed_data(sbi, new_addr)) { + if (dn.max_level == 0) + return -ENOENT; + done = min((pgoff_t)ADDRS_PER_BLOCK - dn.ofs_in_node, len); + blkaddr += done; + do_replace += done; + goto next; + } + + done = min((pgoff_t)ADDRS_PER_PAGE(dn.node_page, inode) - + dn.ofs_in_node, len); + for (i = 0; i < done; i++, blkaddr++, do_replace++, dn.ofs_in_node++) { + *blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); + if (!is_checkpointed_data(sbi, *blkaddr)) { + + if (test_opt(sbi, LFS)) { + f2fs_put_dnode(&dn); + return -ENOTSUPP; + } + /* do not invalidate this block address */ f2fs_update_data_blkaddr(&dn, NULL_ADDR); - do_replace = true; + *do_replace = 1; } - f2fs_put_dnode(&dn); } + f2fs_put_dnode(&dn); +next: + len -= done; + off += done; + if (len) + goto next_dnode; + return 0; +} - if (new_addr == NULL_ADDR) - return full ? 
truncate_hole(inode, dst, dst + 1) : 0; +static int __roll_back_blkaddrs(struct inode *inode, block_t *blkaddr, + int *do_replace, pgoff_t off, int len) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct dnode_of_data dn; + int ret, i; - if (do_replace) { - struct page *ipage = get_node_page(sbi, inode->i_ino); - struct node_info ni; + for (i = 0; i < len; i++, do_replace++, blkaddr++) { + if (*do_replace == 0) + continue; - if (IS_ERR(ipage)) { - ret = PTR_ERR(ipage); - goto err_out; + set_new_dnode(&dn, inode, NULL, NULL, 0); + ret = get_dnode_of_data(&dn, off + i, LOOKUP_NODE_RA); + if (ret) { + dec_valid_block_count(sbi, inode, 1); + invalidate_blocks(sbi, *blkaddr); + } else { + f2fs_update_data_blkaddr(&dn, *blkaddr); } + f2fs_put_dnode(&dn); + } + return 0; +} - set_new_dnode(&dn, inode, ipage, NULL, 0); - ret = f2fs_reserve_block(&dn, dst); - if (ret) - goto err_out; +static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode, + block_t *blkaddr, int *do_replace, + pgoff_t src, pgoff_t dst, pgoff_t len, bool full) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(src_inode); + pgoff_t i = 0; + int ret; + + while (i < len) { + if (blkaddr[i] == NULL_ADDR && !full) { + i++; + continue; + } - truncate_data_blocks_range(&dn, 1); + if (do_replace[i] || blkaddr[i] == NULL_ADDR) { + struct dnode_of_data dn; + struct node_info ni; + size_t new_size; + pgoff_t ilen; - get_node_info(sbi, dn.nid, &ni); - f2fs_replace_block(sbi, &dn, dn.data_blkaddr, new_addr, - ni.version, true, false); - f2fs_put_dnode(&dn); - } else { - struct page *psrc, *pdst; + set_new_dnode(&dn, dst_inode, NULL, NULL, 0); + ret = get_dnode_of_data(&dn, dst + i, ALLOC_NODE); + if (ret) + return ret; + + get_node_info(sbi, dn.nid, &ni); + ilen = min((pgoff_t) + ADDRS_PER_PAGE(dn.node_page, dst_inode) - + dn.ofs_in_node, len - i); + do { + dn.data_blkaddr = datablock_addr(dn.node_page, + dn.ofs_in_node); + truncate_data_blocks_range(&dn, 1); + + if (do_replace[i]) { + f2fs_i_blocks_write(src_inode, + 1, false); + f2fs_i_blocks_write(dst_inode, + 1, true); + f2fs_replace_block(sbi, &dn, dn.data_blkaddr, + blkaddr[i], ni.version, true, false); + + do_replace[i] = 0; + } + dn.ofs_in_node++; + i++; + new_size = (dst + i) << PAGE_SHIFT; + if (dst_inode->i_size < new_size) + f2fs_i_size_write(dst_inode, new_size); + } while ((do_replace[i] || blkaddr[i] == NULL_ADDR) && --ilen); - psrc = get_lock_data_page(inode, src, true); - if (IS_ERR(psrc)) - return PTR_ERR(psrc); - pdst = get_new_data_page(inode, NULL, dst, true); - if (IS_ERR(pdst)) { + f2fs_put_dnode(&dn); + } else { + struct page *psrc, *pdst; + + psrc = get_lock_data_page(src_inode, src + i, true); + if (IS_ERR(psrc)) + return PTR_ERR(psrc); + pdst = get_new_data_page(dst_inode, NULL, dst + i, + true); + if (IS_ERR(pdst)) { + f2fs_put_page(psrc, 1); + return PTR_ERR(pdst); + } + f2fs_copy_page(psrc, pdst); + set_page_dirty(pdst); + f2fs_put_page(pdst, 1); f2fs_put_page(psrc, 1); - return PTR_ERR(pdst); - } - f2fs_copy_page(psrc, pdst); - set_page_dirty(pdst); - f2fs_put_page(pdst, 1); - f2fs_put_page(psrc, 1); - return truncate_hole(inode, src, src + 1); + ret = truncate_hole(src_inode, src + i, src + i + 1); + if (ret) + return ret; + i++; + } } return 0; +} -err_out: - if (!get_dnode_of_data(&dn, src, LOOKUP_NODE)) { - f2fs_update_data_blkaddr(&dn, new_addr); - f2fs_put_dnode(&dn); +static int __exchange_data_block(struct inode *src_inode, + struct inode *dst_inode, pgoff_t src, pgoff_t dst, + pgoff_t len, bool full) +{ + block_t *src_blkaddr; + int 
*do_replace; + pgoff_t olen; + int ret; + + while (len) { + olen = min((pgoff_t)4 * ADDRS_PER_BLOCK, len); + + src_blkaddr = f2fs_kvzalloc(sizeof(block_t) * olen, GFP_KERNEL); + if (!src_blkaddr) + return -ENOMEM; + + do_replace = f2fs_kvzalloc(sizeof(int) * olen, GFP_KERNEL); + if (!do_replace) { + kvfree(src_blkaddr); + return -ENOMEM; + } + + ret = __read_out_blkaddrs(src_inode, src_blkaddr, + do_replace, src, olen); + if (ret) + goto roll_back; + + ret = __clone_blkaddrs(src_inode, dst_inode, src_blkaddr, + do_replace, src, dst, olen, full); + if (ret) + goto roll_back; + + src += olen; + dst += olen; + len -= olen; + + kvfree(src_blkaddr); + kvfree(do_replace); } + return 0; + +roll_back: + __roll_back_blkaddrs(src_inode, src_blkaddr, do_replace, src, len); + kvfree(src_blkaddr); + kvfree(do_replace); return ret; } @@ -928,16 +1051,15 @@ static int f2fs_do_collapse(struct inode *inode, pgoff_t start, pgoff_t end) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); pgoff_t nrpages = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE; - int ret = 0; + int ret; - for (; end < nrpages; start++, end++) { - f2fs_balance_fs(sbi, true); - f2fs_lock_op(sbi); - ret = __exchange_data_block(inode, end, start, true); - f2fs_unlock_op(sbi); - if (ret) - break; - } + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); + + f2fs_drop_extent_tree(inode); + + ret = __exchange_data_block(inode, inode, end, start, nrpages - end, true); + f2fs_unlock_op(sbi); return ret; } @@ -981,7 +1103,50 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) ret = truncate_blocks(inode, new_size, true); if (!ret) - i_size_write(inode, new_size); + f2fs_i_size_write(inode, new_size); + + return ret; +} + +static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start, + pgoff_t end) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); + pgoff_t index = start; + unsigned int ofs_in_node = dn->ofs_in_node; + blkcnt_t count = 0; + int ret; + + for (; index < end; index++, dn->ofs_in_node++) { + if (datablock_addr(dn->node_page, dn->ofs_in_node) == NULL_ADDR) + count++; + } + + dn->ofs_in_node = ofs_in_node; + ret = reserve_new_blocks(dn, count); + if (ret) + return ret; + + dn->ofs_in_node = ofs_in_node; + for (index = start; index < end; index++, dn->ofs_in_node++) { + dn->data_blkaddr = + datablock_addr(dn->node_page, dn->ofs_in_node); + /* + * reserve_new_blocks will not guarantee entire block + * allocation. 
+ */ + if (dn->data_blkaddr == NULL_ADDR) { + ret = -ENOSPC; + break; + } + if (dn->data_blkaddr != NEW_ADDR) { + invalidate_blocks(sbi, dn->data_blkaddr); + dn->data_blkaddr = NEW_ADDR; + set_data_blkaddr(dn); + } + } + + f2fs_update_extent_cache_range(dn, start, 0, index - start); return ret; } @@ -1036,35 +1201,32 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, (loff_t)pg_start << PAGE_SHIFT); } - for (index = pg_start; index < pg_end; index++) { + for (index = pg_start; index < pg_end;) { struct dnode_of_data dn; - struct page *ipage; + unsigned int end_offset; + pgoff_t end; f2fs_lock_op(sbi); - ipage = get_node_page(sbi, inode->i_ino); - if (IS_ERR(ipage)) { - ret = PTR_ERR(ipage); - f2fs_unlock_op(sbi); - goto out; - } - - set_new_dnode(&dn, inode, ipage, NULL, 0); - ret = f2fs_reserve_block(&dn, index); + set_new_dnode(&dn, inode, NULL, NULL, 0); + ret = get_dnode_of_data(&dn, index, ALLOC_NODE); if (ret) { f2fs_unlock_op(sbi); goto out; } - if (dn.data_blkaddr != NEW_ADDR) { - invalidate_blocks(sbi, dn.data_blkaddr); - f2fs_update_data_blkaddr(&dn, NEW_ADDR); - } + end_offset = ADDRS_PER_PAGE(dn.node_page, inode); + end = min(pg_end, end_offset - dn.ofs_in_node + index); + + ret = f2fs_do_zero_range(&dn, index, end); f2fs_put_dnode(&dn); f2fs_unlock_op(sbi); + if (ret) + goto out; + index = end; new_size = max_t(loff_t, new_size, - (loff_t)(index + 1) << PAGE_SHIFT); + (loff_t)index << PAGE_SHIFT); } if (off_end) { @@ -1077,11 +1239,8 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, } out: - if (!(mode & FALLOC_FL_KEEP_SIZE) && i_size_read(inode) < new_size) { - i_size_write(inode, new_size); - mark_inode_dirty(inode); - update_inode_page(inode); - } + if (!(mode & FALLOC_FL_KEEP_SIZE) && i_size_read(inode) < new_size) + f2fs_i_size_write(inode, new_size); return ret; } @@ -1089,7 +1248,7 @@ out: static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - pgoff_t pg_start, pg_end, delta, nrpages, idx; + pgoff_t nr, pg_start, pg_end, delta, idx; loff_t new_size; int ret = 0; @@ -1124,14 +1283,20 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) pg_start = offset >> PAGE_SHIFT; pg_end = (offset + len) >> PAGE_SHIFT; delta = pg_end - pg_start; - nrpages = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE; + idx = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE; + + while (!ret && idx > pg_start) { + nr = idx - pg_start; + if (nr > delta) + nr = delta; + idx -= nr; - for (idx = nrpages - 1; idx >= pg_start && idx != -1; idx--) { f2fs_lock_op(sbi); - ret = __exchange_data_block(inode, idx, idx + delta, false); + f2fs_drop_extent_tree(inode); + + ret = __exchange_data_block(inode, inode, idx, + idx + delta, nr, false); f2fs_unlock_op(sbi); - if (ret) - break; } /* write out all moved pages, if possible */ @@ -1139,7 +1304,7 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) truncate_pagecache(inode, offset); if (!ret) - i_size_write(inode, new_size); + f2fs_i_size_write(inode, new_size); return ret; } @@ -1147,10 +1312,11 @@ static int expand_inode_data(struct inode *inode, loff_t offset, loff_t len, int mode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - pgoff_t index, pg_start, pg_end; + struct f2fs_map_blocks map = { .m_next_pgofs = NULL }; + pgoff_t pg_end; loff_t new_size = i_size_read(inode); - loff_t off_start, off_end; - int ret = 0; + loff_t off_end; + int ret; ret = inode_newsize_ok(inode, 
(len + offset)); if (ret) @@ -1162,43 +1328,32 @@ static int expand_inode_data(struct inode *inode, loff_t offset, f2fs_balance_fs(sbi, true); - pg_start = ((unsigned long long) offset) >> PAGE_SHIFT; - pg_end = ((unsigned long long) offset + len) >> PAGE_SHIFT; - - off_start = offset & (PAGE_SIZE - 1); + pg_end = ((unsigned long long)offset + len) >> PAGE_SHIFT; off_end = (offset + len) & (PAGE_SIZE - 1); - f2fs_lock_op(sbi); + map.m_lblk = ((unsigned long long)offset) >> PAGE_SHIFT; + map.m_len = pg_end - map.m_lblk; + if (off_end) + map.m_len++; - for (index = pg_start; index <= pg_end; index++) { - struct dnode_of_data dn; + ret = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO); + if (ret) { + pgoff_t last_off; - if (index == pg_end && !off_end) - goto noalloc; + if (!map.m_len) + return ret; - set_new_dnode(&dn, inode, NULL, NULL, 0); - ret = f2fs_reserve_block(&dn, index); - if (ret) - break; -noalloc: - if (pg_start == pg_end) - new_size = offset + len; - else if (index == pg_start && off_start) - new_size = (loff_t)(index + 1) << PAGE_SHIFT; - else if (index == pg_end) - new_size = ((loff_t)index << PAGE_SHIFT) + - off_end; - else - new_size += PAGE_SIZE; - } + last_off = map.m_lblk + map.m_len - 1; - if (!(mode & FALLOC_FL_KEEP_SIZE) && - i_size_read(inode) < new_size) { - i_size_write(inode, new_size); - mark_inode_dirty(inode); - update_inode_page(inode); + /* update new size to the failed position */ + new_size = (last_off == pg_end) ? offset + len: + (loff_t)(last_off + 1) << PAGE_SHIFT; + } else { + new_size = ((loff_t)pg_end << PAGE_SHIFT) + off_end; } - f2fs_unlock_op(sbi); + + if (!(mode & FALLOC_FL_KEEP_SIZE) && i_size_read(inode) < new_size) + f2fs_i_size_write(inode, new_size); return ret; } @@ -1241,7 +1396,7 @@ static long f2fs_fallocate(struct file *file, int mode, if (!ret) { inode->i_mtime = inode->i_ctime = CURRENT_TIME; - mark_inode_dirty(inode); + f2fs_mark_inode_dirty_sync(inode); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); } @@ -1254,13 +1409,22 @@ out: static int f2fs_release_file(struct inode *inode, struct file *filp) { + /* + * f2fs_release_file is called on every close. So we should + * not drop any in-memory pages due to a close by another process. 
+ */ + if (!(filp->f_mode & FMODE_WRITE) || + atomic_read(&inode->i_writecount) != 1) + return 0; + /* any remaining atomic pages should be discarded */ if (f2fs_is_atomic_file(inode)) drop_inmem_pages(inode); if (f2fs_is_volatile_file(inode)) { - set_inode_flag(F2FS_I(inode), FI_DROP_CACHE); + clear_inode_flag(inode, FI_VOLATILE_FILE); + set_inode_flag(inode, FI_DROP_CACHE); filemap_fdatawrite(inode->i_mapping); - clear_inode_flag(F2FS_I(inode), FI_DROP_CACHE); + clear_inode_flag(inode, FI_DROP_CACHE); } return 0; } @@ -1294,20 +1458,16 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg) unsigned int oldflags; int ret; + if (!inode_owner_or_capable(inode)) + return -EACCES; + + if (get_user(flags, (int __user *)arg)) + return -EFAULT; + ret = mnt_want_write_file(filp); if (ret) return ret; - if (!inode_owner_or_capable(inode)) { - ret = -EACCES; - goto out; - } - - if (get_user(flags, (int __user *)arg)) { - ret = -EFAULT; - goto out; - } - flags = f2fs_mask_flags(inode->i_mode, flags); inode_lock(inode); @@ -1327,9 +1487,8 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg) fi->i_flags = flags; inode_unlock(inode); - f2fs_set_inode_flags(inode); inode->i_ctime = CURRENT_TIME; - mark_inode_dirty(inode); + f2fs_set_inode_flags(inode); out: mnt_drop_write_file(filp); return ret; @@ -1350,17 +1509,35 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) if (!inode_owner_or_capable(inode)) return -EACCES; + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + inode_lock(inode); + if (f2fs_is_atomic_file(inode)) - return 0; + goto out; ret = f2fs_convert_inline_inode(inode); if (ret) - return ret; + goto out; - set_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); + set_inode_flag(inode, FI_ATOMIC_FILE); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); - return 0; + if (!get_dirty_pages(inode)) + goto out; + + f2fs_msg(F2FS_I_SB(inode)->sb, KERN_WARNING, + "Unexpected flush for atomic writes: ino=%lu, npages=%lld", + inode->i_ino, get_dirty_pages(inode)); + ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); + if (ret) + clear_inode_flag(inode, FI_ATOMIC_FILE); +out: + inode_unlock(inode); + mnt_drop_write_file(filp); + return ret; } static int f2fs_ioc_commit_atomic_write(struct file *filp) @@ -1371,24 +1548,27 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) if (!inode_owner_or_capable(inode)) return -EACCES; - if (f2fs_is_volatile_file(inode)) - return 0; - ret = mnt_want_write_file(filp); if (ret) return ret; + inode_lock(inode); + + if (f2fs_is_volatile_file(inode)) + goto err_out; + if (f2fs_is_atomic_file(inode)) { - clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); + clear_inode_flag(inode, FI_ATOMIC_FILE); ret = commit_inmem_pages(inode); if (ret) { - set_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); + set_inode_flag(inode, FI_ATOMIC_FILE); goto err_out; } } - ret = f2fs_sync_file(filp, 0, LLONG_MAX, 0); + ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); err_out: + inode_unlock(inode); mnt_drop_write_file(filp); return ret; } @@ -1401,32 +1581,54 @@ static int f2fs_ioc_start_volatile_write(struct file *filp) if (!inode_owner_or_capable(inode)) return -EACCES; + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + inode_lock(inode); + if (f2fs_is_volatile_file(inode)) - return 0; + goto out; ret = f2fs_convert_inline_inode(inode); if (ret) - return ret; + goto out; - set_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE); + set_inode_flag(inode, FI_VOLATILE_FILE); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); - 
return 0; +out: + inode_unlock(inode); + mnt_drop_write_file(filp); + return ret; } static int f2fs_ioc_release_volatile_write(struct file *filp) { struct inode *inode = file_inode(filp); + int ret; if (!inode_owner_or_capable(inode)) return -EACCES; + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + inode_lock(inode); + if (!f2fs_is_volatile_file(inode)) - return 0; + goto out; - if (!f2fs_is_first_block_written(inode)) - return truncate_partial_data_page(inode, 0, true); + if (!f2fs_is_first_block_written(inode)) { + ret = truncate_partial_data_page(inode, 0, true); + goto out; + } - return punch_hole(inode, 0, F2FS_BLKSIZE); + ret = punch_hole(inode, 0, F2FS_BLKSIZE); +out: + inode_unlock(inode); + mnt_drop_write_file(filp); + return ret; } static int f2fs_ioc_abort_volatile_write(struct file *filp) @@ -1441,15 +1643,17 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp) if (ret) return ret; - if (f2fs_is_atomic_file(inode)) { - clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); + inode_lock(inode); + + if (f2fs_is_atomic_file(inode)) drop_inmem_pages(inode); - } if (f2fs_is_volatile_file(inode)) { - clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE); - ret = f2fs_sync_file(filp, 0, LLONG_MAX, 0); + clear_inode_flag(inode, FI_VOLATILE_FILE); + ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); } + inode_unlock(inode); + mnt_drop_write_file(filp); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); return ret; @@ -1461,6 +1665,7 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct super_block *sb = sbi->sb; __u32 in; + int ret; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -1468,31 +1673,38 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) if (get_user(in, (__u32 __user *)arg)) return -EFAULT; + ret = mnt_want_write_file(filp); + if (ret) + return ret; + switch (in) { case F2FS_GOING_DOWN_FULLSYNC: sb = freeze_bdev(sb->s_bdev); if (sb && !IS_ERR(sb)) { - f2fs_stop_checkpoint(sbi); + f2fs_stop_checkpoint(sbi, false); thaw_bdev(sb->s_bdev, sb); } break; case F2FS_GOING_DOWN_METASYNC: /* do checkpoint only */ f2fs_sync_fs(sb, 1); - f2fs_stop_checkpoint(sbi); + f2fs_stop_checkpoint(sbi, false); break; case F2FS_GOING_DOWN_NOSYNC: - f2fs_stop_checkpoint(sbi); + f2fs_stop_checkpoint(sbi, false); break; case F2FS_GOING_DOWN_METAFLUSH: sync_meta_pages(sbi, META, LONG_MAX); - f2fs_stop_checkpoint(sbi); + f2fs_stop_checkpoint(sbi, false); break; default: - return -EINVAL; + ret = -EINVAL; + goto out; } f2fs_update_time(sbi, REQ_TIME); - return 0; +out: + mnt_drop_write_file(filp); + return ret; } static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg) @@ -1513,9 +1725,14 @@ static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg) sizeof(range))) return -EFAULT; + ret = mnt_want_write_file(filp); + if (ret) + return ret; + range.minlen = max((unsigned int)range.minlen, q->limits.discard_granularity); ret = f2fs_trim_fs(F2FS_SB(sb), &range); + mnt_drop_write_file(filp); if (ret < 0) return ret; @@ -1540,13 +1757,21 @@ static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg) { struct fscrypt_policy policy; struct inode *inode = file_inode(filp); + int ret; if (copy_from_user(&policy, (struct fscrypt_policy __user *)arg, sizeof(policy))) return -EFAULT; + ret = mnt_want_write_file(filp); + if (ret) + return ret; + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); - return fscrypt_process_policy(inode, &policy); + ret = fscrypt_process_policy(inode, &policy); + 
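
A pattern worth noting in this stretch: nearly every ioctl touched here (shutdown and FITRIM above, GC and write-checkpoint just below, plus the encryption-policy handler this lands inside) now takes a write reference on the mount before doing any work and drops it on every exit path, so the operation cannot race with a read-only remount or an in-progress freeze. A minimal sketch of that bracket, with a purely hypothetical handler name:

static int f2fs_ioc_example(struct file *filp)	/* illustrative name, not from the patch */
{
	struct inode *inode = file_inode(filp);
	int ret;

	ret = mnt_want_write_file(filp);	/* fails if r/o, waits out a freeze */
	if (ret)
		return ret;

	inode_lock(inode);
	/* ... the ioctl's actual work, setting ret ... */
	inode_unlock(inode);

	mnt_drop_write_file(filp);		/* paired on every path out */
	return ret;
}
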
+ mnt_drop_write_file(filp); + return ret; } static int f2fs_ioc_get_encryption_policy(struct file *filp, unsigned long arg) @@ -1603,6 +1828,7 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg) struct inode *inode = file_inode(filp); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); __u32 sync; + int ret; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -1613,20 +1839,30 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg) if (f2fs_readonly(sbi->sb)) return -EROFS; + ret = mnt_want_write_file(filp); + if (ret) + return ret; + if (!sync) { - if (!mutex_trylock(&sbi->gc_mutex)) - return -EBUSY; + if (!mutex_trylock(&sbi->gc_mutex)) { + ret = -EBUSY; + goto out; + } } else { mutex_lock(&sbi->gc_mutex); } - return f2fs_gc(sbi, sync); + ret = f2fs_gc(sbi, sync); +out: + mnt_drop_write_file(filp); + return ret; } static int f2fs_ioc_write_checkpoint(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + int ret; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -1634,7 +1870,14 @@ static int f2fs_ioc_write_checkpoint(struct file *filp, unsigned long arg) if (f2fs_readonly(sbi->sb)) return -EROFS; - return f2fs_sync_fs(sbi->sb, 1); + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + ret = f2fs_sync_fs(sbi->sb, 1); + + mnt_drop_write_file(filp); + return ret; } static int f2fs_defragment_range(struct f2fs_sb_info *sbi, @@ -1738,7 +1981,7 @@ do_map: continue; } - set_inode_flag(F2FS_I(inode), FI_DO_DEFRAG); + set_inode_flag(inode, FI_DO_DEFRAG); idx = map.m_lblk; while (idx < map.m_lblk + map.m_len && cnt < blk_per_seg) { @@ -1763,14 +2006,14 @@ do_map: if (idx < pg_end && cnt < blk_per_seg) goto do_map; - clear_inode_flag(F2FS_I(inode), FI_DO_DEFRAG); + clear_inode_flag(inode, FI_DO_DEFRAG); err = filemap_fdatawrite(inode->i_mapping); if (err) goto out; } clear_out: - clear_inode_flag(F2FS_I(inode), FI_DO_DEFRAG); + clear_inode_flag(inode, FI_DO_DEFRAG); out: inode_unlock(inode); if (!err) @@ -1826,6 +2069,133 @@ out: return err; } +static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, size_t len) +{ + struct inode *src = file_inode(file_in); + struct inode *dst = file_inode(file_out); + struct f2fs_sb_info *sbi = F2FS_I_SB(src); + size_t olen = len, dst_max_i_size = 0; + size_t dst_osize; + int ret; + + if (file_in->f_path.mnt != file_out->f_path.mnt || + src->i_sb != dst->i_sb) + return -EXDEV; + + if (unlikely(f2fs_readonly(src->i_sb))) + return -EROFS; + + if (S_ISDIR(src->i_mode) || S_ISDIR(dst->i_mode)) + return -EISDIR; + + if (f2fs_encrypted_inode(src) || f2fs_encrypted_inode(dst)) + return -EOPNOTSUPP; + + inode_lock(src); + if (src != dst) + inode_lock(dst); + + ret = -EINVAL; + if (pos_in + len > src->i_size || pos_in + len < pos_in) + goto out_unlock; + if (len == 0) + olen = len = src->i_size - pos_in; + if (pos_in + len == src->i_size) + len = ALIGN(src->i_size, F2FS_BLKSIZE) - pos_in; + if (len == 0) { + ret = 0; + goto out_unlock; + } + + dst_osize = dst->i_size; + if (pos_out + olen > dst->i_size) + dst_max_i_size = pos_out + olen; + + /* verify the end result is block aligned */ + if (!IS_ALIGNED(pos_in, F2FS_BLKSIZE) || + !IS_ALIGNED(pos_in + len, F2FS_BLKSIZE) || + !IS_ALIGNED(pos_out, F2FS_BLKSIZE)) + goto out_unlock; + + ret = f2fs_convert_inline_inode(src); + if (ret) + goto out_unlock; + + ret = f2fs_convert_inline_inode(dst); + if (ret) + goto out_unlock; + + /* write out all dirty pages from offset */ + ret = 
filemap_write_and_wait_range(src->i_mapping, + pos_in, pos_in + len); + if (ret) + goto out_unlock; + + ret = filemap_write_and_wait_range(dst->i_mapping, + pos_out, pos_out + len); + if (ret) + goto out_unlock; + + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); + ret = __exchange_data_block(src, dst, pos_in, + pos_out, len >> F2FS_BLKSIZE_BITS, false); + + if (!ret) { + if (dst_max_i_size) + f2fs_i_size_write(dst, dst_max_i_size); + else if (dst_osize != dst->i_size) + f2fs_i_size_write(dst, dst_osize); + } + f2fs_unlock_op(sbi); +out_unlock: + if (src != dst) + inode_unlock(dst); + inode_unlock(src); + return ret; +} + +static int f2fs_ioc_move_range(struct file *filp, unsigned long arg) +{ + struct f2fs_move_range range; + struct fd dst; + int err; + + if (!(filp->f_mode & FMODE_READ) || + !(filp->f_mode & FMODE_WRITE)) + return -EBADF; + + if (copy_from_user(&range, (struct f2fs_move_range __user *)arg, + sizeof(range))) + return -EFAULT; + + dst = fdget(range.dst_fd); + if (!dst.file) + return -EBADF; + + if (!(dst.file->f_mode & FMODE_WRITE)) { + err = -EBADF; + goto err_out; + } + + err = mnt_want_write_file(filp); + if (err) + goto err_out; + + err = f2fs_move_file_range(filp, range.pos_in, dst.file, + range.pos_out, range.len); + + mnt_drop_write_file(filp); + + if (copy_to_user((struct f2fs_move_range __user *)arg, + &range, sizeof(range))) + err = -EFAULT; +err_out: + fdput(dst); + return err; +} + long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { switch (cmd) { @@ -1861,6 +2231,8 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return f2fs_ioc_write_checkpoint(filp, arg); case F2FS_IOC_DEFRAGMENT: return f2fs_ioc_defragment(filp, arg); + case F2FS_IOC_MOVE_RANGE: + return f2fs_ioc_move_range(filp, arg); default: return -ENOTTY; } @@ -1870,6 +2242,7 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); + struct blk_plug plug; ssize_t ret; if (f2fs_encrypted_inode(inode) && @@ -1881,8 +2254,11 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) ret = generic_write_checks(iocb, from); if (ret > 0) { ret = f2fs_preallocate_blocks(iocb, from); - if (!ret) + if (!ret) { + blk_start_plug(&plug); ret = __generic_file_write_iter(iocb, from); + blk_finish_plug(&plug); + } } inode_unlock(inode); @@ -1917,6 +2293,8 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case F2FS_IOC_WRITE_CHECKPOINT: case F2FS_IOC_DEFRAGMENT: break; + case F2FS_IOC_MOVE_RANGE: + break; default: return -ENOIOCTLCMD; } diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index b0051a97824c..8f7fa326ce95 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -96,7 +96,7 @@ int start_gc_thread(struct f2fs_sb_info *sbi) dev_t dev = sbi->sb->s_bdev->bd_dev; int err = 0; - gc_th = kmalloc(sizeof(struct f2fs_gc_kthread), GFP_KERNEL); + gc_th = f2fs_kmalloc(sizeof(struct f2fs_gc_kthread), GFP_KERNEL); if (!gc_th) { err = -ENOMEM; goto out; @@ -465,15 +465,7 @@ next_step: continue; } - /* set page dirty and write it */ - if (gc_type == FG_GC) { - f2fs_wait_on_page_writeback(node_page, NODE, true); - set_page_dirty(node_page); - } else { - if (!PageWriteback(node_page)) - set_page_dirty(node_page); - } - f2fs_put_page(node_page, 1); + move_node_page(node_page, gc_type); stat_inc_node_blk_count(sbi, 1, gc_type); } @@ -546,7 +538,8 @@ static void move_encrypted_block(struct inode *inode, block_t bidx) struct f2fs_io_info fio = 
{ .sbi = F2FS_I_SB(inode), .type = DATA, - .rw = READ_SYNC, + .op = REQ_OP_READ, + .op_flags = READ_SYNC, .encrypted_page = NULL, }; struct dnode_of_data dn; @@ -601,11 +594,11 @@ static void move_encrypted_block(struct inode *inode, block_t bidx) /* write page */ lock_page(fio.encrypted_page); - if (unlikely(!PageUptodate(fio.encrypted_page))) { + if (unlikely(fio.encrypted_page->mapping != META_MAPPING(fio.sbi))) { err = -EIO; goto put_page_out; } - if (unlikely(fio.encrypted_page->mapping != META_MAPPING(fio.sbi))) { + if (unlikely(!PageUptodate(fio.encrypted_page))) { err = -EIO; goto put_page_out; } @@ -620,14 +613,15 @@ static void move_encrypted_block(struct inode *inode, block_t bidx) /* allocate block address */ f2fs_wait_on_page_writeback(dn.node_page, NODE, true); - fio.rw = WRITE_SYNC; + fio.op = REQ_OP_WRITE; + fio.op_flags = WRITE_SYNC; fio.new_blkaddr = newaddr; f2fs_submit_page_mbio(&fio); f2fs_update_data_blkaddr(&dn, newaddr); - set_inode_flag(F2FS_I(inode), FI_APPEND_WRITE); + set_inode_flag(inode, FI_APPEND_WRITE); if (page->index == 0) - set_inode_flag(F2FS_I(inode), FI_FIRST_BLOCK_WRITTEN); + set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN); put_page_out: f2fs_put_page(fio.encrypted_page, 1); recover_block: @@ -657,16 +651,28 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type) struct f2fs_io_info fio = { .sbi = F2FS_I_SB(inode), .type = DATA, - .rw = WRITE_SYNC, + .op = REQ_OP_WRITE, + .op_flags = WRITE_SYNC, .page = page, .encrypted_page = NULL, }; + bool is_dirty = PageDirty(page); + int err; + +retry: set_page_dirty(page); f2fs_wait_on_page_writeback(page, DATA, true); if (clear_page_dirty_for_io(page)) inode_dec_dirty_pages(inode); + set_cold_data(page); - do_write_data_page(&fio); + + err = do_write_data_page(&fio); + if (err == -ENOMEM && is_dirty) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto retry; + } + clear_cold_data(page); } out: @@ -738,7 +744,8 @@ next_step: start_bidx = start_bidx_of_node(nofs, inode); data_page = get_read_data_page(inode, - start_bidx + ofs_in_node, READA, true); + start_bidx + ofs_in_node, REQ_RAHEAD, + true); if (IS_ERR(data_page)) { iput(inode); continue; @@ -752,12 +759,32 @@ next_step: /* phase 3 */ inode = find_gc_inode(gc_list, dni.ino); if (inode) { + struct f2fs_inode_info *fi = F2FS_I(inode); + bool locked = false; + + if (S_ISREG(inode->i_mode)) { + if (!down_write_trylock(&fi->dio_rwsem[READ])) + continue; + if (!down_write_trylock( + &fi->dio_rwsem[WRITE])) { + up_write(&fi->dio_rwsem[READ]); + continue; + } + locked = true; + } + start_bidx = start_bidx_of_node(nofs, inode) + ofs_in_node; if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) move_encrypted_block(inode, start_bidx); else move_data_page(inode, start_bidx, gc_type); + + if (locked) { + up_write(&fi->dio_rwsem[WRITE]); + up_write(&fi->dio_rwsem[READ]); + } + stat_inc_data_blk_count(sbi, 1, gc_type); } } @@ -806,6 +833,10 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, blk_start_plug(&plug); for (segno = start_segno; segno < end_segno; segno++) { + + if (get_valid_blocks(sbi, segno, 1) == 0) + continue; + /* find segment summary of victim */ sum_page = find_get_page(META_MAPPING(sbi), GET_SUM_BLOCK(sbi, segno)); @@ -834,18 +865,9 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, f2fs_put_page(sum_page, 0); } - if (gc_type == FG_GC) { - if (type == SUM_TYPE_NODE) { - struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, - .nr_to_write = LONG_MAX, - .for_reclaim = 0, - }; - sync_node_pages(sbi, 0, &wbc); 
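
The phase-3 hunk above also teaches data GC to step around files that are in the middle of direct I/O: GC may not sleep on dio_rwsem, so both halves are taken with trylock and the first is released again if the second cannot be acquired. Roughly, as a sketch (the helper name is ours, not the patch's):

static bool gc_trylock_dio(struct f2fs_inode_info *fi)	/* illustrative helper */
{
	if (!down_write_trylock(&fi->dio_rwsem[READ]))
		return false;			/* busy: skip this inode for now */
	if (!down_write_trylock(&fi->dio_rwsem[WRITE])) {
		up_write(&fi->dio_rwsem[READ]);	/* back out in reverse order */
		return false;
	}
	return true;	/* caller later releases WRITE, then READ */
}

Taking the trylocks in a fixed order and unwinding on partial success is what keeps GC from deadlocking against a writer that acquires the two semaphores in the same order.
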
- } else { - f2fs_submit_merged_bio(sbi, DATA, WRITE); - } - } + if (gc_type == FG_GC) + f2fs_submit_merged_bio(sbi, + (type == SUM_TYPE_NODE) ? NODE : DATA, WRITE); blk_finish_plug(&plug); @@ -890,10 +912,13 @@ gc_more: * enough free sections, we should flush dent/node blocks and do * garbage collections. */ - if (__get_victim(sbi, &segno, gc_type) || prefree_segments(sbi)) + if (__get_victim(sbi, &segno, gc_type) || + prefree_segments(sbi)) { write_checkpoint(sbi, &cpc); - else if (has_not_enough_free_secs(sbi, 0)) + segno = NULL_SEGNO; + } else if (has_not_enough_free_secs(sbi, 0)) { write_checkpoint(sbi, &cpc); + } } if (segno == NULL_SEGNO && !__get_victim(sbi, &segno, gc_type)) diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index a2fbe6f427d3..ccea8735de59 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -59,7 +59,8 @@ void read_inline_data(struct page *page, struct page *ipage) memcpy(dst_addr, src_addr, MAX_INLINE_DATA); flush_dcache_page(page); kunmap_atomic(dst_addr); - SetPageUptodate(page); + if (!PageUptodate(page)) + SetPageUptodate(page); } bool truncate_inline_inode(struct page *ipage, u64 from) @@ -73,7 +74,7 @@ bool truncate_inline_inode(struct page *ipage, u64 from) f2fs_wait_on_page_writeback(ipage, NODE, true); memset(addr + from, 0, MAX_INLINE_DATA - from); - + set_page_dirty(ipage); return true; } @@ -97,7 +98,8 @@ int f2fs_read_inline_data(struct inode *inode, struct page *page) else read_inline_data(page, ipage); - SetPageUptodate(page); + if (!PageUptodate(page)) + SetPageUptodate(page); f2fs_put_page(ipage, 1); unlock_page(page); return 0; @@ -108,7 +110,8 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) struct f2fs_io_info fio = { .sbi = F2FS_I_SB(dn->inode), .type = DATA, - .rw = WRITE_SYNC | REQ_PRIO, + .op = REQ_OP_WRITE, + .op_flags = WRITE_SYNC | REQ_PRIO, .page = page, .encrypted_page = NULL, }; @@ -138,7 +141,7 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) inode_dec_dirty_pages(dn->inode); /* this converted inline_data should be recovered. 
*/ - set_inode_flag(F2FS_I(dn->inode), FI_APPEND_WRITE); + set_inode_flag(dn->inode, FI_APPEND_WRITE); /* clear inline data and flag after data writeback */ truncate_inline_inode(dn->inode_page, 0); @@ -146,7 +149,6 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) clear_out: stat_dec_inline_inode(dn->inode); f2fs_clear_inline_inode(dn->inode); - sync_inode_page(dn); f2fs_put_dnode(dn); return 0; } @@ -161,7 +163,7 @@ int f2fs_convert_inline_inode(struct inode *inode) if (!f2fs_has_inline_data(inode)) return 0; - page = grab_cache_page(inode->i_mapping, 0); + page = f2fs_grab_cache_page(inode->i_mapping, 0, false); if (!page) return -ENOMEM; @@ -212,11 +214,11 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page) dst_addr = inline_data_addr(dn.inode_page); memcpy(dst_addr, src_addr, MAX_INLINE_DATA); kunmap_atomic(src_addr); + set_page_dirty(dn.inode_page); - set_inode_flag(F2FS_I(inode), FI_APPEND_WRITE); - set_inode_flag(F2FS_I(inode), FI_DATA_EXIST); + set_inode_flag(inode, FI_APPEND_WRITE); + set_inode_flag(inode, FI_DATA_EXIST); - sync_inode_page(&dn); clear_inline_node(dn.inode_page); f2fs_put_dnode(&dn); return 0; @@ -252,10 +254,10 @@ process_inline: dst_addr = inline_data_addr(ipage); memcpy(dst_addr, src_addr, MAX_INLINE_DATA); - set_inode_flag(F2FS_I(inode), FI_INLINE_DATA); - set_inode_flag(F2FS_I(inode), FI_DATA_EXIST); + set_inode_flag(inode, FI_INLINE_DATA); + set_inode_flag(inode, FI_DATA_EXIST); - update_inode(inode, ipage); + set_page_dirty(ipage); f2fs_put_page(ipage, 1); return true; } @@ -266,7 +268,6 @@ process_inline: if (!truncate_inline_inode(ipage, 0)) return false; f2fs_clear_inline_inode(inode); - update_inode(inode, ipage); f2fs_put_page(ipage, 1); } else if (ri && (ri->i_inline & F2FS_INLINE_DATA)) { if (truncate_blocks(inode, 0, false)) @@ -288,8 +289,10 @@ struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, f2fs_hash_t namehash; ipage = get_node_page(sbi, dir->i_ino); - if (IS_ERR(ipage)) + if (IS_ERR(ipage)) { + *res_page = ipage; return NULL; + } namehash = f2fs_dentry_hash(&name); @@ -303,30 +306,6 @@ struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, else f2fs_put_page(ipage, 0); - /* - * For the most part, it should be a bug when name_len is zero. - * We stop here for figuring out where the bugs has occurred. - */ - f2fs_bug_on(sbi, d.max < 0); - return de; -} - -struct f2fs_dir_entry *f2fs_parent_inline_dir(struct inode *dir, - struct page **p) -{ - struct f2fs_sb_info *sbi = F2FS_I_SB(dir); - struct page *ipage; - struct f2fs_dir_entry *de; - struct f2fs_inline_dentry *dentry_blk; - - ipage = get_node_page(sbi, dir->i_ino); - if (IS_ERR(ipage)) - return NULL; - - dentry_blk = inline_data_addr(ipage); - de = &dentry_blk->dentry[1]; - *p = ipage; - unlock_page(ipage); return de; } @@ -344,10 +323,8 @@ int make_empty_inline_dir(struct inode *inode, struct inode *parent, set_page_dirty(ipage); /* update i_size to MAX_INLINE_DATA */ - if (i_size_read(inode) < MAX_INLINE_DATA) { - i_size_write(inode, MAX_INLINE_DATA); - set_inode_flag(F2FS_I(inode), FI_UPDATE_DIR); - } + if (i_size_read(inode) < MAX_INLINE_DATA) + f2fs_i_size_write(inode, MAX_INLINE_DATA); return 0; } @@ -355,7 +332,7 @@ int make_empty_inline_dir(struct inode *inode, struct inode *parent, * NOTE: ipage is grabbed by caller, but if any error occurs, we should * release ipage in this function. 
*/ -static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage, +static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage, struct f2fs_inline_dentry *inline_dentry) { struct page *page; @@ -363,7 +340,7 @@ static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage, struct f2fs_dentry_block *dentry_blk; int err; - page = grab_cache_page(dir->i_mapping, 0); + page = f2fs_grab_cache_page(dir->i_mapping, 0, false); if (!page) { f2fs_put_page(ipage, 1); return -ENOMEM; @@ -396,26 +373,121 @@ static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage, NR_INLINE_DENTRY * F2FS_SLOT_LEN); kunmap_atomic(dentry_blk); - SetPageUptodate(page); + if (!PageUptodate(page)) + SetPageUptodate(page); set_page_dirty(page); /* clear inline dir and flag after data writeback */ truncate_inline_inode(ipage, 0); stat_dec_inline_dir(dir); - clear_inode_flag(F2FS_I(dir), FI_INLINE_DENTRY); - - if (i_size_read(dir) < PAGE_SIZE) { - i_size_write(dir, PAGE_SIZE); - set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); - } + clear_inode_flag(dir, FI_INLINE_DENTRY); - sync_inode_page(&dn); + f2fs_i_depth_write(dir, 1); + if (i_size_read(dir) < PAGE_SIZE) + f2fs_i_size_write(dir, PAGE_SIZE); out: f2fs_put_page(page, 1); return err; } +static int f2fs_add_inline_entries(struct inode *dir, + struct f2fs_inline_dentry *inline_dentry) +{ + struct f2fs_dentry_ptr d; + unsigned long bit_pos = 0; + int err = 0; + + make_dentry_ptr(NULL, &d, (void *)inline_dentry, 2); + + while (bit_pos < d.max) { + struct f2fs_dir_entry *de; + struct qstr new_name; + nid_t ino; + umode_t fake_mode; + + if (!test_bit_le(bit_pos, d.bitmap)) { + bit_pos++; + continue; + } + + de = &d.dentry[bit_pos]; + + if (unlikely(!de->name_len)) { + bit_pos++; + continue; + } + + new_name.name = d.filename[bit_pos]; + new_name.len = de->name_len; + + ino = le32_to_cpu(de->ino); + fake_mode = get_de_type(de) << S_SHIFT; + + err = f2fs_add_regular_entry(dir, &new_name, NULL, + ino, fake_mode); + if (err) + goto punch_dentry_pages; + + bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); + } + return 0; +punch_dentry_pages: + truncate_inode_pages(&dir->i_data, 0); + truncate_blocks(dir, 0, false); + remove_dirty_inode(dir); + return err; +} + +static int f2fs_move_rehashed_dirents(struct inode *dir, struct page *ipage, + struct f2fs_inline_dentry *inline_dentry) +{ + struct f2fs_inline_dentry *backup_dentry; + int err; + + backup_dentry = f2fs_kmalloc(sizeof(struct f2fs_inline_dentry), + GFP_F2FS_ZERO); + if (!backup_dentry) { + f2fs_put_page(ipage, 1); + return -ENOMEM; + } + + memcpy(backup_dentry, inline_dentry, MAX_INLINE_DATA); + truncate_inline_inode(ipage, 0); + + unlock_page(ipage); + + err = f2fs_add_inline_entries(dir, backup_dentry); + if (err) + goto recover; + + lock_page(ipage); + + stat_dec_inline_dir(dir); + clear_inode_flag(dir, FI_INLINE_DENTRY); + kfree(backup_dentry); + return 0; +recover: + lock_page(ipage); + memcpy(inline_dentry, backup_dentry, MAX_INLINE_DATA); + f2fs_i_depth_write(dir, 0); + f2fs_i_size_write(dir, MAX_INLINE_DATA); + set_page_dirty(ipage); + f2fs_put_page(ipage, 1); + + kfree(backup_dentry); + return err; +} + +static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage, + struct f2fs_inline_dentry *inline_dentry) +{ + if (!F2FS_I(dir)->i_dir_level) + return f2fs_move_inline_dirents(dir, ipage, inline_dentry); + else + return f2fs_move_rehashed_dirents(dir, ipage, inline_dentry); +} + int f2fs_add_inline_entry(struct inode *dir, const struct qstr *name, 
struct inode *inode, nid_t ino, umode_t mode) { @@ -464,8 +536,7 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *name, /* we don't need to mark_inode_dirty now */ if (inode) { - F2FS_I(inode)->i_pino = dir->i_ino; - update_inode(inode, page); + f2fs_i_pino_write(inode, dir->i_ino); f2fs_put_page(page, 1); } @@ -473,11 +544,6 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *name, fail: if (inode) up_write(&F2FS_I(inode)->i_sem); - - if (is_inode_flag_set(F2FS_I(dir), FI_UPDATE_DIR)) { - update_inode(dir, ipage); - clear_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); - } out: f2fs_put_page(ipage, 1); return err; @@ -501,13 +567,13 @@ void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page, &inline_dentry->dentry_bitmap); set_page_dirty(page); + f2fs_put_page(page, 1); dir->i_ctime = dir->i_mtime = CURRENT_TIME; + f2fs_mark_inode_dirty_sync(dir); if (inode) - f2fs_drop_nlink(dir, inode, page); - - f2fs_put_page(page, 1); + f2fs_drop_nlink(dir, inode); } bool f2fs_empty_inline_dir(struct inode *dir) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index cb269c46ac25..9ac5efc15347 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -18,6 +18,13 @@ #include <trace/events/f2fs.h> +void f2fs_mark_inode_dirty_sync(struct inode *inode) +{ + if (f2fs_inode_dirtied(inode)) + return; + mark_inode_dirty_sync(inode); +} + void f2fs_set_inode_flags(struct inode *inode) { unsigned int flags = F2FS_I(inode)->i_flags; @@ -35,6 +42,7 @@ void f2fs_set_inode_flags(struct inode *inode) new_fl |= S_DIRSYNC; inode_set_flags(inode, new_fl, S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); + f2fs_mark_inode_dirty_sync(inode); } static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri) @@ -85,8 +93,8 @@ static void __recover_inline_status(struct inode *inode, struct page *ipage) if (*start++) { f2fs_wait_on_page_writeback(ipage, NODE, true); - set_inode_flag(F2FS_I(inode), FI_DATA_EXIST); - set_raw_inline(F2FS_I(inode), F2FS_INODE(ipage)); + set_inode_flag(inode, FI_DATA_EXIST); + set_raw_inline(inode, F2FS_INODE(ipage)); set_page_dirty(ipage); return; } @@ -141,7 +149,7 @@ static int do_read_inode(struct inode *inode) if (f2fs_init_extent_tree(inode, &ri->i_ext)) set_page_dirty(node_page); - get_inline_info(fi, ri); + get_inline_info(inode, ri); /* check data exist */ if (f2fs_has_inline_data(inode) && !f2fs_exist_data(inode)) @@ -151,7 +159,10 @@ static int do_read_inode(struct inode *inode) __get_inode_rdev(inode, ri); if (__written_first_block(ri)) - set_inode_flag(F2FS_I(inode), FI_FIRST_BLOCK_WRITTEN); + set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN); + + if (!need_inode_block_update(sbi, inode->i_ino)) + fi->last_disk_size = inode->i_size; f2fs_put_page(node_page, 1); @@ -227,6 +238,8 @@ int update_inode(struct inode *inode, struct page *node_page) { struct f2fs_inode *ri; + f2fs_inode_synced(inode); + f2fs_wait_on_page_writeback(node_page, NODE, true); ri = F2FS_INODE(node_page); @@ -244,7 +257,7 @@ int update_inode(struct inode *inode, struct page *node_page) &ri->i_ext); else memset(&ri->i_ext, 0, sizeof(ri->i_ext)); - set_raw_inline(F2FS_I(inode), ri); + set_raw_inline(inode, ri); ri->i_atime = cpu_to_le64(inode->i_atime.tv_sec); ri->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); @@ -261,7 +274,6 @@ int update_inode(struct inode *inode, struct page *node_page) __set_inode_rdev(inode, ri); set_cold_node(inode, node_page); - clear_inode_flag(F2FS_I(inode), FI_DIRTY_INODE); /* deleted inode */ if (inode->i_nlink == 0) @@ -283,8 +295,9 @@ retry: 
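
The hunk below sits inside update_inode_page()'s existing -ENOMEM retry loop; the same give-it-another-chance idiom is extended to f2fs_evict_inode() further down, so truncation is retried under transient memory pressure instead of leaking the inode's blocks. The idiom in isolation, as a sketch (example_get_page() is a stand-in, not a real f2fs function):

static struct page *example_get_page_retry(struct f2fs_sb_info *sbi, nid_t ino)
{
	struct page *page;

	for (;;) {
		page = example_get_page(sbi, ino);	/* stand-in for get_node_page() */
		if (!IS_ERR(page) || PTR_ERR(page) != -ENOMEM)
			return page;	/* success, or a hard error */
		cond_resched();		/* yield so reclaim can make progress */
	}
}
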
cond_resched(); goto retry; } else if (err != -ENOENT) { - f2fs_stop_checkpoint(sbi); + f2fs_stop_checkpoint(sbi, false); } + f2fs_inode_synced(inode); return 0; } ret = update_inode(inode, node_page); @@ -300,7 +313,7 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) inode->i_ino == F2FS_META_INO(sbi)) return 0; - if (!is_inode_flag_set(F2FS_I(inode), FI_DIRTY_INODE)) + if (!is_inode_flag_set(inode, FI_DIRTY_INODE)) return 0; /* @@ -318,8 +331,7 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) void f2fs_evict_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct f2fs_inode_info *fi = F2FS_I(inode); - nid_t xnid = fi->i_xattr_nid; + nid_t xnid = F2FS_I(inode)->i_xattr_nid; int err = 0; /* some remained atomic pages should discarded */ @@ -341,12 +353,17 @@ void f2fs_evict_inode(struct inode *inode) if (inode->i_nlink || is_bad_inode(inode)) goto no_delete; +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(FAULT_EVICT_INODE)) + goto no_delete; +#endif + sb_start_intwrite(inode->i_sb); - set_inode_flag(fi, FI_NO_ALLOC); + set_inode_flag(inode, FI_NO_ALLOC); i_size_write(inode, 0); - +retry: if (F2FS_HAS_BLOCKS(inode)) - err = f2fs_truncate(inode, true); + err = f2fs_truncate(inode); if (!err) { f2fs_lock_op(sbi); @@ -354,6 +371,14 @@ void f2fs_evict_inode(struct inode *inode) f2fs_unlock_op(sbi); } + /* give more chances, if ENOMEM case */ + if (err == -ENOMEM) { + err = 0; + goto retry; + } + + if (err) + update_inode_page(inode); sb_end_intwrite(inode->i_sb); no_delete: stat_dec_inline_xattr(inode); @@ -363,31 +388,16 @@ no_delete: invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino, inode->i_ino); if (xnid) invalidate_mapping_pages(NODE_MAPPING(sbi), xnid, xnid); - if (is_inode_flag_set(fi, FI_APPEND_WRITE)) + if (is_inode_flag_set(inode, FI_APPEND_WRITE)) add_ino_entry(sbi, inode->i_ino, APPEND_INO); - if (is_inode_flag_set(fi, FI_UPDATE_WRITE)) + if (is_inode_flag_set(inode, FI_UPDATE_WRITE)) add_ino_entry(sbi, inode->i_ino, UPDATE_INO); - if (is_inode_flag_set(fi, FI_FREE_NID)) { - if (err && err != -ENOENT) - alloc_nid_done(sbi, inode->i_ino); - else - alloc_nid_failed(sbi, inode->i_ino); - clear_inode_flag(fi, FI_FREE_NID); - } - - if (err && err != -ENOENT) { - if (!exist_written_data(sbi, inode->i_ino, ORPHAN_INO)) { - /* - * get here because we failed to release resource - * of inode previously, reminder our user to run fsck - * for fixing. - */ - set_sbi_flag(sbi, SBI_NEED_FSCK); - f2fs_msg(sbi->sb, KERN_WARNING, - "inode (ino:%lu) resource leak, run fsck " - "to fix this issue!", inode->i_ino); - } + if (is_inode_flag_set(inode, FI_FREE_NID)) { + alloc_nid_failed(sbi, inode->i_ino); + clear_inode_flag(inode, FI_FREE_NID); } + f2fs_bug_on(sbi, err && + !exist_written_data(sbi, inode->i_ino, ORPHAN_INO)); out_clear: fscrypt_put_encryption_info(inode, NULL); clear_inode(inode); @@ -397,37 +407,32 @@ out_clear: void handle_failed_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - int err = 0; + struct node_info ni; - clear_nlink(inode); - make_bad_inode(inode); + /* don't make bad inode, since it becomes a regular file. */ unlock_new_inode(inode); - i_size_write(inode, 0); - if (F2FS_HAS_BLOCKS(inode)) - err = f2fs_truncate(inode, false); - - if (!err) - err = remove_inode_page(inode); - /* - * if we skip truncate_node in remove_inode_page bacause we failed - * before, it's better to find another way to release resource of - * this inode (e.g. 
valid block count, node block or nid). Here we - * choose to add this inode to orphan list, so that we can call iput - * for releasing in orphan recovery flow. - * * Note: we should add inode to orphan list before f2fs_unlock_op() * so we can prevent losing this orphan when encoutering checkpoint * and following suddenly power-off. */ - if (err && err != -ENOENT) { - err = acquire_orphan_inode(sbi); - if (!err) - add_orphan_inode(sbi, inode->i_ino); + get_node_info(sbi, inode->i_ino, &ni); + + if (ni.blk_addr != NULL_ADDR) { + int err = acquire_orphan_inode(sbi); + if (err) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_msg(sbi->sb, KERN_WARNING, + "Too many orphan inodes, run fsck to fix."); + } else { + add_orphan_inode(inode); + } + alloc_nid_done(sbi, inode->i_ino); + } else { + set_inode_flag(inode, FI_FREE_NID); } - set_inode_flag(F2FS_I(inode), FI_FREE_NID); f2fs_unlock_op(sbi); /* iput will drop the inode object */ diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 324ed3812f30..73fa356f8fbb 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -60,10 +60,14 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) if (f2fs_encrypted_inode(dir) && f2fs_may_encrypt(inode)) f2fs_set_encrypted_inode(inode); + set_inode_flag(inode, FI_NEW_INODE); + + if (test_opt(sbi, INLINE_XATTR)) + set_inode_flag(inode, FI_INLINE_XATTR); if (test_opt(sbi, INLINE_DATA) && f2fs_may_inline_data(inode)) - set_inode_flag(F2FS_I(inode), FI_INLINE_DATA); + set_inode_flag(inode, FI_INLINE_DATA); if (f2fs_may_inline_dentry(inode)) - set_inode_flag(F2FS_I(inode), FI_INLINE_DENTRY); + set_inode_flag(inode, FI_INLINE_DENTRY); f2fs_init_extent_tree(inode, NULL); @@ -72,14 +76,13 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) stat_inc_inline_dir(inode); trace_f2fs_new_inode(inode, 0); - mark_inode_dirty(inode); return inode; fail: trace_f2fs_new_inode(inode, err); make_bad_inode(inode); if (nid_free) - set_inode_flag(F2FS_I(inode), FI_FREE_NID); + set_inode_flag(inode, FI_FREE_NID); iput(inode); return ERR_PTR(err); } @@ -177,7 +180,7 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, inode->i_ctime = CURRENT_TIME; ihold(inode); - set_inode_flag(F2FS_I(inode), FI_INC_LINK); + set_inode_flag(inode, FI_INC_LINK); f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); if (err) @@ -190,7 +193,7 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, f2fs_sync_fs(sbi->sb, 1); return 0; out: - clear_inode_flag(F2FS_I(inode), FI_INC_LINK); + clear_inode_flag(inode, FI_INC_LINK); iput(inode); f2fs_unlock_op(sbi); return err; @@ -199,9 +202,13 @@ out: struct dentry *f2fs_get_parent(struct dentry *child) { struct qstr dotdot = QSTR_INIT("..", 2); - unsigned long ino = f2fs_inode_by_name(d_inode(child), &dotdot); - if (!ino) + struct page *page; + unsigned long ino = f2fs_inode_by_name(d_inode(child), &dotdot, &page); + if (!ino) { + if (IS_ERR(page)) + return ERR_CAST(page); return ERR_PTR(-ENOENT); + } return d_obtain_alias(f2fs_iget(child->d_sb, ino)); } @@ -229,6 +236,9 @@ static int __recover_dot_dentries(struct inode *dir, nid_t pino) if (de) { f2fs_dentry_kunmap(dir, page); f2fs_put_page(page, 0); + } else if (IS_ERR(page)) { + err = PTR_ERR(page); + goto out; } else { err = __f2fs_add_link(dir, &dot, NULL, dir->i_ino, S_IFDIR); if (err) @@ -239,14 +249,14 @@ static int __recover_dot_dentries(struct inode *dir, nid_t pino) if (de) { f2fs_dentry_kunmap(dir, page); f2fs_put_page(page, 0); + } else if (IS_ERR(page)) { + err = PTR_ERR(page); } else { 
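
The namei.c changes around here all follow from one interface adjustment: f2fs_find_entry() and f2fs_parent_dir() now report a page-read failure through the returned page pointer instead of folding it into a NULL dentry, so each caller grows an IS_ERR() branch to distinguish "entry absent" from "lookup failed". The resulting caller shape, sketched as a hypothetical helper:

static int example_lookup_ino(struct inode *dir, struct qstr *name, nid_t *ino)
{
	struct f2fs_dir_entry *de;
	struct page *page;

	de = f2fs_find_entry(dir, name, &page);
	if (!de)	/* a NULL dentry now has two possible meanings */
		return IS_ERR(page) ? PTR_ERR(page) : -ENOENT;

	*ino = le32_to_cpu(de->ino);
	f2fs_dentry_kunmap(dir, page);
	f2fs_put_page(page, 0);
	return 0;
}
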
err = __f2fs_add_link(dir, &dotdot, NULL, pino, S_IFDIR); } out: - if (!err) { - clear_inode_flag(F2FS_I(dir), FI_INLINE_DOTS); - mark_inode_dirty(dir); - } + if (!err) + clear_inode_flag(dir, FI_INLINE_DOTS); f2fs_unlock_op(sbi); return err; @@ -281,8 +291,11 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, return ERR_PTR(-ENAMETOOLONG); de = f2fs_find_entry(dir, &dentry->d_name, &page); - if (!de) + if (!de) { + if (IS_ERR(page)) + return (struct dentry *)page; return d_splice_alias(inode, dentry); + } ino = le32_to_cpu(de->ino); f2fs_dentry_kunmap(dir, page); @@ -329,8 +342,11 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) trace_f2fs_unlink_enter(dir, dentry); de = f2fs_find_entry(dir, &dentry->d_name, &page); - if (!de) + if (!de) { + if (IS_ERR(page)) + err = PTR_ERR(page); goto fail; + } f2fs_balance_fs(sbi, true); @@ -345,9 +361,6 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) f2fs_delete_entry(de, page, dir, inode); f2fs_unlock_op(sbi); - /* In order to evict this inode, we set it dirty */ - mark_inode_dirty(inode); - if (IS_DIRSYNC(dir)) f2fs_sync_fs(sbi->sb, 1); fail: @@ -492,7 +505,7 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) f2fs_balance_fs(sbi, true); - set_inode_flag(F2FS_I(inode), FI_INC_LINK); + set_inode_flag(inode, FI_INC_LINK); f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); if (err) @@ -509,7 +522,7 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) return 0; out_fail: - clear_inode_flag(F2FS_I(inode), FI_INC_LINK); + clear_inode_flag(inode, FI_INC_LINK); handle_failed_inode(inode); return err; } @@ -592,17 +605,17 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, * add this non-linked tmpfile to orphan list, in this way we could * remove all unused data of tmpfile after abnormal power-off. */ - add_orphan_inode(sbi, inode->i_ino); - f2fs_unlock_op(sbi); - + add_orphan_inode(inode); alloc_nid_done(sbi, inode->i_ino); if (whiteout) { - inode_dec_link_count(inode); + f2fs_i_links_write(inode, false); *whiteout = inode; } else { d_tmpfile(dentry, inode); } + /* link_count was changed by d_tmpfile as well. 
*/ + f2fs_unlock_op(sbi); unlock_new_inode(inode); return 0; @@ -652,14 +665,19 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, } old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page); - if (!old_entry) + if (!old_entry) { + if (IS_ERR(old_page)) + err = PTR_ERR(old_page); goto out; + } if (S_ISDIR(old_inode->i_mode)) { - err = -EIO; old_dir_entry = f2fs_parent_dir(old_inode, &old_dir_page); - if (!old_dir_entry) + if (!old_dir_entry) { + if (IS_ERR(old_dir_page)) + err = PTR_ERR(old_dir_page); goto out_old; + } } if (flags & RENAME_WHITEOUT) { @@ -677,8 +695,11 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, err = -ENOENT; new_entry = f2fs_find_entry(new_dir, &new_dentry->d_name, &new_page); - if (!new_entry) + if (!new_entry) { + if (IS_ERR(new_page)) + err = PTR_ERR(new_page); goto out_whiteout; + } f2fs_balance_fs(sbi, true); @@ -700,19 +721,14 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, new_inode->i_ctime = CURRENT_TIME; down_write(&F2FS_I(new_inode)->i_sem); if (old_dir_entry) - drop_nlink(new_inode); - drop_nlink(new_inode); + f2fs_i_links_write(new_inode, false); + f2fs_i_links_write(new_inode, false); up_write(&F2FS_I(new_inode)->i_sem); - mark_inode_dirty(new_inode); - if (!new_inode->i_nlink) - add_orphan_inode(sbi, new_inode->i_ino); + add_orphan_inode(new_inode); else release_orphan_inode(sbi); - - update_inode_page(old_inode); - update_inode_page(new_inode); } else { f2fs_balance_fs(sbi, true); @@ -724,10 +740,8 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, goto out_whiteout; } - if (old_dir_entry) { - inc_nlink(new_dir); - update_inode_page(new_dir); - } + if (old_dir_entry) + f2fs_i_links_write(new_dir, true); /* * old entry and new entry can locate in the same inline @@ -743,7 +757,9 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page); if (!old_entry) { - err = -EIO; + err = -ENOENT; + if (IS_ERR(old_page)) + err = PTR_ERR(old_page); f2fs_unlock_op(sbi); goto out_whiteout; } @@ -757,13 +773,13 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, up_write(&F2FS_I(old_inode)->i_sem); old_inode->i_ctime = CURRENT_TIME; - mark_inode_dirty(old_inode); + f2fs_mark_inode_dirty_sync(old_inode); f2fs_delete_entry(old_entry, old_page, old_dir, NULL); if (whiteout) { whiteout->i_state |= I_LINKABLE; - set_inode_flag(F2FS_I(whiteout), FI_INC_LINK); + set_inode_flag(whiteout, FI_INC_LINK); err = f2fs_add_link(old_dentry, whiteout); if (err) goto put_out_dir; @@ -775,14 +791,11 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, if (old_dir != new_dir && !whiteout) { f2fs_set_link(old_inode, old_dir_entry, old_dir_page, new_dir); - update_inode_page(old_inode); } else { f2fs_dentry_kunmap(old_inode, old_dir_page); f2fs_put_page(old_dir_page, 0); } - drop_nlink(old_dir); - mark_inode_dirty(old_dir); - update_inode_page(old_dir); + f2fs_i_links_write(old_dir, false); } f2fs_unlock_op(sbi); @@ -832,29 +845,39 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, return -EPERM; old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page); - if (!old_entry) + if (!old_entry) { + if (IS_ERR(old_page)) + err = PTR_ERR(old_page); goto out; + } new_entry = f2fs_find_entry(new_dir, &new_dentry->d_name, &new_page); - if (!new_entry) + if (!new_entry) { + if (IS_ERR(new_page)) + err = 
PTR_ERR(new_page); goto out_old; + } /* prepare for updating ".." directory entry info later */ if (old_dir != new_dir) { if (S_ISDIR(old_inode->i_mode)) { - err = -EIO; old_dir_entry = f2fs_parent_dir(old_inode, &old_dir_page); - if (!old_dir_entry) + if (!old_dir_entry) { + if (IS_ERR(old_dir_page)) + err = PTR_ERR(old_dir_page); goto out_new; + } } if (S_ISDIR(new_inode->i_mode)) { - err = -EIO; new_dir_entry = f2fs_parent_dir(new_inode, &new_dir_page); - if (!new_dir_entry) + if (!new_dir_entry) { + if (IS_ERR(new_dir_page)) + err = PTR_ERR(new_dir_page); goto out_old_dir; + } } } @@ -904,19 +927,13 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, file_lost_pino(old_inode); up_write(&F2FS_I(old_inode)->i_sem); - update_inode_page(old_inode); - old_dir->i_ctime = CURRENT_TIME; if (old_nlink) { down_write(&F2FS_I(old_dir)->i_sem); - if (old_nlink < 0) - drop_nlink(old_dir); - else - inc_nlink(old_dir); + f2fs_i_links_write(old_dir, old_nlink > 0); up_write(&F2FS_I(old_dir)->i_sem); } - mark_inode_dirty(old_dir); - update_inode_page(old_dir); + f2fs_mark_inode_dirty_sync(old_dir); /* update directory entry info of new dir inode */ f2fs_set_link(new_dir, new_entry, new_page, old_inode); @@ -925,19 +942,13 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, file_lost_pino(new_inode); up_write(&F2FS_I(new_inode)->i_sem); - update_inode_page(new_inode); - new_dir->i_ctime = CURRENT_TIME; if (new_nlink) { down_write(&F2FS_I(new_dir)->i_sem); - if (new_nlink < 0) - drop_nlink(new_dir); - else - inc_nlink(new_dir); + f2fs_i_links_write(new_dir, new_nlink > 0); up_write(&F2FS_I(new_dir)->i_sem); } - mark_inode_dirty(new_dir); - update_inode_page(new_dir); + f2fs_mark_inode_dirty_sync(new_dir); f2fs_unlock_op(sbi); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 1a33de9d84b1..b2fa4b615925 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -52,6 +52,10 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type) mem_size = (nm_i->nat_cnt * sizeof(struct nat_entry)) >> PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2); + if (excess_cached_nats(sbi)) + res = false; + if (nm_i->nat_cnt > DEF_NAT_CACHE_THRESHOLD) + res = false; } else if (type == DIRTY_DENTS) { if (sbi->sb->s_bdi->wb.dirty_exceeded) return false; @@ -202,14 +206,14 @@ int need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid) struct nat_entry *e; bool need = false; - down_read(&nm_i->nat_tree_lock); + percpu_down_read(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, nid); if (e) { if (!get_nat_flag(e, IS_CHECKPOINTED) && !get_nat_flag(e, HAS_FSYNCED_INODE)) need = true; } - up_read(&nm_i->nat_tree_lock); + percpu_up_read(&nm_i->nat_tree_lock); return need; } @@ -219,11 +223,11 @@ bool is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid) struct nat_entry *e; bool is_cp = true; - down_read(&nm_i->nat_tree_lock); + percpu_down_read(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, nid); if (e && !get_nat_flag(e, IS_CHECKPOINTED)) is_cp = false; - up_read(&nm_i->nat_tree_lock); + percpu_up_read(&nm_i->nat_tree_lock); return is_cp; } @@ -233,13 +237,13 @@ bool need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino) struct nat_entry *e; bool need_update = true; - down_read(&nm_i->nat_tree_lock); + percpu_down_read(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, ino); if (e && get_nat_flag(e, HAS_LAST_FSYNC) && (get_nat_flag(e, IS_CHECKPOINTED) || get_nat_flag(e, HAS_FSYNCED_INODE))) need_update = false; - 
up_read(&nm_i->nat_tree_lock); + percpu_up_read(&nm_i->nat_tree_lock); return need_update; } @@ -280,7 +284,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, struct f2fs_nm_info *nm_i = NM_I(sbi); struct nat_entry *e; - down_write(&nm_i->nat_tree_lock); + percpu_down_write(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, ni->nid); if (!e) { e = grab_nat_entry(nm_i, ni->nid); @@ -330,7 +334,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, set_nat_flag(e, HAS_FSYNCED_INODE, true); set_nat_flag(e, HAS_LAST_FSYNC, fsync_done); } - up_write(&nm_i->nat_tree_lock); + percpu_up_write(&nm_i->nat_tree_lock); } int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink) @@ -338,8 +342,7 @@ int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink) struct f2fs_nm_info *nm_i = NM_I(sbi); int nr = nr_shrink; - if (!down_write_trylock(&nm_i->nat_tree_lock)) - return 0; + percpu_down_write(&nm_i->nat_tree_lock); while (nr_shrink && !list_empty(&nm_i->nat_entries)) { struct nat_entry *ne; @@ -348,7 +351,7 @@ int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink) __del_from_nat_cache(nm_i, ne); nr_shrink--; } - up_write(&nm_i->nat_tree_lock); + percpu_up_write(&nm_i->nat_tree_lock); return nr - nr_shrink; } @@ -370,13 +373,13 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni) ni->nid = nid; /* Check nat cache */ - down_read(&nm_i->nat_tree_lock); + percpu_down_read(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, nid); if (e) { ni->ino = nat_get_ino(e); ni->blk_addr = nat_get_blkaddr(e); ni->version = nat_get_version(e); - up_read(&nm_i->nat_tree_lock); + percpu_up_read(&nm_i->nat_tree_lock); return; } @@ -400,11 +403,34 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni) node_info_from_raw_nat(ni, &ne); f2fs_put_page(page, 1); cache: - up_read(&nm_i->nat_tree_lock); + percpu_up_read(&nm_i->nat_tree_lock); /* cache nat entry */ - down_write(&nm_i->nat_tree_lock); + percpu_down_write(&nm_i->nat_tree_lock); cache_nat_entry(sbi, nid, &ne); - up_write(&nm_i->nat_tree_lock); + percpu_up_write(&nm_i->nat_tree_lock); +} + +/* + * readahead MAX_RA_NODE number of node pages. 
+ */ +static void ra_node_pages(struct page *parent, int start, int n) +{ + struct f2fs_sb_info *sbi = F2FS_P_SB(parent); + struct blk_plug plug; + int i, end; + nid_t nid; + + blk_start_plug(&plug); + + /* Then, try readahead for siblings of the desired node */ + end = start + n; + end = min(end, NIDS_PER_BLOCK); + for (i = start; i < end; i++) { + nid = get_nid(parent, i, false); + ra_node_page(sbi, nid); + } + + blk_finish_plug(&plug); } pgoff_t get_next_page_offset(struct dnode_of_data *dn, pgoff_t pgofs) @@ -623,6 +649,7 @@ release_out: if (err == -ENOENT) { dn->cur_level = i; dn->max_level = level; + dn->ofs_in_node = offset[level]; } return err; } @@ -647,8 +674,7 @@ static void truncate_node(struct dnode_of_data *dn) if (dn->nid == dn->inode->i_ino) { remove_orphan_inode(sbi, dn->nid); dec_valid_inode_count(sbi); - } else { - sync_inode_page(dn); + f2fs_inode_synced(dn->inode); } invalidate: clear_node_page_dirty(dn->node_page); @@ -707,6 +733,8 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs, return PTR_ERR(page); } + ra_node_pages(page, ofs, NIDS_PER_BLOCK); + rn = F2FS_NODE(page); if (depth < 3) { for (i = ofs; i < NIDS_PER_BLOCK; i++, freed++) { @@ -784,6 +812,8 @@ static int truncate_partial_nodes(struct dnode_of_data *dn, nid[i + 1] = get_nid(pages[i], offset[i + 1], false); } + ra_node_pages(pages[idx], offset[idx + 1], NIDS_PER_BLOCK); + /* free direct nodes linked to a partial indirect node */ for (i = offset[idx + 1]; i < NIDS_PER_BLOCK; i++) { child_nid = get_nid(pages[idx], i, false); @@ -832,7 +862,7 @@ int truncate_inode_blocks(struct inode *inode, pgoff_t from) trace_f2fs_truncate_inode_blocks_enter(inode, from); level = get_node_path(inode, from, offset, noffset); -restart: + page = get_node_page(sbi, inode->i_ino); if (IS_ERR(page)) { trace_f2fs_truncate_inode_blocks_exit(inode, PTR_ERR(page)); @@ -896,10 +926,7 @@ skip_partial: if (offset[1] == 0 && ri->i_nid[offset[0] - NODE_DIR1_BLOCK]) { lock_page(page); - if (unlikely(page->mapping != NODE_MAPPING(sbi))) { - f2fs_put_page(page, 1); - goto restart; - } + BUG_ON(page->mapping != NODE_MAPPING(sbi)); f2fs_wait_on_page_writeback(page, NODE, true); ri->i_nid[offset[0] - NODE_DIR1_BLOCK] = 0; set_page_dirty(page); @@ -929,7 +956,7 @@ int truncate_xattr_node(struct inode *inode, struct page *page) if (IS_ERR(npage)) return PTR_ERR(npage); - F2FS_I(inode)->i_xattr_nid = 0; + f2fs_i_xnid_write(inode, 0); /* need to do checkpoint during fsync */ F2FS_I(inode)->xattr_ver = cur_cp_version(F2FS_CKPT(sbi)); @@ -995,10 +1022,10 @@ struct page *new_node_page(struct dnode_of_data *dn, struct page *page; int err; - if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC))) + if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC))) return ERR_PTR(-EPERM); - page = grab_cache_page(NODE_MAPPING(sbi), dn->nid); + page = f2fs_grab_cache_page(NODE_MAPPING(sbi), dn->nid, false); if (!page) return ERR_PTR(-ENOMEM); @@ -1018,21 +1045,16 @@ struct page *new_node_page(struct dnode_of_data *dn, f2fs_wait_on_page_writeback(page, NODE, true); fill_node_footer(page, dn->nid, dn->inode->i_ino, ofs, true); set_cold_node(dn->inode, page); - SetPageUptodate(page); + if (!PageUptodate(page)) + SetPageUptodate(page); if (set_page_dirty(page)) dn->node_changed = true; if (f2fs_has_xattr_block(ofs)) - F2FS_I(dn->inode)->i_xattr_nid = dn->nid; + f2fs_i_xnid_write(dn->inode, dn->nid); - dn->node_page = page; - if (ipage) - update_inode(dn->inode, ipage); - else - sync_inode_page(dn); if (ofs == 0) 
inc_valid_inode_count(sbi); - return page; fail: @@ -1046,18 +1068,22 @@ fail: * 0: f2fs_put_page(page, 0) * LOCKED_PAGE or error: f2fs_put_page(page, 1) */ -static int read_node_page(struct page *page, int rw) +static int read_node_page(struct page *page, int op_flags) { struct f2fs_sb_info *sbi = F2FS_P_SB(page); struct node_info ni; struct f2fs_io_info fio = { .sbi = sbi, .type = NODE, - .rw = rw, + .op = REQ_OP_READ, + .op_flags = op_flags, .page = page, .encrypted_page = NULL, }; + if (PageUptodate(page)) + return LOCKED_PAGE; + get_node_info(sbi, page->index, &ni); if (unlikely(ni.blk_addr == NULL_ADDR)) { @@ -1065,9 +1091,6 @@ static int read_node_page(struct page *page, int rw) return -ENOENT; } - if (PageUptodate(page)) - return LOCKED_PAGE; - fio.new_blkaddr = fio.old_blkaddr = ni.blk_addr; return f2fs_submit_page_bio(&fio); } @@ -1090,37 +1113,14 @@ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid) if (apage) return; - apage = grab_cache_page(NODE_MAPPING(sbi), nid); + apage = f2fs_grab_cache_page(NODE_MAPPING(sbi), nid, false); if (!apage) return; - err = read_node_page(apage, READA); + err = read_node_page(apage, REQ_RAHEAD); f2fs_put_page(apage, err ? 1 : 0); } -/* - * readahead MAX_RA_NODE number of node pages. - */ -static void ra_node_pages(struct page *parent, int start) -{ - struct f2fs_sb_info *sbi = F2FS_P_SB(parent); - struct blk_plug plug; - int i, end; - nid_t nid; - - blk_start_plug(&plug); - - /* Then, try readahead for siblings of the desired node */ - end = start + MAX_RA_NODE; - end = min(end, NIDS_PER_BLOCK); - for (i = start; i < end; i++) { - nid = get_nid(parent, i, false); - ra_node_page(sbi, nid); - } - - blk_finish_plug(&plug); -} - static struct page *__get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid, struct page *parent, int start) { @@ -1131,7 +1131,7 @@ static struct page *__get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid, return ERR_PTR(-ENOENT); f2fs_bug_on(sbi, check_nid_range(sbi, nid)); repeat: - page = grab_cache_page(NODE_MAPPING(sbi), nid); + page = f2fs_grab_cache_page(NODE_MAPPING(sbi), nid, false); if (!page) return ERR_PTR(-ENOMEM); @@ -1144,20 +1144,25 @@ repeat: } if (parent) - ra_node_pages(parent, start + 1); + ra_node_pages(parent, start + 1, MAX_RA_NODE); lock_page(page); - if (unlikely(!PageUptodate(page))) { - f2fs_put_page(page, 1); - return ERR_PTR(-EIO); - } if (unlikely(page->mapping != NODE_MAPPING(sbi))) { f2fs_put_page(page, 1); goto repeat; } + + if (unlikely(!PageUptodate(page))) + goto out_err; page_hit: - f2fs_bug_on(sbi, nid != nid_of_node(page)); + if(unlikely(nid != nid_of_node(page))) { + f2fs_bug_on(sbi, 1); + ClearPageUptodate(page); +out_err: + f2fs_put_page(page, 1); + return ERR_PTR(-EIO); + } return page; } @@ -1174,41 +1179,21 @@ struct page *get_node_page_ra(struct page *parent, int start) return __get_node_page(sbi, nid, parent, start); } -void sync_inode_page(struct dnode_of_data *dn) -{ - int ret = 0; - - if (IS_INODE(dn->node_page) || dn->inode_page == dn->node_page) { - ret = update_inode(dn->inode, dn->node_page); - } else if (dn->inode_page) { - if (!dn->inode_page_locked) - lock_page(dn->inode_page); - ret = update_inode(dn->inode, dn->inode_page); - if (!dn->inode_page_locked) - unlock_page(dn->inode_page); - } else { - ret = update_inode_page(dn->inode); - } - dn->node_changed = ret ? 
true: false; -} - static void flush_inline_data(struct f2fs_sb_info *sbi, nid_t ino) { struct inode *inode; struct page *page; + int ret; /* should flush inline_data before evict_inode */ inode = ilookup(sbi->sb, ino); if (!inode) return; - page = pagecache_get_page(inode->i_mapping, 0, FGP_NOWAIT, 0); + page = pagecache_get_page(inode->i_mapping, 0, FGP_LOCK|FGP_NOWAIT, 0); if (!page) goto iput_out; - if (!trylock_page(page)) - goto release_out; - if (!PageUptodate(page)) goto page_out; @@ -1218,24 +1203,219 @@ static void flush_inline_data(struct f2fs_sb_info *sbi, nid_t ino) if (!clear_page_dirty_for_io(page)) goto page_out; - if (!f2fs_write_inline_data(inode, page)) - inode_dec_dirty_pages(inode); - else + ret = f2fs_write_inline_data(inode, page); + inode_dec_dirty_pages(inode); + if (ret) set_page_dirty(page); page_out: - unlock_page(page); -release_out: - f2fs_put_page(page, 0); + f2fs_put_page(page, 1); iput_out: iput(inode); } -int sync_node_pages(struct f2fs_sb_info *sbi, nid_t ino, - struct writeback_control *wbc) +void move_node_page(struct page *node_page, int gc_type) +{ + if (gc_type == FG_GC) { + struct f2fs_sb_info *sbi = F2FS_P_SB(node_page); + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = 1, + .for_reclaim = 0, + }; + + set_page_dirty(node_page); + f2fs_wait_on_page_writeback(node_page, NODE, true); + + f2fs_bug_on(sbi, PageWriteback(node_page)); + if (!clear_page_dirty_for_io(node_page)) + goto out_page; + + if (NODE_MAPPING(sbi)->a_ops->writepage(node_page, &wbc)) + unlock_page(node_page); + goto release_page; + } else { + /* set page dirty and write it */ + if (!PageWriteback(node_page)) + set_page_dirty(node_page); + } +out_page: + unlock_page(node_page); +release_page: + f2fs_put_page(node_page, 0); +} + +static struct page *last_fsync_dnode(struct f2fs_sb_info *sbi, nid_t ino) +{ + pgoff_t index, end; + struct pagevec pvec; + struct page *last_page = NULL; + + pagevec_init(&pvec, 0); + index = 0; + end = ULONG_MAX; + + while (index <= end) { + int i, nr_pages; + nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, + PAGECACHE_TAG_DIRTY, + min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); + if (nr_pages == 0) + break; + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + if (unlikely(f2fs_cp_error(sbi))) { + f2fs_put_page(last_page, 0); + pagevec_release(&pvec); + return ERR_PTR(-EIO); + } + + if (!IS_DNODE(page) || !is_cold_node(page)) + continue; + if (ino_of_node(page) != ino) + continue; + + lock_page(page); + + if (unlikely(page->mapping != NODE_MAPPING(sbi))) { +continue_unlock: + unlock_page(page); + continue; + } + if (ino_of_node(page) != ino) + goto continue_unlock; + + if (!PageDirty(page)) { + /* someone wrote it for us */ + goto continue_unlock; + } + + if (last_page) + f2fs_put_page(last_page, 0); + + get_page(page); + last_page = page; + unlock_page(page); + } + pagevec_release(&pvec); + cond_resched(); + } + return last_page; +} + +int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, + struct writeback_control *wbc, bool atomic) +{ + pgoff_t index, end; + struct pagevec pvec; + int ret = 0; + struct page *last_page = NULL; + bool marked = false; + nid_t ino = inode->i_ino; + + if (atomic) { + last_page = last_fsync_dnode(sbi, ino); + if (IS_ERR_OR_NULL(last_page)) + return PTR_ERR_OR_ZERO(last_page); + } +retry: + pagevec_init(&pvec, 0); + index = 0; + end = ULONG_MAX; + + while (index <= end) { + int i, nr_pages; + nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), 
&index, + PAGECACHE_TAG_DIRTY, + min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); + if (nr_pages == 0) + break; + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + if (unlikely(f2fs_cp_error(sbi))) { + f2fs_put_page(last_page, 0); + pagevec_release(&pvec); + return -EIO; + } + + if (!IS_DNODE(page) || !is_cold_node(page)) + continue; + if (ino_of_node(page) != ino) + continue; + + lock_page(page); + + if (unlikely(page->mapping != NODE_MAPPING(sbi))) { +continue_unlock: + unlock_page(page); + continue; + } + if (ino_of_node(page) != ino) + goto continue_unlock; + + if (!PageDirty(page) && page != last_page) { + /* someone wrote it for us */ + goto continue_unlock; + } + + f2fs_wait_on_page_writeback(page, NODE, true); + BUG_ON(PageWriteback(page)); + + if (!atomic || page == last_page) { + set_fsync_mark(page, 1); + if (IS_INODE(page)) { + if (is_inode_flag_set(inode, + FI_DIRTY_INODE)) + update_inode(inode, page); + set_dentry_mark(page, + need_dentry_mark(sbi, ino)); + } + /* may be written by other thread */ + if (!PageDirty(page)) + set_page_dirty(page); + } + + if (!clear_page_dirty_for_io(page)) + goto continue_unlock; + + ret = NODE_MAPPING(sbi)->a_ops->writepage(page, wbc); + if (ret) { + unlock_page(page); + f2fs_put_page(last_page, 0); + break; + } + if (page == last_page) { + f2fs_put_page(page, 0); + marked = true; + break; + } + } + pagevec_release(&pvec); + cond_resched(); + + if (ret || marked) + break; + } + if (!ret && atomic && !marked) { + f2fs_msg(sbi->sb, KERN_DEBUG, + "Retry to write fsync mark: ino=%u, idx=%lx", + ino, last_page->index); + lock_page(last_page); + set_page_dirty(last_page); + unlock_page(last_page); + goto retry; + } + return ret ? -EIO: 0; +} + +int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc) { pgoff_t index, end; struct pagevec pvec; - int step = ino ? 2 : 0; + int step = 0; int nwritten = 0; pagevec_init(&pvec, 0); @@ -1274,15 +1454,8 @@ next_step: if (step == 2 && (!IS_DNODE(page) || !is_cold_node(page))) continue; - - /* - * If an fsync mode, - * we should not skip writing node pages. - */ lock_node: - if (ino && ino_of_node(page) == ino) - lock_page(page); - else if (!trylock_page(page)) + if (!trylock_page(page)) continue; if (unlikely(page->mapping != NODE_MAPPING(sbi))) { @@ -1290,8 +1463,6 @@ continue_unlock: unlock_page(page); continue; } - if (ino && ino_of_node(page) != ino) - goto continue_unlock; if (!PageDirty(page)) { /* someone wrote it for us */ @@ -1299,7 +1470,7 @@ continue_unlock: } /* flush inline_data */ - if (!ino && is_inline_node(page)) { + if (is_inline_node(page)) { clear_inline_node(page); unlock_page(page); flush_inline_data(sbi, ino_of_node(page)); @@ -1312,17 +1483,8 @@ continue_unlock: if (!clear_page_dirty_for_io(page)) goto continue_unlock; - /* called by fsync() */ - if (ino && IS_DNODE(page)) { - set_fsync_mark(page, 1); - if (IS_INODE(page)) - set_dentry_mark(page, - need_dentry_mark(sbi, ino)); - nwritten++; - } else { - set_fsync_mark(page, 0); - set_dentry_mark(page, 0); - } + set_fsync_mark(page, 0); + set_dentry_mark(page, 0); if (NODE_MAPPING(sbi)->a_ops->writepage(page, wbc)) unlock_page(page); @@ -1397,7 +1559,8 @@ static int f2fs_write_node_page(struct page *page, struct f2fs_io_info fio = { .sbi = sbi, .type = NODE, - .rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE, + .op = REQ_OP_WRITE, + .op_flags = (wbc->sync_mode == WB_SYNC_ALL) ? 
WRITE_SYNC : 0, .page = page, .encrypted_page = NULL, }; @@ -1457,6 +1620,7 @@ static int f2fs_write_node_pages(struct address_space *mapping, struct writeback_control *wbc) { struct f2fs_sb_info *sbi = F2FS_M_SB(mapping); + struct blk_plug plug; long diff; /* balancing f2fs's metadata in background */ @@ -1470,7 +1634,9 @@ static int f2fs_write_node_pages(struct address_space *mapping, diff = nr_pages_to_write(sbi, NODE, wbc); wbc->sync_mode = WB_SYNC_NONE; - sync_node_pages(sbi, 0, wbc); + blk_start_plug(&plug); + sync_node_pages(sbi, wbc); + blk_finish_plug(&plug); wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff); return 0; @@ -1484,9 +1650,10 @@ static int f2fs_set_node_page_dirty(struct page *page) { trace_f2fs_set_page_dirty(page, NODE); - SetPageUptodate(page); + if (!PageUptodate(page)) + SetPageUptodate(page); if (!PageDirty(page)) { - __set_page_dirty_nobuffers(page); + f2fs_set_page_dirty_nobuffers(page); inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES); SetPagePrivate(page); f2fs_trace_pid(page); @@ -1524,7 +1691,6 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i; struct nat_entry *ne; - bool allocated = false; if (!available_free_memory(sbi, FREE_NIDS)) return -1; @@ -1538,8 +1704,6 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) ne = __lookup_nat_cache(nm_i, nid); if (ne && (!get_nat_flag(ne, IS_CHECKPOINTED) || nat_get_blkaddr(ne) != NULL_ADDR)) - allocated = true; - if (allocated) return 0; } @@ -1608,7 +1772,7 @@ static void scan_nat_page(struct f2fs_sb_info *sbi, } } -static void build_free_nids(struct f2fs_sb_info *sbi) +void build_free_nids(struct f2fs_sb_info *sbi) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); @@ -1617,14 +1781,14 @@ static void build_free_nids(struct f2fs_sb_info *sbi) nid_t nid = nm_i->next_scan_nid; /* Enough entries */ - if (nm_i->fcnt > NAT_ENTRY_PER_BLOCK) + if (nm_i->fcnt >= NAT_ENTRY_PER_BLOCK) return; /* readahead nat pages to be scanned */ ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES, META_NAT, true); - down_read(&nm_i->nat_tree_lock); + percpu_down_read(&nm_i->nat_tree_lock); while (1) { struct page *page = get_current_nat_page(sbi, nid); @@ -1656,7 +1820,7 @@ static void build_free_nids(struct f2fs_sb_info *sbi) remove_free_nid(nm_i, nid); } up_read(&curseg->journal_rwsem); - up_read(&nm_i->nat_tree_lock); + percpu_up_read(&nm_i->nat_tree_lock); ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nm_i->next_scan_nid), nm_i->ra_nid_pages, META_NAT, false); @@ -1672,6 +1836,10 @@ bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid) struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i = NULL; retry: +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(FAULT_ALLOC_NID)) + return false; +#endif if (unlikely(sbi->total_valid_node_count + 1 > nm_i->available_nids)) return false; @@ -1751,12 +1919,15 @@ int try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink) struct free_nid *i, *next; int nr = nr_shrink; + if (nm_i->fcnt <= MAX_FREE_NIDS) + return 0; + if (!mutex_trylock(&nm_i->build_lock)) return 0; spin_lock(&nm_i->free_nid_list_lock); list_for_each_entry_safe(i, next, &nm_i->free_nid_list, list) { - if (nr_shrink <= 0 || nm_i->fcnt <= NAT_ENTRY_PER_BLOCK) + if (nr_shrink <= 0 || nm_i->fcnt <= MAX_FREE_NIDS) break; if (i->state == NID_ALLOC) continue; @@ -1783,7 +1954,7 @@ void recover_inline_xattr(struct inode *inode, struct page *page) ri = F2FS_INODE(page); 
if (!(ri->i_inline & F2FS_INLINE_XATTR)) { - clear_inode_flag(F2FS_I(inode), FI_INLINE_XATTR); + clear_inode_flag(inode, FI_INLINE_XATTR); goto update_inode; } @@ -1825,13 +1996,11 @@ recover_xnid: get_node_info(sbi, new_xnid, &ni); ni.ino = inode->i_ino; set_node_addr(sbi, &ni, NEW_ADDR, false); - F2FS_I(inode)->i_xattr_nid = new_xnid; + f2fs_i_xnid_write(inode, new_xnid); /* 3: update xattr blkaddr */ refresh_sit_entry(sbi, NEW_ADDR, blkaddr); set_node_addr(sbi, &ni, blkaddr, false); - - update_inode_page(inode); } int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) @@ -1846,14 +2015,15 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) if (unlikely(old_ni.blk_addr != NULL_ADDR)) return -EINVAL; - ipage = grab_cache_page(NODE_MAPPING(sbi), ino); + ipage = f2fs_grab_cache_page(NODE_MAPPING(sbi), ino, false); if (!ipage) return -ENOMEM; /* Should not use this inode from free nid list */ remove_free_nid(NM_I(sbi), ino); - SetPageUptodate(ipage); + if (!PageUptodate(ipage)) + SetPageUptodate(ipage); fill_node_footer(ipage, ino, ino, 0, true); src = F2FS_INODE(page); @@ -2039,7 +2209,7 @@ void flush_nat_entries(struct f2fs_sb_info *sbi) if (!nm_i->dirty_nat_cnt) return; - down_write(&nm_i->nat_tree_lock); + percpu_down_write(&nm_i->nat_tree_lock); /* * if there are no enough space in journal to store dirty nat @@ -2062,7 +2232,7 @@ void flush_nat_entries(struct f2fs_sb_info *sbi) list_for_each_entry_safe(set, tmp, &sets, set_list) __flush_nat_entry_set(sbi, set); - up_write(&nm_i->nat_tree_lock); + percpu_up_write(&nm_i->nat_tree_lock); f2fs_bug_on(sbi, nm_i->dirty_nat_cnt); } @@ -2098,7 +2268,8 @@ static int init_node_manager(struct f2fs_sb_info *sbi) mutex_init(&nm_i->build_lock); spin_lock_init(&nm_i->free_nid_list_lock); - init_rwsem(&nm_i->nat_tree_lock); + if (percpu_init_rwsem(&nm_i->nat_tree_lock)) + return -ENOMEM; nm_i->next_scan_nid = le32_to_cpu(sbi->ckpt->next_free_nid); nm_i->bitmap_size = __bitmap_size(sbi, NAT_BITMAP); @@ -2155,7 +2326,7 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) spin_unlock(&nm_i->free_nid_list_lock); /* destroy nat cache */ - down_write(&nm_i->nat_tree_lock); + percpu_down_write(&nm_i->nat_tree_lock); while ((found = __gang_lookup_nat_cache(nm_i, nid, NATVEC_SIZE, natvec))) { unsigned idx; @@ -2180,8 +2351,9 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) kmem_cache_free(nat_entry_set_slab, setvec[idx]); } } - up_write(&nm_i->nat_tree_lock); + percpu_up_write(&nm_i->nat_tree_lock); + percpu_free_rwsem(&nm_i->nat_tree_lock); kfree(nm_i->nat_bitmap); sbi->nm_info = NULL; kfree(nm_i); diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 1f4f9d4569d9..fc7684554b1a 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -15,18 +15,21 @@ #define NAT_BLOCK_OFFSET(start_nid) (start_nid / NAT_ENTRY_PER_BLOCK) /* # of pages to perform synchronous readahead before building free nids */ -#define FREE_NID_PAGES 4 +#define FREE_NID_PAGES 8 +#define MAX_FREE_NIDS (NAT_ENTRY_PER_BLOCK * FREE_NID_PAGES) -#define DEF_RA_NID_PAGES 4 /* # of nid pages to be readaheaded */ +#define DEF_RA_NID_PAGES 0 /* # of nid pages to be readaheaded */ /* maximum readahead size for node during getting data blocks */ #define MAX_RA_NODE 128 /* control the memory footprint threshold (10MB per 1GB ram) */ -#define DEF_RAM_THRESHOLD 10 +#define DEF_RAM_THRESHOLD 1 /* control dirty nats ratio threshold (default: 10% over max nid count) */ #define DEF_DIRTY_NAT_RATIO_THRESHOLD 10 +/* control total # of nats */ +#define DEF_NAT_CACHE_THRESHOLD 100000 
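/*
 * Illustrative sketch, not part of the patch: the nat_tree_lock hunks in
 * node.c above replace a plain rw_semaphore with a percpu rwsem, which keeps
 * readers on a cheap per-CPU fast path while writers (checkpoint's
 * flush_nat_entries()) quiesce all readers before proceeding. Assuming
 * <linux/percpu-rwsem.h>, the usage pattern is:
 *
 *	struct percpu_rw_semaphore lock;
 *
 *	if (percpu_init_rwsem(&lock))		// allocates per-CPU state, may fail
 *		return -ENOMEM;
 *	percpu_down_read(&lock);		// scalable reader side
 *	// ... look up the NAT cache ...
 *	percpu_up_read(&lock);
 *	percpu_down_write(&lock);		// waits for all readers to drain
 *	// ... flush dirty NAT entries ...
 *	percpu_up_write(&lock);
 *	percpu_free_rwsem(&lock);		// pairs with percpu_init_rwsem()
 */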
/* vector size for gang look-up from nat cache that consists of radix tree */ #define NATVEC_SIZE 64 @@ -126,6 +129,11 @@ static inline bool excess_dirty_nats(struct f2fs_sb_info *sbi) NM_I(sbi)->dirty_nats_ratio / 100; } +static inline bool excess_cached_nats(struct f2fs_sb_info *sbi) +{ + return NM_I(sbi)->nat_cnt >= DEF_NAT_CACHE_THRESHOLD; +} + enum mem_type { FREE_NIDS, /* indicates the free nid list */ NAT_ENTRIES, /* indicates the cached nat entry */ diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 011942f94d64..9e652d5a659b 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -49,8 +49,9 @@ static struct kmem_cache *fsync_entry_slab; bool space_for_roll_forward(struct f2fs_sb_info *sbi) { - if (sbi->last_valid_block_count + sbi->alloc_valid_block_count - > sbi->user_block_count) + s64 nalloc = percpu_counter_sum_positive(&sbi->alloc_valid_block_count); + + if (sbi->last_valid_block_count + nalloc > sbi->user_block_count) return false; return true; } @@ -67,7 +68,30 @@ static struct fsync_inode_entry *get_fsync_inode(struct list_head *head, return NULL; } -static int recover_dentry(struct inode *inode, struct page *ipage) +static struct fsync_inode_entry *add_fsync_inode(struct list_head *head, + struct inode *inode) +{ + struct fsync_inode_entry *entry; + + entry = kmem_cache_alloc(fsync_entry_slab, GFP_F2FS_ZERO); + if (!entry) + return NULL; + + entry->inode = inode; + list_add_tail(&entry->list, head); + + return entry; +} + +static void del_fsync_inode(struct fsync_inode_entry *entry) +{ + iput(entry->inode); + list_del(&entry->list); + kmem_cache_free(fsync_entry_slab, entry); +} + +static int recover_dentry(struct inode *inode, struct page *ipage, + struct list_head *dir_list) { struct f2fs_inode *raw_inode = F2FS_INODE(ipage); nid_t pino = le32_to_cpu(raw_inode->i_pino); @@ -75,18 +99,29 @@ static int recover_dentry(struct inode *inode, struct page *ipage) struct qstr name; struct page *page; struct inode *dir, *einode; + struct fsync_inode_entry *entry; int err = 0; - dir = f2fs_iget(inode->i_sb, pino); - if (IS_ERR(dir)) { - err = PTR_ERR(dir); - goto out; + entry = get_fsync_inode(dir_list, pino); + if (!entry) { + dir = f2fs_iget(inode->i_sb, pino); + if (IS_ERR(dir)) { + err = PTR_ERR(dir); + goto out; + } + + entry = add_fsync_inode(dir_list, dir); + if (!entry) { + err = -ENOMEM; + iput(dir); + goto out; + } } - if (file_enc_name(inode)) { - iput(dir); + dir = entry->inode; + + if (file_enc_name(inode)) return 0; - } name.len = le32_to_cpu(raw_inode->i_namelen); name.name = raw_inode->i_name; @@ -94,7 +129,7 @@ static int recover_dentry(struct inode *inode, struct page *ipage) if (unlikely(name.len > F2FS_NAME_LEN)) { WARN_ON(1); err = -ENAMETOOLONG; - goto out_err; + goto out; } retry: de = f2fs_find_entry(dir, &name, &page); @@ -118,25 +153,17 @@ retry: f2fs_delete_entry(de, page, dir, einode); iput(einode); goto retry; - } - err = __f2fs_add_link(dir, &name, inode, inode->i_ino, inode->i_mode); - if (err) - goto out_err; - - if (is_inode_flag_set(F2FS_I(dir), FI_DELAY_IPUT)) { - iput(dir); + } else if (IS_ERR(page)) { + err = PTR_ERR(page); } else { - add_dirty_dir_inode(dir); - set_inode_flag(F2FS_I(dir), FI_DELAY_IPUT); + err = __f2fs_add_link(dir, &name, inode, + inode->i_ino, inode->i_mode); } - goto out; out_unmap_put: f2fs_dentry_kunmap(dir, page); f2fs_put_page(page, 0); -out_err: - iput(dir); out: f2fs_msg(inode->i_sb, KERN_NOTICE, "%s: ino = %x, name = %s, dir = %lx, err = %d", @@ -151,7 +178,7 @@ static void recover_inode(struct inode 
*inode, struct page *page) char *name; inode->i_mode = le16_to_cpu(raw->i_mode); - i_size_write(inode, le64_to_cpu(raw->i_size)); + f2fs_i_size_write(inode, le64_to_cpu(raw->i_size)); inode->i_atime.tv_sec = le64_to_cpu(raw->i_mtime); inode->i_ctime.tv_sec = le64_to_cpu(raw->i_ctime); inode->i_mtime.tv_sec = le64_to_cpu(raw->i_mtime); @@ -198,6 +225,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) { unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi)); struct curseg_info *curseg; + struct inode *inode; struct page *page = NULL; block_t blkaddr; int err = 0; @@ -206,8 +234,6 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); - ra_meta_pages(sbi, blkaddr, 1, META_POR, true); - while (1) { struct fsync_inode_entry *entry; @@ -233,35 +259,32 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) break; } - /* add this fsync inode to the list */ - entry = kmem_cache_alloc(fsync_entry_slab, GFP_F2FS_ZERO); - if (!entry) { - err = -ENOMEM; - break; - } /* * CP | dnode(F) | inode(DF) * For this case, we should not give up now. */ - entry->inode = f2fs_iget(sbi->sb, ino_of_node(page)); - if (IS_ERR(entry->inode)) { - err = PTR_ERR(entry->inode); - kmem_cache_free(fsync_entry_slab, entry); + inode = f2fs_iget(sbi->sb, ino_of_node(page)); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); if (err == -ENOENT) { err = 0; goto next; } break; } - list_add_tail(&entry->list, head); + + /* add this fsync inode to the list */ + entry = add_fsync_inode(head, inode); + if (!entry) { + err = -ENOMEM; + iput(inode); + break; + } } entry->blkaddr = blkaddr; - if (IS_INODE(page)) { - entry->last_inode = blkaddr; - if (is_dent_dnode(page)) - entry->last_dentry = blkaddr; - } + if (IS_INODE(page) && is_dent_dnode(page)) + entry->last_dentry = blkaddr; next: /* check next segment */ blkaddr = next_blkaddr_of_node(page); @@ -277,11 +300,8 @@ static void destroy_fsync_dnodes(struct list_head *head) { struct fsync_inode_entry *entry, *tmp; - list_for_each_entry_safe(entry, tmp, head, list) { - iput(entry->inode); - list_del(&entry->list); - kmem_cache_free(fsync_entry_slab, entry); - } + list_for_each_entry_safe(entry, tmp, head, list) + del_fsync_inode(entry); } static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, @@ -438,14 +458,16 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, continue; } + if ((start + 1) << PAGE_SHIFT > i_size_read(inode)) + f2fs_i_size_write(inode, (start + 1) << PAGE_SHIFT); + /* * dest is reserved block, invalidate src block * and then reserve one new block in dnode page. 
*/ if (dest == NEW_ADDR) { truncate_data_blocks_range(&dn, 1); - err = reserve_new_block(&dn); - f2fs_bug_on(sbi, err); + reserve_new_block(&dn); continue; } @@ -454,8 +476,14 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, if (src == NULL_ADDR) { err = reserve_new_block(&dn); +#ifdef CONFIG_F2FS_FAULT_INJECTION + while (err) + err = reserve_new_block(&dn); +#endif /* We should not get -ENOSPC */ f2fs_bug_on(sbi, err); + if (err) + goto err; } /* Check the previous node page having this index */ @@ -470,9 +498,6 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, } } - if (IS_INODE(dn.node_page)) - sync_inode_page(&dn); - copy_node_footer(dn.node_page, page); fill_node_footer(dn.node_page, dn.nid, ni.ino, ofs_of_node(page), false); @@ -486,7 +511,8 @@ out: return err; } -static int recover_data(struct f2fs_sb_info *sbi, struct list_head *head) +static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list, + struct list_head *dir_list) { unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi)); struct curseg_info *curseg; @@ -513,7 +539,7 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *head) break; } - entry = get_fsync_inode(head, ino_of_node(page)); + entry = get_fsync_inode(inode_list, ino_of_node(page)); if (!entry) goto next; /* @@ -521,10 +547,10 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *head) * In this case, we can lose the latest inode(x). * So, call recover_inode for the inode update. */ - if (entry->last_inode == blkaddr) + if (IS_INODE(page)) recover_inode(entry->inode, page); if (entry->last_dentry == blkaddr) { - err = recover_dentry(entry->inode, page); + err = recover_dentry(entry->inode, page, dir_list); if (err) { f2fs_put_page(page, 1); break; @@ -536,11 +562,8 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *head) break; } - if (entry->blkaddr == blkaddr) { - iput(entry->inode); - list_del(&entry->list); - kmem_cache_free(fsync_entry_slab, entry); - } + if (entry->blkaddr == blkaddr) + del_fsync_inode(entry); next: /* check next segment */ blkaddr = next_blkaddr_of_node(page); @@ -551,12 +574,14 @@ next: return err; } -int recover_fsync_data(struct f2fs_sb_info *sbi) +int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) { struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); struct list_head inode_list; + struct list_head dir_list; block_t blkaddr; int err; + int ret = 0; bool need_writecp = false; fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry", @@ -565,6 +590,7 @@ int recover_fsync_data(struct f2fs_sb_info *sbi) return -ENOMEM; INIT_LIST_HEAD(&inode_list); + INIT_LIST_HEAD(&dir_list); /* prevent checkpoint */ mutex_lock(&sbi->cp_mutex); @@ -573,21 +599,22 @@ int recover_fsync_data(struct f2fs_sb_info *sbi) /* step #1: find fsynced inode numbers */ err = find_fsync_dnodes(sbi, &inode_list); - if (err) + if (err || list_empty(&inode_list)) goto out; - if (list_empty(&inode_list)) + if (check_only) { + ret = 1; goto out; + } need_writecp = true; /* step #2: recover data */ - err = recover_data(sbi, &inode_list); + err = recover_data(sbi, &inode_list, &dir_list); if (!err) f2fs_bug_on(sbi, !list_empty(&inode_list)); out: destroy_fsync_dnodes(&inode_list); - kmem_cache_destroy(fsync_entry_slab); /* truncate meta pages to be used by the recovery */ truncate_inode_pages_range(META_MAPPING(sbi), @@ -602,8 +629,12 @@ out: if (err) { bool invalidate = false; - if (discard_next_dnode(sbi, 
blkaddr)) + if (test_opt(sbi, LFS)) { + update_meta_page(sbi, NULL, blkaddr); + invalidate = true; + } else if (discard_next_dnode(sbi, blkaddr)) { invalidate = true; + } /* Flush all the NAT/SIT pages */ while (get_pages(sbi, F2FS_DIRTY_META)) @@ -625,5 +656,8 @@ out: } else { mutex_unlock(&sbi->cp_mutex); } - return err; + + destroy_fsync_dnodes(&dir_list); + kmem_cache_destroy(fsync_entry_slab); + return ret ? ret: err; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 540669d6978e..a46296f57b02 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -223,9 +223,11 @@ static int __revoke_inmem_pages(struct inode *inode, f2fs_put_dnode(&dn); } next: - ClearPageUptodate(page); + /* we don't need to invalidate this in the successful status */ + if (drop || recover) + ClearPageUptodate(page); set_page_private(page, 0); - ClearPageUptodate(page); + ClearPagePrivate(page); f2fs_put_page(page, 1); list_del(&cur->list); @@ -239,6 +241,8 @@ void drop_inmem_pages(struct inode *inode) { struct f2fs_inode_info *fi = F2FS_I(inode); + clear_inode_flag(inode, FI_ATOMIC_FILE); + mutex_lock(&fi->inmem_lock); __revoke_inmem_pages(inode, &fi->inmem_pages, true, false); mutex_unlock(&fi->inmem_lock); @@ -253,7 +257,8 @@ static int __commit_inmem_pages(struct inode *inode, struct f2fs_io_info fio = { .sbi = sbi, .type = DATA, - .rw = WRITE_SYNC | REQ_PRIO, + .op = REQ_OP_WRITE, + .op_flags = WRITE_SYNC | REQ_PRIO, .encrypted_page = NULL, }; bool submit_bio = false; @@ -341,6 +346,11 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) { if (!need) return; + + /* balance_fs_bg is able to be pending */ + if (excess_cached_nats(sbi)) + f2fs_balance_fs_bg(sbi); + /* * We should do GC or end up with checkpoint, if there are so many dirty * dir/node pages without enough free segments. 
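/*
 * Illustrative sketch, not from the patch: the recurring .rw -> .op/.op_flags
 * conversions in this series follow the 4.8 block-layer split of the request
 * operation (REQ_OP_READ, REQ_OP_WRITE, ...) from its modifier flags, so the
 * rw argument disappears from the submission calls. Mirroring the flush code
 * in f2fs_issue_flush() below, a submitter now does roughly:
 *
 *	struct bio *bio = f2fs_bio_alloc(0);
 *
 *	bio->bi_bdev = sbi->sb->s_bdev;
 *	bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH);	// op + flags live on the bio
 *	ret = submit_bio_wait(bio);				// no rw argument anymore
 */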
@@ -362,7 +372,9 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK); if (!available_free_memory(sbi, FREE_NIDS)) - try_to_free_nids(sbi, NAT_ENTRY_PER_BLOCK * FREE_NID_PAGES); + try_to_free_nids(sbi, MAX_FREE_NIDS); + else + build_free_nids(sbi); /* checkpoint is the only way to shrink partial cached entries */ if (!available_free_memory(sbi, NAT_ENTRIES) || @@ -402,7 +414,8 @@ repeat: fcc->dispatch_list = llist_reverse_order(fcc->dispatch_list); bio->bi_bdev = sbi->sb->s_bdev; - ret = submit_bio_wait(WRITE_FLUSH, bio); + bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH); + ret = submit_bio_wait(bio); llist_for_each_entry_safe(cmd, next, fcc->dispatch_list, llnode) { @@ -429,24 +442,29 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi) if (test_opt(sbi, NOBARRIER)) return 0; - if (!test_opt(sbi, FLUSH_MERGE)) { + if (!test_opt(sbi, FLUSH_MERGE) || !atomic_read(&fcc->submit_flush)) { struct bio *bio = f2fs_bio_alloc(0); int ret; + atomic_inc(&fcc->submit_flush); bio->bi_bdev = sbi->sb->s_bdev; - ret = submit_bio_wait(WRITE_FLUSH, bio); + bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH); + ret = submit_bio_wait(bio); + atomic_dec(&fcc->submit_flush); bio_put(bio); return ret; } init_completion(&cmd.wait); + atomic_inc(&fcc->submit_flush); llist_add(&cmd.llnode, &fcc->issue_list); if (!fcc->dispatch_list) wake_up(&fcc->flush_wait_queue); wait_for_completion(&cmd.wait); + atomic_dec(&fcc->submit_flush); return cmd.ret; } @@ -460,6 +478,7 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi) fcc = kzalloc(sizeof(struct flush_cmd_control), GFP_KERNEL); if (!fcc) return -ENOMEM; + atomic_set(&fcc->submit_flush, 0); init_waitqueue_head(&fcc->flush_wait_queue); init_llist_head(&fcc->issue_list); SM_I(sbi)->cmd_control_info = fcc; @@ -661,6 +680,10 @@ static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc) break; end = __find_rev_next_zero_bit(dmap, max_blocks, start + 1); + if (force && start && end != max_blocks + && (end - start) < cpc->trim_minlen) + continue; + __add_discard_entry(sbi, cpc, se, start, end); } } @@ -698,6 +721,8 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); unsigned long *prefree_map = dirty_i->dirty_segmap[PRE]; unsigned int start = 0, end = -1; + unsigned int secno, start_segno; + bool force = (cpc->reason == CP_DISCARD); mutex_lock(&dirty_i->seglist_lock); @@ -714,17 +739,31 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) dirty_i->nr_dirty[PRE] -= end - start; - if (!test_opt(sbi, DISCARD)) + if (force || !test_opt(sbi, DISCARD)) continue; - f2fs_issue_discard(sbi, START_BLOCK(sbi, start), + if (!test_opt(sbi, LFS) || sbi->segs_per_sec == 1) { + f2fs_issue_discard(sbi, START_BLOCK(sbi, start), (end - start) << sbi->log_blocks_per_seg); + continue; + } +next: + secno = GET_SECNO(sbi, start); + start_segno = secno * sbi->segs_per_sec; + if (!IS_CURSEC(sbi, secno) && + !get_valid_blocks(sbi, start, sbi->segs_per_sec)) + f2fs_issue_discard(sbi, START_BLOCK(sbi, start_segno), + sbi->segs_per_sec << sbi->log_blocks_per_seg); + + start = start_segno + sbi->segs_per_sec; + if (start < end) + goto next; } mutex_unlock(&dirty_i->seglist_lock); /* send small discards */ list_for_each_entry_safe(entry, this, head, list) { - if (cpc->reason == CP_DISCARD && entry->len < cpc->trim_minlen) + if (force && entry->len < cpc->trim_minlen) goto skip; f2fs_issue_discard(sbi, entry->blkaddr, entry->len); 
cpc->trimmed += entry->len; @@ -1212,6 +1251,9 @@ void allocate_new_segments(struct f2fs_sb_info *sbi) { int i; + if (test_opt(sbi, LFS)) + return; + for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) __allocate_new_segments(sbi, i); } @@ -1385,11 +1427,17 @@ static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) { int type = __get_segment_type(fio->page, fio->type); + if (fio->type == NODE || fio->type == DATA) + mutex_lock(&fio->sbi->wio_mutex[fio->type]); + allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr, &fio->new_blkaddr, sum, type); /* writeout dirty page into bdev */ f2fs_submit_page_mbio(fio); + + if (fio->type == NODE || fio->type == DATA) + mutex_unlock(&fio->sbi->wio_mutex[fio->type]); } void write_meta_page(struct f2fs_sb_info *sbi, struct page *page) @@ -1397,7 +1445,8 @@ struct f2fs_io_info fio = { .sbi = sbi, .type = META, - .rw = WRITE_SYNC | REQ_META | REQ_PRIO, + .op = REQ_OP_WRITE, + .op_flags = WRITE_SYNC | REQ_META | REQ_PRIO, .old_blkaddr = page->index, .new_blkaddr = page->index, .page = page, @@ -1405,7 +1454,7 @@ }; if (unlikely(page->index >= MAIN_BLKADDR(sbi))) - fio.rw &= ~REQ_META; + fio.op_flags &= ~REQ_META; set_page_writeback(page); f2fs_submit_page_mbio(&fio); @@ -2369,7 +2418,11 @@ int build_segment_manager(struct f2fs_sb_info *sbi) sm_info->ssa_blkaddr = le32_to_cpu(raw_super->ssa_blkaddr); sm_info->rec_prefree_segments = sm_info->main_segments * DEF_RECLAIM_PREFREE_SEGMENTS / 100; - sm_info->ipu_policy = 1 << F2FS_IPU_FSYNC; + if (sm_info->rec_prefree_segments > DEF_MAX_RECLAIM_PREFREE_SEGMENTS) + sm_info->rec_prefree_segments = DEF_MAX_RECLAIM_PREFREE_SEGMENTS; + + if (!test_opt(sbi, LFS)) + sm_info->ipu_policy = 1 << F2FS_IPU_FSYNC; sm_info->min_ipu_util = DEF_MIN_IPU_UTIL; sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS; diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 975c33df65c7..b33f73ec60a4 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -16,6 +16,7 @@ #define NULL_SECNO ((unsigned int)(~0)) #define DEF_RECLAIM_PREFREE_SEGMENTS 5 /* 5% over total segments */ +#define DEF_MAX_RECLAIM_PREFREE_SEGMENTS 4096 /* 8GB in maximum */ /* L: Logical segment # in volume, R: Relative segment # in main area */ #define GET_L2R_SEGNO(free_i, segno) (segno - free_i->start_segno) @@ -158,16 +159,17 @@ struct victim_sel_policy { }; struct seg_entry { - unsigned short valid_blocks; /* # of valid blocks */ + unsigned int type:6; /* segment type like CURSEG_XXX_TYPE */ + unsigned int valid_blocks:10; /* # of valid blocks */ + unsigned int ckpt_valid_blocks:10; /* # of valid blocks last cp */ + unsigned int padding:6; /* padding */ unsigned char *cur_valid_map; /* validity bitmap of blocks */ /* * # of valid blocks and the validity bitmap stored in the last * checkpoint pack. This information is used by the SSR mode. 
*/ - unsigned short ckpt_valid_blocks; - unsigned char *ckpt_valid_map; + unsigned char *ckpt_valid_map; /* validity bitmap of blocks last cp */ unsigned char *discard_map; - unsigned char type; /* segment type like CURSEG_XXX_TYPE */ unsigned long long mtime; /* modification time of the segment */ }; @@ -469,6 +471,10 @@ static inline bool need_SSR(struct f2fs_sb_info *sbi) { int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); + + if (test_opt(sbi, LFS)) + return false; + return free_sections(sbi) <= (node_secs + 2 * dent_secs + reserved_sections(sbi) + 1); } @@ -478,6 +484,8 @@ static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed) int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); + node_secs += get_blocktype_secs(sbi, F2FS_DIRTY_IMETA); + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) return false; @@ -530,6 +538,9 @@ static inline bool need_inplace_update(struct inode *inode) if (S_ISDIR(inode->i_mode) || f2fs_is_atomic_file(inode)) return false; + if (test_opt(sbi, LFS)) + return false; + if (policy & (0x1 << F2FS_IPU_FORCE)) return true; if (policy & (0x1 << F2FS_IPU_SSR) && need_SSR(sbi)) @@ -543,7 +554,7 @@ static inline bool need_inplace_update(struct inode *inode) /* this is only set during fdatasync */ if (policy & (0x1 << F2FS_IPU_FSYNC) && - is_inode_flag_set(F2FS_I(inode), FI_NEED_IPU)) + is_inode_flag_set(inode, FI_NEED_IPU)) return true; return false; @@ -705,9 +716,9 @@ static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type) if (type == DATA) return sbi->blocks_per_seg; else if (type == NODE) - return 3 * sbi->blocks_per_seg; + return 8 * sbi->blocks_per_seg; else if (type == META) - return MAX_BIO_BLOCKS(sbi); + return 8 * MAX_BIO_BLOCKS(sbi); else return 0; } @@ -725,10 +736,8 @@ static inline long nr_pages_to_write(struct f2fs_sb_info *sbi, int type, nr_to_write = wbc->nr_to_write; - if (type == DATA) - desired = 4096; - else if (type == NODE) - desired = 3 * max_hw_blocks(sbi); + if (type == NODE) + desired = 2 * max_hw_blocks(sbi); else desired = MAX_BIO_BLOCKS(sbi); diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c index 93606f281bf9..46c915425923 100644 --- a/fs/f2fs/shrinker.c +++ b/fs/f2fs/shrinker.c @@ -13,6 +13,7 @@ #include <linux/f2fs_fs.h> #include "f2fs.h" +#include "node.h" static LIST_HEAD(f2fs_list); static DEFINE_SPINLOCK(f2fs_list_lock); @@ -25,8 +26,8 @@ static unsigned long __count_nat_entries(struct f2fs_sb_info *sbi) static unsigned long __count_free_nids(struct f2fs_sb_info *sbi) { - if (NM_I(sbi)->fcnt > NAT_ENTRY_PER_BLOCK) - return NM_I(sbi)->fcnt - NAT_ENTRY_PER_BLOCK; + if (NM_I(sbi)->fcnt > MAX_FREE_NIDS) + return NM_I(sbi)->fcnt - MAX_FREE_NIDS; return 0; } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 006f87d69921..1b86d3f638ef 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -39,6 +39,31 @@ static struct proc_dir_entry *f2fs_proc_root; static struct kmem_cache *f2fs_inode_cachep; static struct kset *f2fs_kset; +#ifdef CONFIG_F2FS_FAULT_INJECTION +struct f2fs_fault_info f2fs_fault; + +char *fault_name[FAULT_MAX] = { + [FAULT_KMALLOC] = "kmalloc", + [FAULT_PAGE_ALLOC] = "page alloc", + [FAULT_ALLOC_NID] = "alloc nid", + [FAULT_ORPHAN] = "orphan", + [FAULT_BLOCK] = "no more block", + [FAULT_DIR_DEPTH] = "too big dir depth", + [FAULT_EVICT_INODE] = "evict_inode fail", +}; + +static void f2fs_build_fault_attr(unsigned int rate) +{ + if (rate) { + 
atomic_set(&f2fs_fault.inject_ops, 0); + f2fs_fault.inject_rate = rate; + f2fs_fault.inject_type = (1 << FAULT_MAX) - 1; + } else { + memset(&f2fs_fault, 0, sizeof(struct f2fs_fault_info)); + } +} +#endif + /* f2fs-wide shrinker description */ static struct shrinker f2fs_shrinker_info = { .scan_objects = f2fs_shrink_scan, @@ -51,6 +76,7 @@ enum { Opt_disable_roll_forward, Opt_norecovery, Opt_discard, + Opt_nodiscard, Opt_noheap, Opt_user_xattr, Opt_nouser_xattr, @@ -62,12 +88,17 @@ enum { Opt_inline_data, Opt_inline_dentry, Opt_flush_merge, + Opt_noflush_merge, Opt_nobarrier, Opt_fastboot, Opt_extent_cache, Opt_noextent_cache, Opt_noinline_data, Opt_data_flush, + Opt_mode, + Opt_fault_injection, + Opt_lazytime, + Opt_nolazytime, Opt_err, }; @@ -76,6 +107,7 @@ static match_table_t f2fs_tokens = { {Opt_disable_roll_forward, "disable_roll_forward"}, {Opt_norecovery, "norecovery"}, {Opt_discard, "discard"}, + {Opt_nodiscard, "nodiscard"}, {Opt_noheap, "no_heap"}, {Opt_user_xattr, "user_xattr"}, {Opt_nouser_xattr, "nouser_xattr"}, @@ -87,12 +119,17 @@ static match_table_t f2fs_tokens = { {Opt_inline_data, "inline_data"}, {Opt_inline_dentry, "inline_dentry"}, {Opt_flush_merge, "flush_merge"}, + {Opt_noflush_merge, "noflush_merge"}, {Opt_nobarrier, "nobarrier"}, {Opt_fastboot, "fastboot"}, {Opt_extent_cache, "extent_cache"}, {Opt_noextent_cache, "noextent_cache"}, {Opt_noinline_data, "noinline_data"}, {Opt_data_flush, "data_flush"}, + {Opt_mode, "mode=%s"}, + {Opt_fault_injection, "fault_injection=%u"}, + {Opt_lazytime, "lazytime"}, + {Opt_nolazytime, "nolazytime"}, {Opt_err, NULL}, }; @@ -102,6 +139,10 @@ enum { SM_INFO, /* struct f2fs_sm_info */ NM_INFO, /* struct f2fs_nm_info */ F2FS_SBI, /* struct f2fs_sb_info */ +#ifdef CONFIG_F2FS_FAULT_INJECTION + FAULT_INFO_RATE, /* struct f2fs_fault_info */ + FAULT_INFO_TYPE, /* struct f2fs_fault_info */ +#endif }; struct f2fs_attr { @@ -123,6 +164,11 @@ static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type) return (unsigned char *)NM_I(sbi); else if (struct_type == F2FS_SBI) return (unsigned char *)sbi; +#ifdef CONFIG_F2FS_FAULT_INJECTION + else if (struct_type == FAULT_INFO_RATE || + struct_type == FAULT_INFO_TYPE) + return (unsigned char *)&f2fs_fault; +#endif return NULL; } @@ -172,6 +218,10 @@ static ssize_t f2fs_sbi_store(struct f2fs_attr *a, ret = kstrtoul(skip_spaces(buf), 0, &t); if (ret < 0) return ret; +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (a->struct_type == FAULT_INFO_TYPE && t >= (1 << FAULT_MAX)) + return -EINVAL; +#endif *ui = t; return count; } @@ -237,6 +287,10 @@ F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, cp_interval, interval_time[CP_TIME]); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, idle_interval, interval_time[REQ_TIME]); +#ifdef CONFIG_F2FS_FAULT_INJECTION +F2FS_RW_ATTR(FAULT_INFO_RATE, f2fs_fault_info, inject_rate, inject_rate); +F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type); +#endif F2FS_GENERAL_RO_ATTR(lifetime_write_kbytes); #define ATTR_LIST(name) (&f2fs_attr_##name.attr) @@ -273,6 +327,22 @@ static struct kobj_type f2fs_ktype = { .release = f2fs_sb_release, }; +#ifdef CONFIG_F2FS_FAULT_INJECTION +/* sysfs for f2fs fault injection */ +static struct kobject f2fs_fault_inject; + +static struct attribute *f2fs_fault_attrs[] = { + ATTR_LIST(inject_rate), + ATTR_LIST(inject_type), + NULL +}; + +static struct kobj_type f2fs_fault_ktype = { + .default_attrs = 
f2fs_fault_attrs, + .sysfs_ops = &f2fs_attr_ops, +}; +#endif + void f2fs_msg(struct super_block *sb, const char *level, const char *fmt, ...) { struct va_format vaf; @@ -300,6 +370,10 @@ static int parse_options(struct super_block *sb, char *options) char *p, *name; int arg = 0; +#ifdef CONFIG_F2FS_FAULT_INJECTION + f2fs_build_fault_attr(0); +#endif + if (!options) return 0; @@ -354,6 +428,9 @@ static int parse_options(struct super_block *sb, char *options) "the device does not support discard"); } break; + case Opt_nodiscard: + clear_opt(sbi, DISCARD); + break; case Opt_noheap: set_opt(sbi, NOHEAP); break; @@ -415,6 +491,9 @@ static int parse_options(struct super_block *sb, char *options) case Opt_flush_merge: set_opt(sbi, FLUSH_MERGE); break; + case Opt_noflush_merge: + clear_opt(sbi, FLUSH_MERGE); + break; case Opt_nobarrier: set_opt(sbi, NOBARRIER); break; @@ -433,6 +512,39 @@ static int parse_options(struct super_block *sb, char *options) case Opt_data_flush: set_opt(sbi, DATA_FLUSH); break; + case Opt_mode: + name = match_strdup(&args[0]); + + if (!name) + return -ENOMEM; + if (strlen(name) == 8 && + !strncmp(name, "adaptive", 8)) { + set_opt_mode(sbi, F2FS_MOUNT_ADAPTIVE); + } else if (strlen(name) == 3 && + !strncmp(name, "lfs", 3)) { + set_opt_mode(sbi, F2FS_MOUNT_LFS); + } else { + kfree(name); + return -EINVAL; + } + kfree(name); + break; + case Opt_fault_injection: + if (args->from && match_int(args, &arg)) + return -EINVAL; +#ifdef CONFIG_F2FS_FAULT_INJECTION + f2fs_build_fault_attr(arg); +#else + f2fs_msg(sb, KERN_INFO, + "FAULT_INJECTION was not selected"); +#endif + break; + case Opt_lazytime: + sb->s_flags |= MS_LAZYTIME; + break; + case Opt_nolazytime: + sb->s_flags &= ~MS_LAZYTIME; + break; default: f2fs_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" or missing value", @@ -453,20 +565,22 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) init_once((void *) fi); + if (percpu_counter_init(&fi->dirty_pages, 0, GFP_NOFS)) { + kmem_cache_free(f2fs_inode_cachep, fi); + return NULL; + } + /* Initialize f2fs-specific inode info */ fi->vfs_inode.i_version = 1; - atomic_set(&fi->dirty_pages, 0); fi->i_current_depth = 1; fi->i_advise = 0; init_rwsem(&fi->i_sem); INIT_LIST_HEAD(&fi->dirty_list); + INIT_LIST_HEAD(&fi->gdirty_list); INIT_LIST_HEAD(&fi->inmem_pages); mutex_init(&fi->inmem_lock); - - set_inode_flag(fi, FI_NEW_INODE); - - if (test_opt(F2FS_SB(sb), INLINE_XATTR)) - set_inode_flag(fi, FI_INLINE_XATTR); + init_rwsem(&fi->dio_rwsem[READ]); + init_rwsem(&fi->dio_rwsem[WRITE]); /* Will be used by directory only */ fi->i_dir_level = F2FS_SB(sb)->dir_level; @@ -482,7 +596,7 @@ static int f2fs_drop_inode(struct inode *inode) * - f2fs_gc -> iput -> evict * - inode_wait_for_writeback(inode) */ - if (!inode_unhashed(inode) && inode->i_state & I_SYNC) { + if ((!inode_unhashed(inode) && inode->i_state & I_SYNC)) { if (!inode->i_nlink && !is_bad_inode(inode)) { /* to avoid evict_inode call simultaneously */ atomic_inc(&inode->i_count); @@ -496,10 +610,10 @@ static int f2fs_drop_inode(struct inode *inode) f2fs_destroy_extent_node(inode); sb_start_intwrite(inode->i_sb); - i_size_write(inode, 0); + f2fs_i_size_write(inode, 0); if (F2FS_HAS_BLOCKS(inode)) - f2fs_truncate(inode, true); + f2fs_truncate(inode); sb_end_intwrite(inode->i_sb); @@ -509,9 +623,47 @@ static int f2fs_drop_inode(struct inode *inode) } return 0; } + return generic_drop_inode(inode); } +int f2fs_inode_dirtied(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + 
spin_lock(&sbi->inode_lock[DIRTY_META]); + if (is_inode_flag_set(inode, FI_DIRTY_INODE)) { + spin_unlock(&sbi->inode_lock[DIRTY_META]); + return 1; + } + + set_inode_flag(inode, FI_DIRTY_INODE); + list_add_tail(&F2FS_I(inode)->gdirty_list, + &sbi->inode_list[DIRTY_META]); + inc_page_count(sbi, F2FS_DIRTY_IMETA); + stat_inc_dirty_inode(sbi, DIRTY_META); + spin_unlock(&sbi->inode_lock[DIRTY_META]); + + return 0; +} + +void f2fs_inode_synced(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + spin_lock(&sbi->inode_lock[DIRTY_META]); + if (!is_inode_flag_set(inode, FI_DIRTY_INODE)) { + spin_unlock(&sbi->inode_lock[DIRTY_META]); + return; + } + list_del_init(&F2FS_I(inode)->gdirty_list); + clear_inode_flag(inode, FI_DIRTY_INODE); + clear_inode_flag(inode, FI_AUTO_RECOVER); + dec_page_count(sbi, F2FS_DIRTY_IMETA); + stat_dec_dirty_inode(F2FS_I_SB(inode), DIRTY_META); + spin_unlock(&sbi->inode_lock[DIRTY_META]); +} + /* * f2fs_dirty_inode() is called from __mark_inode_dirty() * @@ -519,7 +671,19 @@ static int f2fs_drop_inode(struct inode *inode) */ static void f2fs_dirty_inode(struct inode *inode, int flags) { - set_inode_flag(F2FS_I(inode), FI_DIRTY_INODE); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + if (inode->i_ino == F2FS_NODE_INO(sbi) || + inode->i_ino == F2FS_META_INO(sbi)) + return; + + if (flags == I_DIRTY_TIME) + return; + + if (is_inode_flag_set(inode, FI_AUTO_RECOVER)) + clear_inode_flag(inode, FI_AUTO_RECOVER); + + f2fs_inode_dirtied(inode); } static void f2fs_i_callback(struct rcu_head *head) @@ -530,15 +694,29 @@ static void f2fs_i_callback(struct rcu_head *head) static void f2fs_destroy_inode(struct inode *inode) { + percpu_counter_destroy(&F2FS_I(inode)->dirty_pages); call_rcu(&inode->i_rcu, f2fs_i_callback); } +static void destroy_percpu_info(struct f2fs_sb_info *sbi) +{ + int i; + + for (i = 0; i < NR_COUNT_TYPE; i++) + percpu_counter_destroy(&sbi->nr_pages[i]); + percpu_counter_destroy(&sbi->alloc_valid_block_count); + percpu_counter_destroy(&sbi->total_valid_inode_count); + + percpu_free_rwsem(&sbi->cp_rwsem); +} + static void f2fs_put_super(struct super_block *sb) { struct f2fs_sb_info *sbi = F2FS_SB(sb); if (sbi->s_proc) { remove_proc_entry("segment_info", sbi->s_proc); + remove_proc_entry("segment_bits", sbi->s_proc); remove_proc_entry(sb->s_id, f2fs_proc_root); } kobject_del(&sbi->s_kobj); @@ -568,15 +746,14 @@ static void f2fs_put_super(struct super_block *sb) * normally superblock is clean, so we need to release this. * In addition, EIO will skip do checkpoint, we need this as well. 
*/ - release_ino_entry(sbi); + release_ino_entry(sbi, true); release_discard_addrs(sbi); f2fs_leave_shrinker(sbi); mutex_unlock(&sbi->umount_mutex); /* our cp_error case, we can wait for any writeback page */ - if (get_pages(sbi, F2FS_WRITEBACK)) - f2fs_flush_merged_bios(sbi); + f2fs_flush_merged_bios(sbi); iput(sbi->node_inode); iput(sbi->meta_inode); @@ -593,6 +770,8 @@ static void f2fs_put_super(struct super_block *sb) if (sbi->s_chksum_driver) crypto_free_shash(sbi->s_chksum_driver); kfree(sbi->raw_super); + + destroy_percpu_info(sbi); kfree(sbi); } @@ -648,7 +827,7 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bsize = sbi->blocksize; buf->f_blocks = total_count - start_count; - buf->f_bfree = buf->f_blocks - valid_user_blocks(sbi) - ovp_count; + buf->f_bfree = user_block_count - valid_user_blocks(sbi) + ovp_count; buf->f_bavail = user_block_count - valid_user_blocks(sbi); buf->f_files = sbi->total_node_count - F2FS_RESERVED_NODE_NUM; @@ -713,6 +892,12 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",noextent_cache"); if (test_opt(sbi, DATA_FLUSH)) seq_puts(seq, ",data_flush"); + + seq_puts(seq, ",mode="); + if (test_opt(sbi, ADAPTIVE)) + seq_puts(seq, "adaptive"); + else if (test_opt(sbi, LFS)) + seq_puts(seq, "lfs"); seq_printf(seq, ",active_logs=%u", sbi->active_logs); return 0; @@ -745,19 +930,46 @@ static int segment_info_seq_show(struct seq_file *seq, void *offset) return 0; } -static int segment_info_open_fs(struct inode *inode, struct file *file) +static int segment_bits_seq_show(struct seq_file *seq, void *offset) { - return single_open(file, segment_info_seq_show, PDE_DATA(inode)); + struct super_block *sb = seq->private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + unsigned int total_segs = + le32_to_cpu(sbi->raw_super->segment_count_main); + int i, j; + + seq_puts(seq, "format: segment_type|valid_blocks|bitmaps\n" + "segment_type(0:HD, 1:WD, 2:CD, 3:HN, 4:WN, 5:CN)\n"); + + for (i = 0; i < total_segs; i++) { + struct seg_entry *se = get_seg_entry(sbi, i); + + seq_printf(seq, "%-10d", i); + seq_printf(seq, "%d|%-3u|", se->type, + get_valid_blocks(sbi, i, 1)); + for (j = 0; j < SIT_VBLOCK_MAP_SIZE; j++) + seq_printf(seq, "%x ", se->cur_valid_map[j]); + seq_putc(seq, '\n'); + } + return 0; } -static const struct file_operations f2fs_seq_segment_info_fops = { - .owner = THIS_MODULE, - .open = segment_info_open_fs, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, +#define F2FS_PROC_FILE_DEF(_name) \ +static int _name##_open_fs(struct inode *inode, struct file *file) \ +{ \ + return single_open(file, _name##_seq_show, PDE_DATA(inode)); \ +} \ + \ +static const struct file_operations f2fs_seq_##_name##_fops = { \ + .open = _name##_open_fs, \ + .read = seq_read, \ + .llseek = seq_lseek, \ + .release = single_release, \ }; +F2FS_PROC_FILE_DEF(segment_info); +F2FS_PROC_FILE_DEF(segment_bits); + static void default_options(struct f2fs_sb_info *sbi) { /* init some FS parameters */ @@ -766,6 +978,14 @@ static void default_options(struct f2fs_sb_info *sbi) set_opt(sbi, BG_GC); set_opt(sbi, INLINE_DATA); set_opt(sbi, EXTENT_CACHE); + sbi->sb->s_flags |= MS_LAZYTIME; + set_opt(sbi, FLUSH_MERGE); + if (f2fs_sb_mounted_hmsmr(sbi->sb)) { + set_opt_mode(sbi, F2FS_MOUNT_LFS); + set_opt(sbi, DISCARD); + } else { + set_opt_mode(sbi, F2FS_MOUNT_ADAPTIVE); + } #ifdef CONFIG_F2FS_FS_XATTR set_opt(sbi, XATTR_USER); @@ -791,13 +1011,15 @@ static int f2fs_remount(struct super_block *sb, int *flags, char 
*data) org_mount_opt = sbi->mount_opt; active_logs = sbi->active_logs; - if (*flags & MS_RDONLY) { - set_opt(sbi, FASTBOOT); - set_sbi_flag(sbi, SBI_IS_DIRTY); + /* recover superblocks we couldn't write due to previous RO mount */ + if (!(*flags & MS_RDONLY) && is_sbi_flag_set(sbi, SBI_NEED_SB_WRITE)) { + err = f2fs_commit_super(sbi, false); + f2fs_msg(sb, KERN_INFO, + "Try to recover all the superblocks, ret: %d", err); + if (!err) + clear_sbi_flag(sbi, SBI_NEED_SB_WRITE); } - sync_filesystem(sb); - sbi->mount_opt.opt = 0; default_options(sbi); @@ -829,7 +1051,6 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) if ((*flags & MS_RDONLY) || !test_opt(sbi, BG_GC)) { if (sbi->gc_thread) { stop_gc_thread(sbi); - f2fs_sync_fs(sb, 1); need_restart_gc = true; } } else if (!sbi->gc_thread) { @@ -839,6 +1060,16 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) need_stop_gc = true; } + if (*flags & MS_RDONLY) { + writeback_inodes_sb(sb, WB_REASON_SYNC); + sync_inodes_sb(sb); + + set_sbi_flag(sbi, SBI_IS_DIRTY); + set_sbi_flag(sbi, SBI_IS_CLOSE); + f2fs_sync_fs(sb, 1); + clear_sbi_flag(sbi, SBI_IS_CLOSE); + } + /* * We stop issue flush thread if FS is mounted as RO * or if flush_merge is not passed in mount option. @@ -852,8 +1083,9 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) } skip: /* Update the POSIXACL Flag */ - sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | + sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | (test_opt(sbi, POSIX_ACL) ? MS_POSIXACL : 0); + return 0; restore_gc: if (need_restart_gc) { @@ -893,6 +1125,12 @@ static int f2fs_get_context(struct inode *inode, void *ctx, size_t len) ctx, len, NULL); } +static int f2fs_key_prefix(struct inode *inode, u8 **key) +{ + *key = F2FS_I_SB(inode)->key_prefix; + return F2FS_I_SB(inode)->key_prefix_size; +} + static int f2fs_set_context(struct inode *inode, const void *ctx, size_t len, void *fs_data) { @@ -909,6 +1147,7 @@ static unsigned f2fs_max_namelen(struct inode *inode) static struct fscrypt_operations f2fs_cryptops = { .get_context = f2fs_get_context, + .key_prefix = f2fs_key_prefix, .set_context = f2fs_set_context, .is_encrypted = f2fs_encrypted_inode, .empty_dir = f2fs_empty_dir, @@ -998,11 +1237,12 @@ static int __f2fs_commit_super(struct buffer_head *bh, return __sync_dirty_buffer(bh, WRITE_FLUSH_FUA); } -static inline bool sanity_check_area_boundary(struct super_block *sb, +static inline bool sanity_check_area_boundary(struct f2fs_sb_info *sbi, struct buffer_head *bh) { struct f2fs_super_block *raw_super = (struct f2fs_super_block *) (bh->b_data + F2FS_SUPER_OFFSET); + struct super_block *sb = sbi->sb; u32 segment0_blkaddr = le32_to_cpu(raw_super->segment0_blkaddr); u32 cp_blkaddr = le32_to_cpu(raw_super->cp_blkaddr); u32 sit_blkaddr = le32_to_cpu(raw_super->sit_blkaddr); @@ -1081,6 +1321,7 @@ static inline bool sanity_check_area_boundary(struct super_block *sb, segment0_blkaddr) >> log_blocks_per_seg); if (f2fs_readonly(sb) || bdev_read_only(sb->s_bdev)) { + set_sbi_flag(sbi, SBI_NEED_SB_WRITE); res = "internally"; } else { err = __f2fs_commit_super(bh, NULL); @@ -1098,11 +1339,12 @@ static inline bool sanity_check_area_boundary(struct super_block *sb, return false; } -static int sanity_check_raw_super(struct super_block *sb, +static int sanity_check_raw_super(struct f2fs_sb_info *sbi, struct buffer_head *bh) { struct f2fs_super_block *raw_super = (struct f2fs_super_block *) (bh->b_data + F2FS_SUPER_OFFSET); + struct super_block *sb = sbi->sb; unsigned int 
blocksize; if (F2FS_SUPER_MAGIC != le32_to_cpu(raw_super->magic)) { @@ -1169,7 +1411,7 @@ static int sanity_check_raw_super(struct super_block *sb, } /* check CP/SIT/NAT/SSA/MAIN_AREA area boundary */ - if (sanity_check_area_boundary(sb, bh)) + if (sanity_check_area_boundary(sbi, bh)) return 1; return 0; @@ -1201,7 +1443,6 @@ int sanity_check_ckpt(struct f2fs_sb_info *sbi) static void init_sb_info(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = sbi->raw_super; - int i; sbi->log_sectors_per_block = le32_to_cpu(raw_super->log_sectors_per_block); @@ -1221,9 +1462,6 @@ static void init_sb_info(struct f2fs_sb_info *sbi) sbi->cur_victim_sec = NULL_SECNO; sbi->max_victim_search = DEF_MAX_VICTIM_SEARCH; - for (i = 0; i < NR_COUNT_TYPE; i++) - atomic_set(&sbi->nr_pages[i], 0); - sbi->dir_level = DEF_DIR_LEVEL; sbi->interval_time[CP_TIME] = DEF_CP_INTERVAL; sbi->interval_time[REQ_TIME] = DEF_IDLE_INTERVAL; @@ -1231,6 +1469,35 @@ static void init_sb_info(struct f2fs_sb_info *sbi) INIT_LIST_HEAD(&sbi->s_list); mutex_init(&sbi->umount_mutex); + mutex_init(&sbi->wio_mutex[NODE]); + mutex_init(&sbi->wio_mutex[DATA]); + +#ifdef CONFIG_F2FS_FS_ENCRYPTION + memcpy(sbi->key_prefix, F2FS_KEY_DESC_PREFIX, + F2FS_KEY_DESC_PREFIX_SIZE); + sbi->key_prefix_size = F2FS_KEY_DESC_PREFIX_SIZE; +#endif +} + +static int init_percpu_info(struct f2fs_sb_info *sbi) +{ + int i, err; + + if (percpu_init_rwsem(&sbi->cp_rwsem)) + return -ENOMEM; + + for (i = 0; i < NR_COUNT_TYPE; i++) { + err = percpu_counter_init(&sbi->nr_pages[i], 0, GFP_KERNEL); + if (err) + return err; + } + + err = percpu_counter_init(&sbi->alloc_valid_block_count, 0, GFP_KERNEL); + if (err) + return err; + + return percpu_counter_init(&sbi->total_valid_inode_count, 0, + GFP_KERNEL); } /* @@ -1239,10 +1506,11 @@ static void init_sb_info(struct f2fs_sb_info *sbi) * to get the first valid one. If any one of them is broken, we pass * them recovery flag back to the caller. */ -static int read_raw_super_block(struct super_block *sb, +static int read_raw_super_block(struct f2fs_sb_info *sbi, struct f2fs_super_block **raw_super, int *valid_super_block, int *recovery) { + struct super_block *sb = sbi->sb; int block; struct buffer_head *bh; struct f2fs_super_block *super; @@ -1262,7 +1530,7 @@ static int read_raw_super_block(struct super_block *sb, } /* sanity checking of raw super */ - if (sanity_check_raw_super(sb, bh)) { + if (sanity_check_raw_super(sbi, bh)) { f2fs_msg(sb, KERN_ERR, "Can't find valid F2FS filesystem in %dth superblock", block + 1); @@ -1298,6 +1566,12 @@ int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover) struct buffer_head *bh; int err; + if ((recover && f2fs_readonly(sbi->sb)) || + bdev_read_only(sbi->sb->s_bdev)) { + set_sbi_flag(sbi, SBI_NEED_SB_WRITE); + return -EROFS; + } + /* write back-up superblock first */ bh = sb_getblk(sbi->sb, sbi->valid_super_block ? 
0: 1); if (!bh) @@ -1323,7 +1597,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) struct f2fs_sb_info *sbi; struct f2fs_super_block *raw_super; struct inode *root; - long err; + int err; bool retry = true, need_fsck = false; char *options = NULL; int recovery, i, valid_super_block; @@ -1340,6 +1614,8 @@ try_onemore: if (!sbi) return -ENOMEM; + sbi->sb = sb; + /* Load the checksum driver */ sbi->s_chksum_driver = crypto_alloc_shash("crc32", 0, 0); if (IS_ERR(sbi->s_chksum_driver)) { @@ -1355,12 +1631,14 @@ try_onemore: goto free_sbi; } - err = read_raw_super_block(sb, &raw_super, &valid_super_block, + err = read_raw_super_block(sbi, &raw_super, &valid_super_block, &recovery); if (err) goto free_sbi; sb->s_fs_info = sbi; + sbi->raw_super = raw_super; + default_options(sbi); /* parse mount options */ options = kstrdup((const char *)data, GFP_KERNEL); @@ -1390,11 +1668,8 @@ try_onemore: memcpy(sb->s_uuid, raw_super->uuid, sizeof(raw_super->uuid)); /* init f2fs-specific super block info */ - sbi->sb = sb; - sbi->raw_super = raw_super; sbi->valid_super_block = valid_super_block; mutex_init(&sbi->gc_mutex); - mutex_init(&sbi->writepages); mutex_init(&sbi->cp_mutex); init_rwsem(&sbi->node_write); @@ -1411,10 +1686,13 @@ try_onemore: sbi->write_io[i].bio = NULL; } - init_rwsem(&sbi->cp_rwsem); init_waitqueue_head(&sbi->cp_wait); init_sb_info(sbi); + err = init_percpu_info(sbi); + if (err) + goto free_options; + /* get an inode for meta space */ sbi->meta_inode = f2fs_iget(sb, F2FS_META_INO(sbi)); if (IS_ERR(sbi->meta_inode)) { @@ -1431,13 +1709,13 @@ try_onemore: sbi->total_valid_node_count = le32_to_cpu(sbi->ckpt->valid_node_count); - sbi->total_valid_inode_count = - le32_to_cpu(sbi->ckpt->valid_inode_count); + percpu_counter_set(&sbi->total_valid_inode_count, + le32_to_cpu(sbi->ckpt->valid_inode_count)); sbi->user_block_count = le64_to_cpu(sbi->ckpt->user_block_count); sbi->total_valid_block_count = le64_to_cpu(sbi->ckpt->valid_block_count); sbi->last_valid_block_count = sbi->total_valid_block_count; - sbi->alloc_valid_block_count = 0; + for (i = 0; i < NR_INODE_TYPE; i++) { INIT_LIST_HEAD(&sbi->inode_list[i]); spin_lock_init(&sbi->inode_lock[i]); @@ -1515,9 +1793,12 @@ try_onemore: if (f2fs_proc_root) sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root); - if (sbi->s_proc) + if (sbi->s_proc) { proc_create_data("segment_info", S_IRUGO, sbi->s_proc, &f2fs_seq_segment_info_fops, sb); + proc_create_data("segment_bits", S_IRUGO, sbi->s_proc, + &f2fs_seq_segment_bits_fops, sb); + } sbi->s_kobj.kset = f2fs_kset; init_completion(&sbi->s_kobj_unregister); @@ -1541,14 +1822,24 @@ try_onemore: if (need_fsck) set_sbi_flag(sbi, SBI_NEED_FSCK); - err = recover_fsync_data(sbi); - if (err) { + err = recover_fsync_data(sbi, false); + if (err < 0) { need_fsck = true; f2fs_msg(sb, KERN_ERR, - "Cannot recover all fsync data errno=%ld", err); + "Cannot recover all fsync data errno=%d", err); + goto free_kobj; + } + } else { + err = recover_fsync_data(sbi, true); + + if (!f2fs_readonly(sb) && err > 0) { + err = -EINVAL; + f2fs_msg(sb, KERN_ERR, + "Need to recover fsync data"); goto free_kobj; } } + /* recover_fsync_data() cleared this already */ clear_sbi_flag(sbi, SBI_POR_DOING); @@ -1565,10 +1856,10 @@ try_onemore: kfree(options); /* recover broken superblock */ - if (recovery && !f2fs_readonly(sb) && !bdev_read_only(sb->s_bdev)) { + if (recovery) { err = f2fs_commit_super(sbi, true); f2fs_msg(sb, KERN_INFO, - "Try to recover %dth superblock, ret: %ld", + "Try to recover %dth 
superblock, ret: %d", sbi->valid_super_block ? 1 : 2, err); } @@ -1577,12 +1868,14 @@ try_onemore: return 0; free_kobj: + f2fs_sync_inode_meta(sbi); kobject_del(&sbi->s_kobj); kobject_put(&sbi->s_kobj); wait_for_completion(&sbi->s_kobj_unregister); free_proc: if (sbi->s_proc) { remove_proc_entry("segment_info", sbi->s_proc); + remove_proc_entry("segment_bits", sbi->s_proc); remove_proc_entry(sb->s_id, f2fs_proc_root); } f2fs_destroy_stats(sbi); @@ -1603,6 +1896,7 @@ free_meta_inode: make_bad_inode(sbi->meta_inode); iput(sbi->meta_inode); free_options: + destroy_percpu_info(sbi); kfree(options); free_sb_buf: kfree(raw_super); @@ -1688,6 +1982,16 @@ static int __init init_f2fs_fs(void) err = -ENOMEM; goto free_extent_cache; } +#ifdef CONFIG_F2FS_FAULT_INJECTION + f2fs_fault_inject.kset = f2fs_kset; + f2fs_build_fault_attr(0); + err = kobject_init_and_add(&f2fs_fault_inject, &f2fs_fault_ktype, + NULL, "fault_injection"); + if (err) { + f2fs_fault_inject.kset = NULL; + goto free_kset; + } +#endif err = register_shrinker(&f2fs_shrinker_info); if (err) goto free_kset; @@ -1706,6 +2010,10 @@ free_filesystem: free_shrinker: unregister_shrinker(&f2fs_shrinker_info); free_kset: +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (f2fs_fault_inject.kset) + kobject_put(&f2fs_fault_inject); +#endif kset_unregister(f2fs_kset); free_extent_cache: destroy_extent_cache(); @@ -1725,14 +2033,17 @@ static void __exit exit_f2fs_fs(void) { remove_proc_entry("fs/f2fs", NULL); f2fs_destroy_root_stats(); - unregister_shrinker(&f2fs_shrinker_info); unregister_filesystem(&f2fs_fs_type); + unregister_shrinker(&f2fs_shrinker_info); +#ifdef CONFIG_F2FS_FAULT_INJECTION + kobject_put(&f2fs_fault_inject); +#endif + kset_unregister(f2fs_kset); destroy_extent_cache(); destroy_checkpoint_caches(); destroy_segment_manager_caches(); destroy_node_manager_caches(); destroy_inodecache(); - kset_unregister(f2fs_kset); f2fs_destroy_trace_ios(); } diff --git a/fs/f2fs/trace.c b/fs/f2fs/trace.c index 562ce0821559..73b4e1d1912a 100644 --- a/fs/f2fs/trace.c +++ b/fs/f2fs/trace.c @@ -25,11 +25,11 @@ static inline void __print_last_io(void) if (!last_io.len) return; - trace_printk("%3x:%3x %4x %-16s %2x %5x %12x %4x\n", + trace_printk("%3x:%3x %4x %-16s %2x %5x %5x %12x %4x\n", last_io.major, last_io.minor, last_io.pid, "----------------", last_io.type, - last_io.fio.rw, + last_io.fio.op, last_io.fio.op_flags, last_io.fio.new_blkaddr, last_io.len); memset(&last_io, 0, sizeof(last_io)); @@ -101,7 +101,8 @@ void f2fs_trace_ios(struct f2fs_io_info *fio, int flush) if (last_io.major == major && last_io.minor == minor && last_io.pid == pid && last_io.type == __file_type(inode, pid) && - last_io.fio.rw == fio->rw && + last_io.fio.op == fio->op && + last_io.fio.op_flags == fio->op_flags && last_io.fio.new_blkaddr + last_io.len == fio->new_blkaddr) { last_io.len++; diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 17fd2b1a6848..c8898b5148eb 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -50,10 +50,11 @@ static int f2fs_xattr_generic_get(const struct xattr_handler *handler, } static int f2fs_xattr_generic_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, const void *value, + struct dentry *unused, struct inode *inode, + const char *name, const void *value, size_t size, int flags) { - struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb); + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); switch (handler->flags) { case F2FS_XATTR_INDEX_USER: @@ -69,7 +70,7 @@ static int f2fs_xattr_generic_set(const struct xattr_handler 
*handler, default: return -EINVAL; } - return f2fs_setxattr(d_inode(dentry), handler->flags, name, + return f2fs_setxattr(inode, handler->flags, name, value, size, NULL, flags); } @@ -95,18 +96,17 @@ static int f2fs_xattr_advise_get(const struct xattr_handler *handler, } static int f2fs_xattr_advise_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, const void *value, + struct dentry *unused, struct inode *inode, + const char *name, const void *value, size_t size, int flags) { - struct inode *inode = d_inode(dentry); - if (!inode_owner_or_capable(inode)) return -EPERM; if (value == NULL) return -EINVAL; F2FS_I(inode)->i_advise |= *(char *)value; - mark_inode_dirty(inode); + f2fs_mark_inode_dirty_sync(inode); return 0; } @@ -299,6 +299,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, if (ipage) { inline_addr = inline_xattr_addr(ipage); f2fs_wait_on_page_writeback(ipage, NODE, true); + set_page_dirty(ipage); } else { page = get_node_page(sbi, inode->i_ino); if (IS_ERR(page)) { @@ -441,13 +442,12 @@ static int __f2fs_setxattr(struct inode *inode, int index, const char *name, const void *value, size_t size, struct page *ipage, int flags) { - struct f2fs_inode_info *fi = F2FS_I(inode); struct f2fs_xattr_entry *here, *last; void *base_addr; int found, newsize; size_t len; __u32 new_hsize; - int error = -ENOMEM; + int error = 0; if (name == NULL) return -EINVAL; @@ -465,7 +465,7 @@ static int __f2fs_setxattr(struct inode *inode, int index, base_addr = read_all_xattrs(inode, ipage); if (!base_addr) - goto exit; + return -ENOMEM; /* find entry with wanted name. */ here = __find_xattr(base_addr, index, len, name); @@ -498,7 +498,7 @@ static int __f2fs_setxattr(struct inode *inode, int index, free = free + ENTRY_SIZE(here); if (unlikely(free < newsize)) { - error = -ENOSPC; + error = -E2BIG; goto exit; } } @@ -526,7 +526,6 @@ static int __f2fs_setxattr(struct inode *inode, int index, * Before we come here, old entry is removed. * We just write new entry. 
*/ - memset(last, 0, newsize); last->e_name_index = index; last->e_name_len = len; memcpy(last->e_name, name, len); @@ -540,19 +539,15 @@ static int __f2fs_setxattr(struct inode *inode, int index, if (error) goto exit; - if (is_inode_flag_set(fi, FI_ACL_MODE)) { - inode->i_mode = fi->i_acl_mode; + if (is_inode_flag_set(inode, FI_ACL_MODE)) { + inode->i_mode = F2FS_I(inode)->i_acl_mode; inode->i_ctime = CURRENT_TIME; - clear_inode_flag(fi, FI_ACL_MODE); + clear_inode_flag(inode, FI_ACL_MODE); } if (index == F2FS_XATTR_INDEX_ENCRYPTION && !strcmp(name, F2FS_XATTR_NAME_ENCRYPTION_CONTEXT)) f2fs_set_encrypted_inode(inode); - - if (ipage) - update_inode(inode, ipage); - else - update_inode_page(inode); + f2fs_mark_inode_dirty_sync(inode); exit: kzfree(base_addr); return error; diff --git a/fs/fat/misc.c b/fs/fat/misc.c index c4589e981760..8a8698119ff7 100644 --- a/fs/fat/misc.c +++ b/fs/fat/misc.c @@ -267,7 +267,7 @@ int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs) int i, err = 0; for (i = 0; i < nr_bhs; i++) - write_dirty_buffer(bhs[i], WRITE); + write_dirty_buffer(bhs[i], 0); for (i = 0; i < nr_bhs; i++) { wait_on_buffer(bhs[i]); diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c index b7e2b33aa793..1337c0c7527d 100644 --- a/fs/fat/namei_msdos.c +++ b/fs/fat/namei_msdos.c @@ -154,7 +154,7 @@ static int msdos_hash(const struct dentry *dentry, struct qstr *qstr) error = msdos_format_name(qstr->name, qstr->len, msdos_name, options); if (!error) - qstr->hash = full_name_hash(msdos_name, MSDOS_NAME); + qstr->hash = full_name_hash(dentry, msdos_name, MSDOS_NAME); return 0; } diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index 7092584f424a..6ccdf3f34f90 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -107,7 +107,7 @@ static unsigned int vfat_striptail_len(const struct qstr *qstr) */ static int vfat_hash(const struct dentry *dentry, struct qstr *qstr) { - qstr->hash = full_name_hash(qstr->name, vfat_striptail_len(qstr)); + qstr->hash = full_name_hash(dentry, qstr->name, vfat_striptail_len(qstr)); return 0; } @@ -127,7 +127,7 @@ static int vfat_hashi(const struct dentry *dentry, struct qstr *qstr) name = qstr->name; len = vfat_striptail_len(qstr); - hash = init_name_hash(); + hash = init_name_hash(dentry); while (len--) hash = partial_name_hash(nls_tolower(t, *name++), hash); qstr->hash = end_name_hash(hash); diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 592cea54cea0..6f9c9f6f5157 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -483,9 +483,9 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) goto out_free; } inode->i_state |= I_WB_SWITCH; + __iget(inode); spin_unlock(&inode->i_lock); - ihold(inode); isw->inode = inode; atomic_inc(&isw_nr_in_flight); @@ -931,7 +931,8 @@ void wb_start_writeback(struct bdi_writeback *wb, long nr_pages, * This is WB_SYNC_NONE writeback, so if allocation fails just * wakeup the thread for old dirty data writeback */ - work = kzalloc(sizeof(*work), GFP_ATOMIC); + work = kzalloc(sizeof(*work), + GFP_NOWAIT | __GFP_NOMEMALLOC | __GFP_NOWARN); if (!work) { trace_writeback_nowork(wb); wb_wakeup(wb); @@ -980,6 +981,42 @@ void inode_io_list_del(struct inode *inode) } /* + * mark an inode as under writeback on the sb + */ +void sb_mark_inode_writeback(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + unsigned long flags; + + if (list_empty(&inode->i_wb_list)) { + spin_lock_irqsave(&sb->s_inode_wblist_lock, flags); + if (list_empty(&inode->i_wb_list)) { + list_add_tail(&inode->i_wb_list, 
&sb->s_inodes_wb); + trace_sb_mark_inode_writeback(inode); + } + spin_unlock_irqrestore(&sb->s_inode_wblist_lock, flags); + } +} + +/* + * clear an inode as under writeback on the sb + */ +void sb_clear_inode_writeback(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + unsigned long flags; + + if (!list_empty(&inode->i_wb_list)) { + spin_lock_irqsave(&sb->s_inode_wblist_lock, flags); + if (!list_empty(&inode->i_wb_list)) { + list_del_init(&inode->i_wb_list); + trace_sb_clear_inode_writeback(inode); + } + spin_unlock_irqrestore(&sb->s_inode_wblist_lock, flags); + } +} + +/* * Redirty an inode: set its when-it-was dirtied timestamp and move it to the * furthest end of its superblock's dirty-inode list. * @@ -2153,7 +2190,7 @@ EXPORT_SYMBOL(__mark_inode_dirty); */ static void wait_sb_inodes(struct super_block *sb) { - struct inode *inode, *old_inode = NULL; + LIST_HEAD(sync_list); /* * We need to be protected against the filesystem going from @@ -2162,38 +2199,60 @@ static void wait_sb_inodes(struct super_block *sb) WARN_ON(!rwsem_is_locked(&sb->s_umount)); mutex_lock(&sb->s_sync_lock); - spin_lock(&sb->s_inode_list_lock); /* - * Data integrity sync. Must wait for all pages under writeback, - * because there may have been pages dirtied before our sync - * call, but which had writeout started before we write it out. - * In which case, the inode may not be on the dirty list, but - * we still have to wait for that writeout. + * Splice the writeback list onto a temporary list to avoid waiting on + * inodes that have started writeback after this point. + * + * Use rcu_read_lock() to keep the inodes around until we have a + * reference. s_inode_wblist_lock protects sb->s_inodes_wb as well as + * the local list because inodes can be dropped from either by writeback + * completion. + */ + rcu_read_lock(); + spin_lock_irq(&sb->s_inode_wblist_lock); + list_splice_init(&sb->s_inodes_wb, &sync_list); + + /* + * Data integrity sync. Must wait for all pages under writeback, because + * there may have been pages dirtied before our sync call, but which had + * writeout started before we write it out. In which case, the inode + * may not be on the dirty list, but we still have to wait for that + * writeout. */ - list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { + while (!list_empty(&sync_list)) { + struct inode *inode = list_first_entry(&sync_list, struct inode, + i_wb_list); struct address_space *mapping = inode->i_mapping; + /* + * Move each inode back to the wb list before we drop the lock + * to preserve consistency between i_wb_list and the mapping + * writeback tag. Writeback completion is responsible to remove + * the inode from either list once the writeback tag is cleared. + */ + list_move_tail(&inode->i_wb_list, &sb->s_inodes_wb); + + /* + * The mapping can appear untagged while still on-list since we + * do not have the mapping lock. Skip it here, wb completion + * will remove it. + */ + if (!mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) + continue; + + spin_unlock_irq(&sb->s_inode_wblist_lock); + spin_lock(&inode->i_lock); - if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) || - (mapping->nrpages == 0)) { + if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) { spin_unlock(&inode->i_lock); + + spin_lock_irq(&sb->s_inode_wblist_lock); continue; } __iget(inode); spin_unlock(&inode->i_lock); - spin_unlock(&sb->s_inode_list_lock); - - /* - * We hold a reference to 'inode' so it couldn't have been - * removed from s_inodes list while we dropped the - * s_inode_list_lock. 
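Both sb_mark_inode_writeback() and sb_clear_inode_writeback() above use a check-lock-recheck idiom: the unlocked list_empty() test avoids taking s_inode_wblist_lock in the common case, and the test is repeated under the lock because another CPU may have changed the list in between. A self-contained userspace analogy of that idiom (a pthread mutex stands in for the spinlock; all names are illustrative):

#include <pthread.h>
#include <stdbool.h>

struct list_node { struct list_node *next, *prev; };

static struct list_node wb_list = { &wb_list, &wb_list };
static struct list_node inode_node = { &inode_node, &inode_node };
static pthread_mutex_t wb_lock = PTHREAD_MUTEX_INITIALIZER;

static bool on_list(const struct list_node *n) { return n->next != n; }

static void add_tail(struct list_node *n, struct list_node *head)
{
	n->prev = head->prev;
	n->next = head;
	head->prev->next = n;
	head->prev = n;
}

static void mark_writeback(void)
{
	if (!on_list(&inode_node)) {		/* cheap unlocked check */
		pthread_mutex_lock(&wb_lock);
		if (!on_list(&inode_node))	/* recheck under the lock */
			add_tail(&inode_node, &wb_list);
		pthread_mutex_unlock(&wb_lock);
	}
}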
We cannot iput the inode now as we can - * be holding the last reference and we cannot iput it under - * s_inode_list_lock. So we keep the reference and iput it - * later. - */ - iput(old_inode); - old_inode = inode; + rcu_read_unlock(); /* * We keep the error status of individual mapping so that @@ -2204,10 +2263,13 @@ static void wait_sb_inodes(struct super_block *sb) cond_resched(); - spin_lock(&sb->s_inode_list_lock); + iput(inode); + + rcu_read_lock(); + spin_lock_irq(&sb->s_inode_wblist_lock); } - spin_unlock(&sb->s_inode_list_lock); - iput(old_inode); + spin_unlock_irq(&sb->s_inode_wblist_lock); + rcu_read_unlock(); mutex_unlock(&sb->s_sync_lock); } diff --git a/fs/fscache/histogram.c b/fs/fscache/histogram.c index 7d637e2335fd..15a3d042247e 100644 --- a/fs/fscache/histogram.c +++ b/fs/fscache/histogram.c @@ -99,7 +99,6 @@ static int fscache_histogram_open(struct inode *inode, struct file *file) } const struct file_operations fscache_histogram_fops = { - .owner = THIS_MODULE, .open = fscache_histogram_open, .read = seq_read, .llseek = seq_lseek, diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c index 6b028b7c4250..5d5ddaa84b21 100644 --- a/fs/fscache/object-list.c +++ b/fs/fscache/object-list.c @@ -404,7 +404,6 @@ static int fscache_objlist_release(struct inode *inode, struct file *file) } const struct file_operations fscache_objlist_fops = { - .owner = THIS_MODULE, .open = fscache_objlist_open, .read = seq_read, .llseek = seq_lseek, diff --git a/fs/fscache/page.c b/fs/fscache/page.c index 3078b679fcd1..c8c4f79c7ce1 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -887,6 +887,8 @@ void fscache_invalidate_writes(struct fscache_cookie *cookie) put_page(results[i]); } + wake_up_bit(&cookie->flags, 0); + _leave(""); } diff --git a/fs/fscache/stats.c b/fs/fscache/stats.c index 7cfa0aacdf6d..7ac6e839b065 100644 --- a/fs/fscache/stats.c +++ b/fs/fscache/stats.c @@ -295,7 +295,6 @@ static int fscache_stats_open(struct inode *inode, struct file *file) } const struct file_operations fscache_stats_fops = { - .owner = THIS_MODULE, .open = fscache_stats_open, .read = seq_read, .llseek = seq_lseek, diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index cbece1221417..203adf3b75db 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1525,7 +1525,6 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size, goto err; fuse_copy_finish(cs); buf[outarg.namelen] = 0; - name.hash = full_name_hash(name.name, name.len); down_read(&fc->killsb); err = -ENOENT; @@ -1576,7 +1575,6 @@ static int fuse_notify_delete(struct fuse_conn *fc, unsigned int size, goto err; fuse_copy_finish(cs); buf[outarg.namelen] = 0; - name.hash = full_name_hash(name.name, name.len); down_read(&fc->killsb); err = -ENOENT; diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index b9419058108f..5f1627725791 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -341,8 +341,10 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, struct dentry *newent; bool outarg_valid = true; + fuse_lock_inode(dir); err = fuse_lookup_name(dir->i_sb, get_node_id(dir), &entry->d_name, &outarg, &inode); + fuse_unlock_inode(dir); if (err == -ENOENT) { outarg_valid = false; err = 0; @@ -478,7 +480,7 @@ static int fuse_atomic_open(struct inode *dir, struct dentry *entry, struct fuse_conn *fc = get_fuse_conn(dir); struct dentry *res = NULL; - if (d_unhashed(entry)) { + if (d_in_lookup(entry)) { res = fuse_lookup(dir, entry, 0); if (IS_ERR(res)) return PTR_ERR(res); @@ -953,6 +955,7 @@ int 
fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, if (!dir) goto unlock; + name->hash = full_name_hash(dir, name->name, name->len); entry = d_lookup(dir, name); dput(dir); if (!entry) @@ -1202,7 +1205,7 @@ static int fuse_direntplus_link(struct file *file, fc = get_fuse_conn(dir); - name.hash = full_name_hash(name.name, name.len); + name.hash = full_name_hash(parent, name.name, name.len); dentry = d_lookup(parent, &name); if (!dentry) { retry: @@ -1341,7 +1344,9 @@ static int fuse_readdir(struct file *file, struct dir_context *ctx) fuse_read_fill(req, file, ctx->pos, PAGE_SIZE, FUSE_READDIR); } + fuse_lock_inode(inode); fuse_request_send(fc, req); + fuse_unlock_inode(inode); nbytes = req->out.args[0].size; err = req->out.h.error; fuse_put_request(fc, req); @@ -1719,10 +1724,10 @@ static int fuse_getattr(struct vfsmount *mnt, struct dentry *entry, return fuse_update_attributes(inode, stat, NULL, NULL); } -static int fuse_setxattr(struct dentry *entry, const char *name, - const void *value, size_t size, int flags) +static int fuse_setxattr(struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { - struct inode *inode = d_inode(entry); struct fuse_conn *fc = get_fuse_conn(inode); FUSE_ARGS(args); struct fuse_setxattr_in inarg; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index eddbe02c4028..929c383432b0 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -110,6 +110,9 @@ struct fuse_inode { /** Miscellaneous bits describing inode state */ unsigned long state; + + /** Lock for serializing lookup and readdir for back compatibility*/ + struct mutex mutex; }; /** FUSE inode state bits */ @@ -540,6 +543,9 @@ struct fuse_conn { /** write-back cache policy (default is write-through) */ unsigned writeback_cache:1; + /** allow parallel lookups and readdir (default is serialized) */ + unsigned parallel_dirops:1; + /* * The following bitfields are only for optimization purposes * and hence races in setting them will not cause malfunction @@ -956,4 +962,7 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr, void fuse_set_initialized(struct fuse_conn *fc); +void fuse_unlock_inode(struct inode *inode); +void fuse_lock_inode(struct inode *inode); + #endif /* _FS_FUSE_I_H */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 1ce67668a8e1..9961d8432ce3 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -97,6 +97,7 @@ static struct inode *fuse_alloc_inode(struct super_block *sb) INIT_LIST_HEAD(&fi->queued_writes); INIT_LIST_HEAD(&fi->writepages); init_waitqueue_head(&fi->page_waitq); + mutex_init(&fi->mutex); fi->forget = fuse_alloc_forget(); if (!fi->forget) { kmem_cache_free(fuse_inode_cachep, inode); @@ -117,6 +118,7 @@ static void fuse_destroy_inode(struct inode *inode) struct fuse_inode *fi = get_fuse_inode(inode); BUG_ON(!list_empty(&fi->write_files)); BUG_ON(!list_empty(&fi->queued_writes)); + mutex_destroy(&fi->mutex); kfree(fi->forget); call_rcu(&inode->i_rcu, fuse_i_callback); } @@ -351,6 +353,18 @@ int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid, return 0; } +void fuse_lock_inode(struct inode *inode) +{ + if (!get_fuse_conn(inode)->parallel_dirops) + mutex_lock(&get_fuse_inode(inode)->mutex); +} + +void fuse_unlock_inode(struct inode *inode) +{ + if (!get_fuse_conn(inode)->parallel_dirops) + mutex_unlock(&get_fuse_inode(inode)->mutex); +} + static void fuse_umount_begin(struct super_block *sb) { fuse_abort_conn(get_fuse_conn_super(sb)); @@ -898,6 +912,8 @@ static void 
process_init_reply(struct fuse_conn *fc, struct fuse_req *req) fc->async_dio = 1; if (arg->flags & FUSE_WRITEBACK_CACHE) fc->writeback_cache = 1; + if (arg->flags & FUSE_PARALLEL_DIROPS) + fc->parallel_dirops = 1; if (arg->time_gran && arg->time_gran <= 1000000000) fc->sb->s_time_gran = arg->time_gran; } else { @@ -928,7 +944,8 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) FUSE_SPLICE_WRITE | FUSE_SPLICE_MOVE | FUSE_SPLICE_READ | FUSE_FLOCK_LOCKS | FUSE_IOCTL_DIR | FUSE_AUTO_INVAL_DATA | FUSE_DO_READDIRPLUS | FUSE_READDIRPLUS_AUTO | FUSE_ASYNC_DIO | - FUSE_WRITEBACK_CACHE | FUSE_NO_OPEN_SUPPORT; + FUSE_WRITEBACK_CACHE | FUSE_NO_OPEN_SUPPORT | + FUSE_PARALLEL_DIROPS; req->in.h.opcode = FUSE_INIT; req->in.numargs = 1; req->in.args[0].size = sizeof(*arg); diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 8524c0e322fc..82df36886938 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -140,6 +140,32 @@ static int gfs2_writepage(struct page *page, struct writeback_control *wbc) return nobh_writepage(page, gfs2_get_block_noalloc, wbc); } +/* This is the same as calling block_write_full_page, but it also + * writes pages outside of i_size + */ +int gfs2_write_full_page(struct page *page, get_block_t *get_block, + struct writeback_control *wbc) +{ + struct inode * const inode = page->mapping->host; + loff_t i_size = i_size_read(inode); + const pgoff_t end_index = i_size >> PAGE_SHIFT; + unsigned offset; + + /* + * The page straddles i_size. It must be zeroed out on each and every + * writepage invocation because it may be mmapped. "A file is mapped + * in multiples of the page size. For a file that is not a multiple of + * the page size, the remaining memory is zeroed when mapped, and + * writes to that region are not written out to the file." 
+ */ + offset = i_size & (PAGE_SIZE-1); + if (page->index == end_index && offset) + zero_user_segment(page, offset, PAGE_SIZE); + + return __block_write_full_page(inode, page, get_block, wbc, + end_buffer_async_write); +} + /** * __gfs2_jdata_writepage - The core of jdata writepage * @page: The page to write @@ -165,7 +191,7 @@ static int __gfs2_jdata_writepage(struct page *page, struct writeback_control *w } gfs2_page_add_databufs(ip, page, 0, sdp->sd_vfs->s_blocksize-1); } - return block_write_full_page(page, gfs2_get_block_noalloc, wbc); + return gfs2_write_full_page(page, gfs2_get_block_noalloc, wbc); } /** @@ -180,27 +206,20 @@ static int __gfs2_jdata_writepage(struct page *page, struct writeback_control *w static int gfs2_jdata_writepage(struct page *page, struct writeback_control *wbc) { struct inode *inode = page->mapping->host; + struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); int ret; - int done_trans = 0; - if (PageChecked(page)) { - if (wbc->sync_mode != WB_SYNC_ALL) - goto out_ignore; - ret = gfs2_trans_begin(sdp, RES_DINODE + 1, 0); - if (ret) - goto out_ignore; - done_trans = 1; - } - ret = gfs2_writepage_common(page, wbc); - if (ret > 0) - ret = __gfs2_jdata_writepage(page, wbc); - if (done_trans) - gfs2_trans_end(sdp); + if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(ip->i_gl))) + goto out; + if (PageChecked(page) || current->journal_info) + goto out_ignore; + ret = __gfs2_jdata_writepage(page, wbc); return ret; out_ignore: redirty_page_for_writepage(wbc, page); +out: unlock_page(page); return 0; } @@ -977,7 +996,7 @@ static void gfs2_discard(struct gfs2_sbd *sdp, struct buffer_head *bh) if (!list_empty(&bd->bd_list) && !buffer_pinned(bh)) list_del_init(&bd->bd_list); else - gfs2_remove_from_journal(bh, current->journal_info, 0); + gfs2_remove_from_journal(bh, REMOVE_JDATA); } bh->b_bdev = NULL; clear_buffer_mapped(bh); @@ -1063,7 +1082,7 @@ static ssize_t gfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter) gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, &gh); rv = gfs2_glock_nq(&gh); if (rv) - return rv; + goto out_uninit; rv = gfs2_ok_for_dio(ip, offset); if (rv != 1) goto out; /* dio not valid, fall back to buffered i/o */ @@ -1102,6 +1121,7 @@ static ssize_t gfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter) gfs2_get_block_direct, NULL, NULL, 0); out: gfs2_glock_dq(&gh); +out_uninit: gfs2_holder_uninit(&gh); return rv; } diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 24ce1cdd434a..6e2bec1cd289 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -285,7 +285,8 @@ static void gfs2_metapath_ra(struct gfs2_glock *gl, if (trylock_buffer(rabh)) { if (!buffer_uptodate(rabh)) { rabh->b_end_io = end_buffer_read_sync; - submit_bh(READA | REQ_META, rabh); + submit_bh(REQ_OP_READ, REQ_RAHEAD | REQ_META, + rabh); continue; } unlock_buffer(rabh); @@ -974,7 +975,7 @@ static int gfs2_block_truncate_page(struct address_space *mapping, loff_t from) if (!buffer_uptodate(bh)) { err = -EIO; - ll_rw_block(READ, 1, &bh); + ll_rw_block(REQ_OP_READ, 0, 1, &bh); wait_on_buffer(bh); /* Uhhuh. Read error. Complain and punt. 
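The end-of-file arithmetic in gfs2_write_full_page() above is easy to check with made-up numbers: with PAGE_SIZE = 4096 and i_size = 10000, end_index is 2 and offset is 1808, so only the page that straddles i_size has its tail zeroed before writeout. A standalone sketch with those illustrative values:

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

int main(void)
{
	unsigned long long i_size = 10000;		/* example file size */
	unsigned long end_index = i_size >> PAGE_SHIFT;	/* == 2 */
	unsigned int offset = i_size & (PAGE_SIZE - 1);	/* == 1808 */

	/* pages 0 and 1 are written whole; page 2 straddles i_size */
	printf("zero page %lu from byte %u to %lu\n",
	       end_index, offset, PAGE_SIZE - 1);
	return 0;
}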
*/ if (!buffer_uptodate(bh)) diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c index 30822b148f3e..5173b98ca036 100644 --- a/fs/gfs2/dentry.c +++ b/fs/gfs2/dentry.c @@ -117,7 +117,7 @@ static int gfs2_dentry_delete(const struct dentry *dentry) return 0; ginode = GFS2_I(d_inode(dentry)); - if (!ginode->i_iopen_gh.gh_gl) + if (!gfs2_holder_initialized(&ginode->i_iopen_gh)) return 0; if (test_bit(GLF_DEMOTE, &ginode->i_iopen_gh.gh_gl->gl_flags)) diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index 4a01f30e9995..fcb59b23f1e3 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -783,12 +783,15 @@ static int get_leaf_nr(struct gfs2_inode *dip, u32 index, u64 *leaf_out) { __be64 *hash; + int error; hash = gfs2_dir_get_hash_table(dip); - if (IS_ERR(hash)) - return PTR_ERR(hash); - *leaf_out = be64_to_cpu(*(hash + index)); - return 0; + error = PTR_ERR_OR_ZERO(hash); + + if (!error) + *leaf_out = be64_to_cpu(*(hash + index)); + + return error; } static int get_first_leaf(struct gfs2_inode *dip, u32 index, @@ -798,7 +801,7 @@ static int get_first_leaf(struct gfs2_inode *dip, u32 index, int error; error = get_leaf_nr(dip, index, &leaf_no); - if (!IS_ERR_VALUE(error)) + if (!error) error = get_leaf(dip, leaf_no, bh_out); return error; @@ -1014,7 +1017,7 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name) index = name->hash >> (32 - dip->i_depth); error = get_leaf_nr(dip, index, &leaf_no); - if (IS_ERR_VALUE(error)) + if (error) return error; /* Get the old leaf block */ @@ -1510,7 +1513,7 @@ static void gfs2_dir_readahead(struct inode *inode, unsigned hsize, u32 index, continue; } bh->b_end_io = end_buffer_read_sync; - submit_bh(READA | REQ_META, bh); + submit_bh(REQ_OP_READ, REQ_RAHEAD | REQ_META, bh); continue; } brelse(bh); @@ -1660,7 +1663,8 @@ struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *name, brelse(bh); if (fail_on_exist) return ERR_PTR(-EEXIST); - inode = gfs2_inode_lookup(dir->i_sb, dtype, addr, formal_ino); + inode = gfs2_inode_lookup(dir->i_sb, dtype, addr, formal_ino, + GFS2_BLKST_FREE /* ignore */); if (!IS_ERR(inode)) GFS2_I(inode)->i_rahead = rahead; return inode; diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c index d5bda8513457..a332f3cd925e 100644 --- a/fs/gfs2/export.c +++ b/fs/gfs2/export.c @@ -137,21 +137,10 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb, struct gfs2_sbd *sdp = sb->s_fs_info; struct inode *inode; - inode = gfs2_ilookup(sb, inum->no_addr); - if (inode) { - if (GFS2_I(inode)->i_no_formal_ino != inum->no_formal_ino) { - iput(inode); - return ERR_PTR(-ESTALE); - } - goto out_inode; - } - inode = gfs2_lookup_by_inum(sdp, inum->no_addr, &inum->no_formal_ino, GFS2_BLKST_DINODE); if (IS_ERR(inode)) return ERR_CAST(inode); - -out_inode: return d_obtain_alias(inode); } diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index e53b723abd3b..320e65e61938 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -160,7 +160,7 @@ static int gfs2_get_flags(struct file *filp, u32 __user *ptr) gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh); error = gfs2_glock_nq(&gh); if (error) - return error; + goto out_uninit; fsflags = fsflags_cvt(gfs2_to_fsflags, ip->i_diskflags); if (!S_ISDIR(inode->i_mode) && ip->i_diskflags & GFS2_DIF_JDATA) @@ -169,6 +169,7 @@ static int gfs2_get_flags(struct file *filp, u32 __user *ptr) error = -EFAULT; gfs2_glock_dq(&gh); +out_uninit: gfs2_holder_uninit(&gh); return error; } @@ -953,6 +954,30 @@ out_uninit: return ret; } +static ssize_t gfs2_file_splice_read(struct file *in, loff_t *ppos, + struct 
pipe_inode_info *pipe, size_t len, + unsigned int flags) +{ + struct inode *inode = in->f_mapping->host; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder gh; + int ret; + + inode_lock(inode); + + ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &gh); + if (ret) { + inode_unlock(inode); + return ret; + } + + gfs2_glock_dq_uninit(&gh); + inode_unlock(inode); + + return generic_file_splice_read(in, ppos, pipe, len, flags); +} + + static ssize_t gfs2_file_splice_write(struct pipe_inode_info *pipe, struct file *out, loff_t *ppos, size_t len, unsigned int flags) @@ -1073,7 +1098,7 @@ static void do_unflock(struct file *file, struct file_lock *fl) mutex_lock(&fp->f_fl_mutex); locks_lock_file_wait(file, fl); - if (fl_gh->gh_gl) { + if (gfs2_holder_initialized(fl_gh)) { gfs2_glock_dq(fl_gh); gfs2_holder_uninit(fl_gh); } @@ -1115,7 +1140,7 @@ const struct file_operations gfs2_file_fops = { .fsync = gfs2_fsync, .lock = gfs2_lock, .flock = gfs2_flock, - .splice_read = generic_file_splice_read, + .splice_read = gfs2_file_splice_read, .splice_write = gfs2_file_splice_write, .setlease = simple_nosetlease, .fallocate = gfs2_fallocate, @@ -1143,7 +1168,7 @@ const struct file_operations gfs2_file_fops_nolock = { .open = gfs2_open, .release = gfs2_release, .fsync = gfs2_fsync, - .splice_read = generic_file_splice_read, + .splice_read = gfs2_file_splice_read, .splice_write = gfs2_file_splice_write, .setlease = generic_setlease, .fallocate = gfs2_fallocate, diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 4b73bd101bdc..3a90b2b5b9bb 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -218,7 +218,7 @@ static void gfs2_holder_wake(struct gfs2_holder *gh) * */ -static inline void do_error(struct gfs2_glock *gl, const int ret) +static void do_error(struct gfs2_glock *gl, const int ret) { struct gfs2_holder *gh, *tmp; @@ -475,7 +475,14 @@ __acquires(&gl->gl_lockref.lock) if (sdp->sd_lockstruct.ls_ops->lm_lock) { /* lock_dlm */ ret = sdp->sd_lockstruct.ls_ops->lm_lock(gl, target, lck_flags); - if (ret) { + if (ret == -EINVAL && gl->gl_target == LM_ST_UNLOCKED && + target == LM_ST_UNLOCKED && + test_bit(SDF_SKIP_DLM_UNLOCK, &sdp->sd_flags)) { + finish_xmote(gl, target); + if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) + gfs2_glock_put(gl); + } + else if (ret) { pr_err("lm_lock ret %d\n", ret); GLOCK_BUG_ON(gl, 1); } @@ -568,7 +575,6 @@ static void delete_work_func(struct work_struct *work) { struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_delete); struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; - struct gfs2_inode *ip; struct inode *inode; u64 no_addr = gl->gl_name.ln_number; @@ -578,13 +584,7 @@ static void delete_work_func(struct work_struct *work) if (test_bit(GLF_INODE_CREATING, &gl->gl_flags)) goto out; - ip = gl->gl_object; - /* Note: Unsafe to dereference ip as we don't hold right refs/locks */ - - if (ip) - inode = gfs2_ilookup(sdp->sd_vfs, no_addr); - else - inode = gfs2_lookup_by_inum(sdp, no_addr, NULL, GFS2_BLKST_UNLINKED); + inode = gfs2_lookup_by_inum(sdp, no_addr, NULL, GFS2_BLKST_UNLINKED); if (inode && !IS_ERR(inode)) { d_prune_aliases(inode); iput(inode); @@ -801,7 +801,7 @@ void gfs2_holder_uninit(struct gfs2_holder *gh) { put_pid(gh->gh_owner_pid); gfs2_glock_put(gh->gh_gl); - gh->gh_gl = NULL; + gfs2_holder_mark_uninitialized(gh); gh->gh_ip = 0; } diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index 46ab67fc16da..ab1ef322f7a5 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -247,4 +247,14 @@ extern void gfs2_unregister_debugfs(void); extern 
const struct lm_lockops gfs2_dlm_ops; +static inline void gfs2_holder_mark_uninitialized(struct gfs2_holder *gh) +{ + gh->gh_gl = NULL; +} + +static inline bool gfs2_holder_initialized(struct gfs2_holder *gh) +{ + return gh->gh_gl; +} + #endif /* __GLOCK_DOT_H__ */ diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 437fd73e381e..5db59d444838 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -286,17 +286,10 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags) static int inode_go_demote_ok(const struct gfs2_glock *gl) { struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; - struct gfs2_holder *gh; if (sdp->sd_jindex == gl->gl_object || sdp->sd_rindex == gl->gl_object) return 0; - if (!list_empty(&gl->gl_holders)) { - gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list); - if (gh->gh_list.next != &gl->gl_holders) - return 0; - } - return 1; } diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 72e9c64ae371..e0621cacf134 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -37,9 +37,35 @@ #include "super.h" #include "glops.h" -struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr) +static int iget_test(struct inode *inode, void *opaque) { - return ilookup(sb, (unsigned long)no_addr); + u64 no_addr = *(u64 *)opaque; + + return GFS2_I(inode)->i_no_addr == no_addr; +} + +static int iget_set(struct inode *inode, void *opaque) +{ + u64 no_addr = *(u64 *)opaque; + + GFS2_I(inode)->i_no_addr = no_addr; + inode->i_ino = no_addr; + return 0; +} + +static struct inode *gfs2_iget(struct super_block *sb, u64 no_addr) +{ + struct inode *inode; + +repeat: + inode = iget5_locked(sb, no_addr, iget_test, iget_set, &no_addr); + if (!inode) + return inode; + if (is_bad_inode(inode)) { + iput(inode); + goto repeat; + } + return inode; } /** @@ -78,27 +104,38 @@ static void gfs2_set_iop(struct inode *inode) /** * gfs2_inode_lookup - Lookup an inode * @sb: The super block - * @no_addr: The inode number * @type: The type of the inode + * @no_addr: The inode number + * @no_formal_ino: The inode generation number + * @blktype: Requested block type (GFS2_BLKST_DINODE or GFS2_BLKST_UNLINKED; + * GFS2_BLKST_FREE to indicate not to verify) + * + * If @type is DT_UNKNOWN, the inode type is fetched from disk. + * + * If @blktype is anything other than GFS2_BLKST_FREE (which is used as a + * placeholder because it doesn't otherwise make sense), the on-disk block type + * is verified to be @blktype. * * Returns: A VFS inode, or an error */ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, - u64 no_addr, u64 no_formal_ino) + u64 no_addr, u64 no_formal_ino, + unsigned int blktype) { struct inode *inode; struct gfs2_inode *ip; struct gfs2_glock *io_gl = NULL; + struct gfs2_holder i_gh; int error; - inode = iget_locked(sb, (unsigned long)no_addr); - ip = GFS2_I(inode); - ip->i_no_addr = no_addr; - + gfs2_holder_mark_uninitialized(&i_gh); + inode = gfs2_iget(sb, no_addr); if (!inode) return ERR_PTR(-ENOMEM); + ip = GFS2_I(inode); + if (inode->i_state & I_NEW) { struct gfs2_sbd *sdp = GFS2_SB(inode); ip->i_no_formal_ino = no_formal_ino; @@ -112,10 +149,29 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, if (unlikely(error)) goto fail_put; + if (type == DT_UNKNOWN || blktype != GFS2_BLKST_FREE) { + /* + * The GL_SKIP flag indicates to skip reading the inode + * block. We read the inode with gfs2_inode_refresh + * after possibly checking the block type.
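Per the new kerneldoc above, a caller chooses between verifying the on-disk block type and skipping the check; both forms occur in this series. A hedged usage sketch (no new API, just the two call shapes):

/* NFS-style path: verify the block is really a dinode before use */
inode = gfs2_inode_lookup(sb, DT_UNKNOWN, no_addr, 0, GFS2_BLKST_DINODE);

/* trusted path, as in gfs2_lookup_root() below: skip verification */
inode = gfs2_inode_lookup(sb, DT_DIR, no_addr, 0,
			  GFS2_BLKST_FREE /* ignore */);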
+ */ + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, + GL_SKIP, &i_gh); + if (error) + goto fail_put; + + if (blktype != GFS2_BLKST_FREE) { + error = gfs2_check_blk_type(sdp, no_addr, + blktype); + if (error) + goto fail_put; + } + } + set_bit(GIF_INVALID, &ip->i_flags); error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh); if (unlikely(error)) - goto fail_iopen; + goto fail_put; ip->i_iopen_gh.gh_gl->gl_object = ip; gfs2_glock_put(io_gl); @@ -134,6 +190,8 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, unlock_new_inode(inode); } + if (gfs2_holder_initialized(&i_gh)) + gfs2_glock_dq_uninit(&i_gh); return inode; fail_refresh: @@ -141,10 +199,11 @@ fail_refresh: ip->i_iopen_gh.gh_gl->gl_object = NULL; gfs2_glock_dq_wait(&ip->i_iopen_gh); gfs2_holder_uninit(&ip->i_iopen_gh); -fail_iopen: +fail_put: if (io_gl) gfs2_glock_put(io_gl); -fail_put: + if (gfs2_holder_initialized(&i_gh)) + gfs2_glock_dq_uninit(&i_gh); ip->i_gl->gl_object = NULL; fail: iget_failed(inode); @@ -155,23 +214,12 @@ struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr, u64 *no_formal_ino, unsigned int blktype) { struct super_block *sb = sdp->sd_vfs; - struct gfs2_holder i_gh; - struct inode *inode = NULL; + struct inode *inode; int error; - /* Must not read in block until block type is verified */ - error = gfs2_glock_nq_num(sdp, no_addr, &gfs2_inode_glops, - LM_ST_EXCLUSIVE, GL_SKIP, &i_gh); - if (error) - return ERR_PTR(error); - - error = gfs2_check_blk_type(sdp, no_addr, blktype); - if (error) - goto fail; - - inode = gfs2_inode_lookup(sb, DT_UNKNOWN, no_addr, 0); + inode = gfs2_inode_lookup(sb, DT_UNKNOWN, no_addr, 0, blktype); if (IS_ERR(inode)) - goto fail; + return inode; /* Two extra checks for NFS only */ if (no_formal_ino) { @@ -182,16 +230,12 @@ struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr, error = -EIO; if (GFS2_I(inode)->i_diskflags & GFS2_DIF_SYSTEM) goto fail_iput; - - error = 0; } + return inode; -fail: - gfs2_glock_dq_uninit(&i_gh); - return error ? 
ERR_PTR(error) : inode; fail_iput: iput(inode); - goto fail; + return ERR_PTR(error); } @@ -236,8 +280,8 @@ struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, struct gfs2_holder d_gh; int error = 0; struct inode *inode = NULL; - int unlock = 0; + gfs2_holder_mark_uninitialized(&d_gh); if (!name->len || name->len > GFS2_FNAMESIZE) return ERR_PTR(-ENAMETOOLONG); @@ -252,7 +296,6 @@ struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh); if (error) return ERR_PTR(error); - unlock = 1; } if (!is_root) { @@ -265,7 +308,7 @@ struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, if (IS_ERR(inode)) error = PTR_ERR(inode); out: - if (unlock) + if (gfs2_holder_initialized(&d_gh)) gfs2_glock_dq_uninit(&d_gh); if (error == -ENOENT) return NULL; @@ -1189,7 +1232,7 @@ static int gfs2_atomic_open(struct inode *dir, struct dentry *dentry, struct dentry *d; bool excl = !!(flags & O_EXCL); - if (!d_unhashed(dentry)) + if (!d_in_lookup(dentry)) goto skip_lookup; d = __gfs2_lookup(dir, dentry, file, opened); @@ -1309,7 +1352,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, struct gfs2_inode *ip = GFS2_I(d_inode(odentry)); struct gfs2_inode *nip = NULL; struct gfs2_sbd *sdp = GFS2_SB(odir); - struct gfs2_holder ghs[5], r_gh = { .gh_gl = NULL, }; + struct gfs2_holder ghs[5], r_gh; struct gfs2_rgrpd *nrgd; unsigned int num_gh; int dir_rename = 0; @@ -1317,6 +1360,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, unsigned int x; int error; + gfs2_holder_mark_uninitialized(&r_gh); if (d_really_is_positive(ndentry)) { nip = GFS2_I(d_inode(ndentry)); if (ip == nip) @@ -1506,7 +1550,7 @@ out_gunlock: gfs2_holder_uninit(ghs + x); } out_gunlock_r: - if (r_gh.gh_gl) + if (gfs2_holder_initialized(&r_gh)) gfs2_glock_dq_uninit(&r_gh); out: return error; @@ -1532,13 +1576,14 @@ static int gfs2_exchange(struct inode *odir, struct dentry *odentry, struct gfs2_inode *oip = GFS2_I(odentry->d_inode); struct gfs2_inode *nip = GFS2_I(ndentry->d_inode); struct gfs2_sbd *sdp = GFS2_SB(odir); - struct gfs2_holder ghs[5], r_gh = { .gh_gl = NULL, }; + struct gfs2_holder ghs[5], r_gh; unsigned int num_gh; unsigned int x; umode_t old_mode = oip->i_inode.i_mode; umode_t new_mode = nip->i_inode.i_mode; int error; + gfs2_holder_mark_uninitialized(&r_gh); error = gfs2_rindex_update(sdp); if (error) return error; @@ -1646,7 +1691,7 @@ out_gunlock: gfs2_holder_uninit(ghs + x); } out_gunlock_r: - if (r_gh.gh_gl) + if (gfs2_holder_initialized(&r_gh)) gfs2_glock_dq_uninit(&r_gh); out: return error; @@ -1743,9 +1788,8 @@ int gfs2_permission(struct inode *inode, int mask) struct gfs2_inode *ip; struct gfs2_holder i_gh; int error; - int unlock = 0; - + gfs2_holder_mark_uninitialized(&i_gh); ip = GFS2_I(inode); if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) { if (mask & MAY_NOT_BLOCK) @@ -1753,14 +1797,13 @@ int gfs2_permission(struct inode *inode, int mask) error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); if (error) return error; - unlock = 1; } if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode)) error = -EACCES; else error = generic_permission(inode, mask); - if (unlock) + if (gfs2_holder_initialized(&i_gh)) gfs2_glock_dq_uninit(&i_gh); return error; @@ -1932,17 +1975,16 @@ static int gfs2_getattr(struct vfsmount *mnt, struct dentry *dentry, struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_holder gh; int error; - int unlock = 0; + gfs2_holder_mark_uninitialized(&gh); 
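The gfs2_permission() and gfs2_getattr() hunks here, like the rename, exchange and statfs hunks elsewhere in this section, converge on one pattern: mark the holder empty up front, take the glock only on the slow path, and test the holder itself on exit instead of keeping a separate unlock flag. The pattern in isolation, using the helpers added in glock.h above (example_locked_op is an illustrative name):

static int example_locked_op(struct gfs2_inode *ip)
{
	struct gfs2_holder gh;
	int error = 0;

	gfs2_holder_mark_uninitialized(&gh);
	if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) {
		error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED,
					   LM_FLAG_ANY, &gh);
		if (error)
			return error;
	}
	/* ... work that needs the glock held ... */
	if (gfs2_holder_initialized(&gh))
		gfs2_glock_dq_uninit(&gh);
	return error;
}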
if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) { error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh); if (error) return error; - unlock = 1; } generic_fillattr(inode, stat); - if (unlock) + if (gfs2_holder_initialized(&gh)) gfs2_glock_dq_uninit(&gh); return 0; diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index e1af0d4aa308..7710dfd3af35 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h @@ -94,11 +94,11 @@ err: } extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, - u64 no_addr, u64 no_formal_ino); + u64 no_addr, u64 no_formal_ino, + unsigned int blktype); extern struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr, u64 *no_formal_ino, unsigned int blktype); -extern struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr); extern int gfs2_inode_refresh(struct gfs2_inode *ip); diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 0ff028c15199..e58ccef09c91 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -657,7 +657,7 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags) struct gfs2_log_header *lh; unsigned int tail; u32 hash; - int rw = WRITE_FLUSH_FUA | REQ_META; + int op_flags = WRITE_FLUSH_FUA | REQ_META; struct page *page = mempool_alloc(gfs2_page_pool, GFP_NOIO); enum gfs2_freeze_state state = atomic_read(&sdp->sd_freeze_state); lh = page_address(page); @@ -682,12 +682,12 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags) if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags)) { gfs2_ordered_wait(sdp); log_flush_wait(sdp); - rw = WRITE_SYNC | REQ_META | REQ_PRIO; + op_flags = WRITE_SYNC | REQ_META | REQ_PRIO; } sdp->sd_log_idle = (tail == sdp->sd_log_flush_head); gfs2_log_write_page(sdp, page); - gfs2_log_flush_bio(sdp, rw); + gfs2_log_flush_bio(sdp, REQ_OP_WRITE, op_flags); log_flush_wait(sdp); if (sdp->sd_log_tail != tail) @@ -738,7 +738,7 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, gfs2_ordered_write(sdp); lops_before_commit(sdp, tr); - gfs2_log_flush_bio(sdp, WRITE); + gfs2_log_flush_bio(sdp, REQ_OP_WRITE, 0); if (sdp->sd_log_head != sdp->sd_log_flush_head) { log_flush_wait(sdp); diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index d5369a109781..49d5a1b61b06 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -230,17 +230,19 @@ static void gfs2_end_log_write(struct bio *bio) /** * gfs2_log_flush_bio - Submit any pending log bio * @sdp: The superblock - * @rw: The rw flags + * @op: REQ_OP + * @op_flags: rq_flag_bits * * Submit any pending part-built or full bio to the block device. If * there is no pending bio, then this is a no-op. 
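The recurring block-I/O change in the gfs2 (and, below, hfsplus) hunks: the REQ_OP_* operation and its modifier flags now travel separately and are stamped onto the bio with bio_set_op_attrs() before a flag-less submit_bio(). Condensed from gfs2_log_flush_bio() above; flush_one_bio is an illustrative name:

static void flush_one_bio(struct bio *bio, int op, int op_flags)
{
	bio_set_op_attrs(bio, op, op_flags);	/* encode op and flags on the bio */
	submit_bio(bio);			/* no 'rw' argument any more */
}

/* e.g. flush_one_bio(bio, REQ_OP_WRITE, WRITE_SYNC | REQ_META); */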
*/ -void gfs2_log_flush_bio(struct gfs2_sbd *sdp, int rw) +void gfs2_log_flush_bio(struct gfs2_sbd *sdp, int op, int op_flags) { if (sdp->sd_log_bio) { atomic_inc(&sdp->sd_log_in_flight); - submit_bio(rw, sdp->sd_log_bio); + bio_set_op_attrs(sdp->sd_log_bio, op, op_flags); + submit_bio(sdp->sd_log_bio); sdp->sd_log_bio = NULL; } } @@ -299,7 +301,7 @@ static struct bio *gfs2_log_get_bio(struct gfs2_sbd *sdp, u64 blkno) nblk >>= sdp->sd_fsb2bb_shift; if (blkno == nblk) return bio; - gfs2_log_flush_bio(sdp, WRITE); + gfs2_log_flush_bio(sdp, REQ_OP_WRITE, 0); } return gfs2_log_alloc_bio(sdp, blkno); @@ -328,7 +330,7 @@ static void gfs2_log_write(struct gfs2_sbd *sdp, struct page *page, bio = gfs2_log_get_bio(sdp, blkno); ret = bio_add_page(bio, page, size, offset); if (ret == 0) { - gfs2_log_flush_bio(sdp, WRITE); + gfs2_log_flush_bio(sdp, REQ_OP_WRITE, 0); bio = gfs2_log_alloc_bio(sdp, blkno); ret = bio_add_page(bio, page, size, offset); WARN_ON(ret == 0); @@ -535,9 +537,9 @@ static int buf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start, if (pass != 1 || be32_to_cpu(ld->ld_type) != GFS2_LOG_DESC_METADATA) return 0; - gfs2_replay_incr_blk(sdp, &start); + gfs2_replay_incr_blk(jd, &start); - for (; blks; gfs2_replay_incr_blk(sdp, &start), blks--) { + for (; blks; gfs2_replay_incr_blk(jd, &start), blks--) { blkno = be64_to_cpu(*ptr++); jd->jd_found_blocks++; @@ -693,7 +695,7 @@ static int revoke_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start, offset = sizeof(struct gfs2_log_descriptor); - for (; blks; gfs2_replay_incr_blk(sdp, &start), blks--) { + for (; blks; gfs2_replay_incr_blk(jd, &start), blks--) { error = gfs2_replay_read_block(jd, start, &bh); if (error) return error; @@ -762,7 +764,6 @@ static int databuf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start, __be64 *ptr, int pass) { struct gfs2_inode *ip = GFS2_I(jd->jd_inode); - struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); struct gfs2_glock *gl = ip->i_gl; unsigned int blks = be32_to_cpu(ld->ld_data1); struct buffer_head *bh_log, *bh_ip; @@ -773,8 +774,8 @@ static int databuf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start, if (pass != 1 || be32_to_cpu(ld->ld_type) != GFS2_LOG_DESC_JDATA) return 0; - gfs2_replay_incr_blk(sdp, &start); - for (; blks; gfs2_replay_incr_blk(sdp, &start), blks--) { + gfs2_replay_incr_blk(jd, &start); + for (; blks; gfs2_replay_incr_blk(jd, &start), blks--) { blkno = be64_to_cpu(*ptr++); esc = be64_to_cpu(*ptr++); diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h index a65a7ba32ffd..e529f536c117 100644 --- a/fs/gfs2/lops.h +++ b/fs/gfs2/lops.h @@ -27,7 +27,7 @@ extern const struct gfs2_log_operations gfs2_databuf_lops; extern const struct gfs2_log_operations *gfs2_log_ops[]; extern void gfs2_log_write_page(struct gfs2_sbd *sdp, struct page *page); -extern void gfs2_log_flush_bio(struct gfs2_sbd *sdp, int rw); +extern void gfs2_log_flush_bio(struct gfs2_sbd *sdp, int op, int op_flags); extern void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh); static inline unsigned int buf_limit(struct gfs2_sbd *sdp) diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index f99f8e94de3f..74fd0139e6c2 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -45,6 +45,7 @@ static void gfs2_init_inode_once(void *foo) memset(&ip->i_res, 0, sizeof(ip->i_res)); RB_CLEAR_NODE(&ip->i_res.rs_node); ip->i_hash_cache = NULL; + gfs2_holder_mark_uninitialized(&ip->i_iopen_gh); } static void gfs2_init_glock_once(void *foo) diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 0448524c11bc..950b8be68e41 
100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -37,8 +37,8 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb { struct buffer_head *bh, *head; int nr_underway = 0; - int write_op = REQ_META | REQ_PRIO | - (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE); + int write_flags = REQ_META | REQ_PRIO | + (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : 0); BUG_ON(!PageLocked(page)); BUG_ON(!page_has_buffers(page)); @@ -79,7 +79,7 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb do { struct buffer_head *next = bh->b_this_page; if (buffer_async_write(bh)) { - submit_bh(write_op, bh); + submit_bh(REQ_OP_WRITE, write_flags, bh); nr_underway++; } bh = next; @@ -213,7 +213,8 @@ static void gfs2_meta_read_endio(struct bio *bio) * Submit several consecutive buffer head I/O requests as a single bio I/O * request. (See submit_bh_wbc.) */ -static void gfs2_submit_bhs(int rw, struct buffer_head *bhs[], int num) +static void gfs2_submit_bhs(int op, int op_flags, struct buffer_head *bhs[], + int num) { struct buffer_head *bh = bhs[0]; struct bio *bio; @@ -230,7 +231,8 @@ static void gfs2_submit_bhs(int rw, struct buffer_head *bhs[], int num) bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh)); } bio->bi_end_io = gfs2_meta_read_endio; - submit_bio(rw, bio); + bio_set_op_attrs(bio, op, op_flags); + submit_bio(bio); } /** @@ -280,7 +282,7 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, } } - gfs2_submit_bhs(READ_SYNC | REQ_META | REQ_PRIO, bhs, num); + gfs2_submit_bhs(REQ_OP_READ, READ_SYNC | REQ_META | REQ_PRIO, bhs, num); if (!(flags & DIO_WAIT)) return 0; @@ -325,18 +327,19 @@ int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh) return 0; } -void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int meta) +void gfs2_remove_from_journal(struct buffer_head *bh, int meta) { struct address_space *mapping = bh->b_page->mapping; struct gfs2_sbd *sdp = gfs2_mapping2sbd(mapping); struct gfs2_bufdata *bd = bh->b_private; + struct gfs2_trans *tr = current->journal_info; int was_pinned = 0; if (test_clear_buffer_pinned(bh)) { trace_gfs2_pin(bd, 0); atomic_dec(&sdp->sd_log_pinned); list_del_init(&bd->bd_list); - if (meta) + if (meta == REMOVE_META) tr->tr_num_buf_rm++; else tr->tr_num_databuf_rm++; @@ -376,7 +379,7 @@ void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen) if (bh) { lock_buffer(bh); gfs2_log_lock(sdp); - gfs2_remove_from_journal(bh, current->journal_info, 1); + gfs2_remove_from_journal(bh, REMOVE_META); gfs2_log_unlock(sdp); unlock_buffer(bh); brelse(bh); @@ -447,7 +450,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen) if (buffer_uptodate(first_bh)) goto out; if (!buffer_locked(first_bh)) - ll_rw_block(READ_SYNC | REQ_META, 1, &first_bh); + ll_rw_block(REQ_OP_READ, READ_SYNC | REQ_META, 1, &first_bh); dblock++; extlen--; @@ -456,7 +459,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen) bh = gfs2_getbuf(gl, dblock, CREATE); if (!buffer_uptodate(bh) && !buffer_locked(bh)) - ll_rw_block(READA | REQ_META, 1, &bh); + ll_rw_block(REQ_OP_READ, REQ_RAHEAD | REQ_META, 1, &bh); brelse(bh); dblock++; extlen--; diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h index c5086c8af5ed..ffdf6aa3509d 100644 --- a/fs/gfs2/meta_io.h +++ b/fs/gfs2/meta_io.h @@ -57,8 +57,12 @@ extern int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, extern int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head 
*bh); extern struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create); -extern void gfs2_remove_from_journal(struct buffer_head *bh, - struct gfs2_trans *tr, int meta); +enum { + REMOVE_JDATA = 0, + REMOVE_META = 1, +}; + +extern void gfs2_remove_from_journal(struct buffer_head *bh, int meta); extern void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen); extern int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, u64 num, struct buffer_head **bhp); diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 45463600fb81..ef1e1822977f 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -246,7 +246,8 @@ static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector, int silent) bio->bi_end_io = end_bio_io_page; bio->bi_private = page; - submit_bio(READ_SYNC | REQ_META, bio); + bio_set_op_attrs(bio, REQ_OP_READ, READ_SYNC | REQ_META); + submit_bio(bio); wait_on_page_locked(page); bio_put(bio); if (!PageUptodate(page)) { @@ -454,7 +455,8 @@ static int gfs2_lookup_root(struct super_block *sb, struct dentry **dptr, struct dentry *dentry; struct inode *inode; - inode = gfs2_inode_lookup(sb, DT_DIR, no_addr, 0); + inode = gfs2_inode_lookup(sb, DT_DIR, no_addr, 0, + GFS2_BLKST_FREE /* ignore */); if (IS_ERR(inode)) { fs_err(sdp, "can't read in %s inode: %ld\n", name, PTR_ERR(inode)); return PTR_ERR(inode); diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index ce7d69a2fdc0..77930ca25303 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -730,7 +730,7 @@ static int gfs2_write_buf_to_page(struct gfs2_inode *ip, unsigned long index, if (PageUptodate(page)) set_buffer_uptodate(bh); if (!buffer_uptodate(bh)) { - ll_rw_block(READ | REQ_META, 1, &bh); + ll_rw_block(REQ_OP_READ, REQ_META, 1, &bh); wait_on_buffer(bh); if (!buffer_uptodate(bh)) goto unlock_out; @@ -883,7 +883,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda) gfs2_write_calc_reserv(ip, sizeof(struct gfs2_quota), &data_blocks, &ind_blocks); - ghs = kcalloc(num_qd, sizeof(struct gfs2_holder), GFP_NOFS); + ghs = kmalloc(num_qd * sizeof(struct gfs2_holder), GFP_NOFS); if (!ghs) return -ENOMEM; diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index 1b645773c98e..113b6095a58d 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -338,7 +338,7 @@ static int foreach_descriptor(struct gfs2_jdesc *jd, unsigned int start, struct gfs2_log_header_host lh; error = get_log_header(jd, start, &lh); if (!error) { - gfs2_replay_incr_blk(sdp, &start); + gfs2_replay_incr_blk(jd, &start); brelse(bh); continue; } @@ -360,7 +360,7 @@ static int foreach_descriptor(struct gfs2_jdesc *jd, unsigned int start, } while (length--) - gfs2_replay_incr_blk(sdp, &start); + gfs2_replay_incr_blk(jd, &start); brelse(bh); } @@ -390,7 +390,7 @@ static int clean_journal(struct gfs2_jdesc *jd, struct gfs2_log_header_host *hea struct buffer_head bh_map = { .b_state = 0, .b_blocknr = 0 }; lblock = head->lh_blkno; - gfs2_replay_incr_blk(sdp, &lblock); + gfs2_replay_incr_blk(jd, &lblock); bh_map.b_size = 1 << ip->i_inode.i_blkbits; error = gfs2_block_map(&ip->i_inode, lblock, &bh_map, 0); if (error) diff --git a/fs/gfs2/recovery.h b/fs/gfs2/recovery.h index 6142836cce96..11fdfab4bf99 100644 --- a/fs/gfs2/recovery.h +++ b/fs/gfs2/recovery.h @@ -14,9 +14,9 @@ extern struct workqueue_struct *gfs_recovery_wq; -static inline void gfs2_replay_incr_blk(struct gfs2_sbd *sdp, unsigned int *blk) +static inline void gfs2_replay_incr_blk(struct gfs2_jdesc *jd, unsigned int *blk) { - if (++*blk == 
sdp->sd_jdesc->jd_blocks) + if (++*blk == jd->jd_blocks) *blk = 0; } diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 99a0bdac8796..86ccc0159393 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -73,8 +73,7 @@ static const char valid_change[16] = { }; static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext, - const struct gfs2_inode *ip, bool nowrap, - const struct gfs2_alloc_parms *ap); + const struct gfs2_inode *ip, bool nowrap); /** @@ -659,6 +658,7 @@ void gfs2_rs_deltree(struct gfs2_blkreserv *rs) if (rgd) { spin_lock(&rgd->rd_rsspin); __rs_deltree(rs); + BUG_ON(rs->rs_free); spin_unlock(&rgd->rd_rsspin); } } @@ -672,10 +672,8 @@ void gfs2_rs_deltree(struct gfs2_blkreserv *rs) void gfs2_rsqa_delete(struct gfs2_inode *ip, atomic_t *wcount) { down_write(&ip->i_rw_mutex); - if ((wcount == NULL) || (atomic_read(wcount) <= 1)) { + if ((wcount == NULL) || (atomic_read(wcount) <= 1)) gfs2_rs_deltree(&ip->i_res); - BUG_ON(ip->i_res.rs_free); - } up_write(&ip->i_rw_mutex); gfs2_qa_delete(ip, wcount); } @@ -723,6 +721,7 @@ void gfs2_clear_rgrpd(struct gfs2_sbd *sdp) gfs2_free_clones(rgd); kfree(rgd->rd_bits); + rgd->rd_bits = NULL; return_all_reservations(rgd); kmem_cache_free(gfs2_rgrpd_cachep, rgd); } @@ -917,9 +916,6 @@ static int read_rindex_entry(struct gfs2_inode *ip) if (error) goto fail; - rgd->rd_gl->gl_object = rgd; - rgd->rd_gl->gl_vm.start = (rgd->rd_addr * bsize) & PAGE_MASK; - rgd->rd_gl->gl_vm.end = PAGE_ALIGN((rgd->rd_addr + rgd->rd_length) * bsize) - 1; rgd->rd_rgl = (struct gfs2_rgrp_lvb *)rgd->rd_gl->gl_lksb.sb_lvbptr; rgd->rd_flags &= ~(GFS2_RDF_UPTODATE | GFS2_RDF_PREFERRED); if (rgd->rd_data > sdp->sd_max_rg_data) @@ -927,14 +923,20 @@ static int read_rindex_entry(struct gfs2_inode *ip) spin_lock(&sdp->sd_rindex_spin); error = rgd_insert(rgd); spin_unlock(&sdp->sd_rindex_spin); - if (!error) + if (!error) { + rgd->rd_gl->gl_object = rgd; + rgd->rd_gl->gl_vm.start = (rgd->rd_addr * bsize) & PAGE_MASK; + rgd->rd_gl->gl_vm.end = PAGE_ALIGN((rgd->rd_addr + + rgd->rd_length) * bsize) - 1; return 0; + } error = 0; /* someone else read in the rgrp; free it and ignore it */ gfs2_glock_put(rgd->rd_gl); fail: kfree(rgd->rd_bits); + rgd->rd_bits = NULL; kmem_cache_free(gfs2_rgrpd_cachep, rgd); return error; } @@ -1511,7 +1513,7 @@ static void rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip, if (WARN_ON(gfs2_rbm_from_block(&rbm, goal))) return; - ret = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, &extlen, ip, true, ap); + ret = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, &extlen, ip, true); if (ret == 0) { rs->rs_rbm = rbm; rs->rs_free = extlen; @@ -1638,7 +1640,6 @@ fail: * @ip: If set, check for reservations * @nowrap: Stop looking at the end of the rgrp, rather than wrapping * around until we've reached the starting point. 
- * @ap: the allocation parameters * * Side effects: * - If looking for free blocks, we set GBF_FULL on each bitmap which @@ -1650,8 +1651,7 @@ fail: */ static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext, - const struct gfs2_inode *ip, bool nowrap, - const struct gfs2_alloc_parms *ap) + const struct gfs2_inode *ip, bool nowrap) { struct buffer_head *bh; int initial_bii; @@ -1772,7 +1772,7 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip while (1) { down_write(&sdp->sd_log_flush_lock); error = gfs2_rbm_find(&rbm, GFS2_BLKST_UNLINKED, NULL, NULL, - true, NULL); + true); up_write(&sdp->sd_log_flush_lock); if (error == -ENOSPC) break; @@ -2099,7 +2099,7 @@ void gfs2_inplace_release(struct gfs2_inode *ip) { struct gfs2_blkreserv *rs = &ip->i_res; - if (rs->rs_rgd_gh.gh_gl) + if (gfs2_holder_initialized(&rs->rs_rgd_gh)) gfs2_glock_dq_uninit(&rs->rs_rgd_gh); } @@ -2329,12 +2329,11 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks, int error; gfs2_set_alloc_start(&rbm, ip, dinode); - error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, NULL, ip, false, NULL); + error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, NULL, ip, false); if (error == -ENOSPC) { gfs2_set_alloc_start(&rbm, ip, dinode); - error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, NULL, NULL, false, - NULL); + error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, NULL, NULL, false); } /* Since all blocks are reserved in advance, this shouldn't happen */ @@ -2600,7 +2599,7 @@ void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state) { unsigned int x; - rlist->rl_ghs = kcalloc(rlist->rl_rgrps, sizeof(struct gfs2_holder), + rlist->rl_ghs = kmalloc(rlist->rl_rgrps * sizeof(struct gfs2_holder), GFP_NOFS | __GFP_NOFAIL); for (x = 0; x < rlist->rl_rgrps; x++) gfs2_holder_init(rlist->rl_rgd[x]->rd_gl, diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 9b2ff353e45f..3a7e60bb39f8 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -855,7 +855,7 @@ static int gfs2_make_fs_ro(struct gfs2_sbd *sdp) wait_event(sdp->sd_reserving_log_wait, atomic_read(&sdp->sd_reserving_log) == 0); gfs2_assert_warn(sdp, atomic_read(&sdp->sd_log_blks_free) == sdp->sd_jdesc->jd_blocks); - if (freeze_gh.gh_gl) + if (gfs2_holder_initialized(&freeze_gh)) gfs2_glock_dq_uninit(&freeze_gh); gfs2_quota_cleanup(sdp); @@ -1033,7 +1033,7 @@ static int gfs2_unfreeze(struct super_block *sb) mutex_lock(&sdp->sd_freeze_mutex); if (atomic_read(&sdp->sd_freeze_state) != SFS_FROZEN || - sdp->sd_freeze_gh.gh_gl == NULL) { + !gfs2_holder_initialized(&sdp->sd_freeze_gh)) { mutex_unlock(&sdp->sd_freeze_mutex); return 0; } @@ -1084,9 +1084,11 @@ static int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host int error = 0, err; memset(sc, 0, sizeof(struct gfs2_statfs_change_host)); - gha = kcalloc(slots, sizeof(struct gfs2_holder), GFP_KERNEL); + gha = kmalloc(slots * sizeof(struct gfs2_holder), GFP_KERNEL); if (!gha) return -ENOMEM; + for (x = 0; x < slots; x++) + gfs2_holder_mark_uninitialized(gha + x); rgd_next = gfs2_rgrpd_get_first(sdp); @@ -1096,7 +1098,7 @@ static int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host for (x = 0; x < slots; x++) { gh = gha + x; - if (gh->gh_gl && gfs2_glock_poll(gh)) { + if (gfs2_holder_initialized(gh) && gfs2_glock_poll(gh)) { err = gfs2_glock_wait(gh); if (err) { gfs2_holder_uninit(gh); @@ -1109,7 +1111,7 @@ static int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host } } - if (gh->gh_gl) + if (gfs2_holder_initialized(gh)) 
done = 0; else if (rgd_next && !error) { error = gfs2_glock_nq_init(rgd_next->rd_gl, @@ -1304,9 +1306,11 @@ static int gfs2_drop_inode(struct inode *inode) { struct gfs2_inode *ip = GFS2_I(inode); - if (!test_bit(GIF_FREE_VFS_INODE, &ip->i_flags) && inode->i_nlink) { + if (!test_bit(GIF_FREE_VFS_INODE, &ip->i_flags) && + inode->i_nlink && + gfs2_holder_initialized(&ip->i_iopen_gh)) { struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl; - if (gl && test_bit(GLF_DEMOTE, &gl->gl_flags)) + if (test_bit(GLF_DEMOTE, &gl->gl_flags)) clear_nlink(inode); } return generic_drop_inode(inode); @@ -1551,7 +1555,7 @@ static void gfs2_evict_inode(struct inode *inode) goto out_truncate; } - if (ip->i_iopen_gh.gh_gl && + if (gfs2_holder_initialized(&ip->i_iopen_gh) && test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) { ip->i_iopen_gh.gh_flags |= GL_NOCACHE; gfs2_glock_dq_wait(&ip->i_iopen_gh); @@ -1610,7 +1614,7 @@ out_unlock: if (gfs2_rs_active(&ip->i_res)) gfs2_rs_deltree(&ip->i_res); - if (ip->i_iopen_gh.gh_gl) { + if (gfs2_holder_initialized(&ip->i_iopen_gh)) { if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) { ip->i_iopen_gh.gh_flags |= GL_NOCACHE; gfs2_glock_dq_wait(&ip->i_iopen_gh); @@ -1632,7 +1636,7 @@ out: gfs2_glock_add_to_lru(ip->i_gl); gfs2_glock_put(ip->i_gl); ip->i_gl = NULL; - if (ip->i_iopen_gh.gh_gl) { + if (gfs2_holder_initialized(&ip->i_iopen_gh)) { ip->i_iopen_gh.gh_gl->gl_object = NULL; ip->i_iopen_gh.gh_flags |= GL_NOCACHE; gfs2_glock_dq_wait(&ip->i_iopen_gh); diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c index cf645835710f..aee4485ad8a9 100644 --- a/fs/gfs2/util.c +++ b/fs/gfs2/util.c @@ -68,6 +68,7 @@ int gfs2_lm_withdraw(struct gfs2_sbd *sdp, const char *fmt, ...) fs_err(sdp, "telling LM to unmount\n"); lm->lm_unmount(sdp); } + set_bit(SDF_SKIP_DLM_UNLOCK, &sdp->sd_flags); fs_err(sdp, "withdrawn\n"); dump_stack(); } diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index f42ab53bd30d..3a2853504084 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -1251,10 +1251,10 @@ int __gfs2_xattr_set(struct inode *inode, const char *name, } static int gfs2_xattr_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { - struct inode *inode = d_inode(dentry); struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_holder gh; int ret; diff --git a/fs/hfs/attr.c b/fs/hfs/attr.c index 064f92f17efc..d9a86919fdf6 100644 --- a/fs/hfs/attr.c +++ b/fs/hfs/attr.c @@ -13,10 +13,10 @@ #include "hfs_fs.h" #include "btree.h" -int hfs_setxattr(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) +int hfs_setxattr(struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { - struct inode *inode = d_inode(dentry); struct hfs_find_data fd; hfs_cat_rec rec; struct hfs_cat_file *file; diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h index fa3eed86837c..ee2f385811c8 100644 --- a/fs/hfs/hfs_fs.h +++ b/fs/hfs/hfs_fs.h @@ -212,7 +212,7 @@ extern void hfs_evict_inode(struct inode *); extern void hfs_delete_inode(struct inode *); /* attr.c */ -extern int hfs_setxattr(struct dentry *dentry, const char *name, +extern int hfs_setxattr(struct dentry *dentry, struct inode *inode, const char *name, const void *value, size_t size, int flags); extern ssize_t hfs_getxattr(struct dentry *dentry, struct inode *inode, const char *name, void *value, size_t size); diff --git 
a/fs/hfs/inode.c b/fs/hfs/inode.c index 8eed66af5b82..02a3845363f7 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -128,7 +128,7 @@ static ssize_t hfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; - struct inode *inode = file_inode(file)->i_mapping->host; + struct inode *inode = mapping->host; size_t count = iov_iter_count(iter); ssize_t ret; diff --git a/fs/hfs/string.c b/fs/hfs/string.c index 85b610c3909f..ec9f164c35a5 100644 --- a/fs/hfs/string.c +++ b/fs/hfs/string.c @@ -59,7 +59,7 @@ int hfs_hash_dentry(const struct dentry *dentry, struct qstr *this) if (len > HFS_NAMELEN) len = HFS_NAMELEN; - hash = init_name_hash(); + hash = init_name_hash(dentry); for (; len; len--) hash = partial_name_hash(caseorder[*name++], hash); this->hash = end_name_hash(hash); diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index fdc3446d934a..047245bd2cd6 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -526,7 +526,7 @@ int hfsplus_compare_dentry(const struct dentry *parent, /* wrapper.c */ int hfsplus_submit_bio(struct super_block *sb, sector_t sector, void *buf, - void **data, int rw); + void **data, int op, int op_flags); int hfsplus_read_wrapper(struct super_block *sb); /* time macros */ diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index ef9fefe364a6..19462d773fe2 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -126,7 +126,7 @@ static ssize_t hfsplus_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; - struct inode *inode = file_inode(file)->i_mapping->host; + struct inode *inode = mapping->host; size_t count = iov_iter_count(iter); ssize_t ret; diff --git a/fs/hfsplus/part_tbl.c b/fs/hfsplus/part_tbl.c index eb355d81e279..63164ebc52fa 100644 --- a/fs/hfsplus/part_tbl.c +++ b/fs/hfsplus/part_tbl.c @@ -112,7 +112,8 @@ static int hfs_parse_new_pmap(struct super_block *sb, void *buf, if ((u8 *)pm - (u8 *)buf >= buf_size) { res = hfsplus_submit_bio(sb, *part_start + HFS_PMAP_BLK + i, - buf, (void **)&pm, READ); + buf, (void **)&pm, REQ_OP_READ, + 0); if (res) return res; } @@ -136,7 +137,7 @@ int hfs_part_find(struct super_block *sb, return -ENOMEM; res = hfsplus_submit_bio(sb, *part_start + HFS_PMAP_BLK, - buf, &data, READ); + buf, &data, REQ_OP_READ, 0); if (res) goto out; diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 755bf30ba1ce..11854dd84572 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -220,7 +220,8 @@ static int hfsplus_sync_fs(struct super_block *sb, int wait) error2 = hfsplus_submit_bio(sb, sbi->part_start + HFSPLUS_VOLHEAD_SECTOR, - sbi->s_vhdr_buf, NULL, WRITE_SYNC); + sbi->s_vhdr_buf, NULL, REQ_OP_WRITE, + WRITE_SYNC); if (!error) error = error2; if (!write_backup) @@ -228,7 +229,8 @@ static int hfsplus_sync_fs(struct super_block *sb, int wait) error2 = hfsplus_submit_bio(sb, sbi->part_start + sbi->sect_count - 2, - sbi->s_backup_vhdr_buf, NULL, WRITE_SYNC); + sbi->s_backup_vhdr_buf, NULL, REQ_OP_WRITE, + WRITE_SYNC); if (!error) error2 = error; out: diff --git a/fs/hfsplus/unicode.c b/fs/hfsplus/unicode.c index e8ef121a4d8b..c13c8a240be3 100644 --- a/fs/hfsplus/unicode.c +++ b/fs/hfsplus/unicode.c @@ -346,7 +346,7 @@ int hfsplus_hash_dentry(const struct dentry *dentry, struct qstr *str) casefold = test_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags); decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags); - hash = 
init_name_hash(); + hash = init_name_hash(dentry); astr = str->name; len = str->len; while (len > 0) { diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c index cc6235671437..ebb85e5f6549 100644 --- a/fs/hfsplus/wrapper.c +++ b/fs/hfsplus/wrapper.c @@ -30,7 +30,8 @@ struct hfsplus_wd { * @sector: block to read or write, for blocks of HFSPLUS_SECTOR_SIZE bytes * @buf: buffer for I/O * @data: output pointer for location of requested data - * @rw: direction of I/O + * @op: direction of I/O + * @op_flags: request op flags * * The unit of I/O is hfsplus_min_io_size(sb), which may be bigger than * HFSPLUS_SECTOR_SIZE, and @buf must be sized accordingly. On reads @@ -44,7 +45,7 @@ struct hfsplus_wd { * will work correctly. */ int hfsplus_submit_bio(struct super_block *sb, sector_t sector, - void *buf, void **data, int rw) + void *buf, void **data, int op, int op_flags) { struct bio *bio; int ret = 0; @@ -65,8 +66,9 @@ int hfsplus_submit_bio(struct super_block *sb, sector_t sector, bio = bio_alloc(GFP_NOIO, 1); bio->bi_iter.bi_sector = sector; bio->bi_bdev = sb->s_bdev; + bio_set_op_attrs(bio, op, op_flags); - if (!(rw & WRITE) && data) + if (op != WRITE && data) *data = (u8 *)buf + offset; while (io_size > 0) { @@ -83,7 +85,7 @@ int hfsplus_submit_bio(struct super_block *sb, sector_t sector, buf = (u8 *)buf + len; } - ret = submit_bio_wait(rw, bio); + ret = submit_bio_wait(bio); out: bio_put(bio); return ret < 0 ? ret : 0; @@ -181,7 +183,7 @@ int hfsplus_read_wrapper(struct super_block *sb) reread: error = hfsplus_submit_bio(sb, part_start + HFSPLUS_VOLHEAD_SECTOR, sbi->s_vhdr_buf, (void **)&sbi->s_vhdr, - READ); + REQ_OP_READ, 0); if (error) goto out_free_backup_vhdr; @@ -213,7 +215,8 @@ reread: error = hfsplus_submit_bio(sb, part_start + part_size - 2, sbi->s_backup_vhdr_buf, - (void **)&sbi->s_backup_vhdr, READ); + (void **)&sbi->s_backup_vhdr, REQ_OP_READ, + 0); if (error) goto out_free_backup_vhdr; diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c index 4f118d282a7a..d37bb88dc746 100644 --- a/fs/hfsplus/xattr.c +++ b/fs/hfsplus/xattr.c @@ -424,7 +424,7 @@ static int copy_name(char *buffer, const char *xattr_name, int name_len) return len; } -int hfsplus_setxattr(struct dentry *dentry, const char *name, +int hfsplus_setxattr(struct inode *inode, const char *name, const void *value, size_t size, int flags, const char *prefix, size_t prefixlen) { @@ -437,8 +437,7 @@ int hfsplus_setxattr(struct dentry *dentry, const char *name, return -ENOMEM; strcpy(xattr_name, prefix); strcpy(xattr_name + prefixlen, name); - res = __hfsplus_setxattr(d_inode(dentry), xattr_name, value, size, - flags); + res = __hfsplus_setxattr(inode, xattr_name, value, size, flags); kfree(xattr_name); return res; } @@ -864,8 +863,9 @@ static int hfsplus_osx_getxattr(const struct xattr_handler *handler, } static int hfsplus_osx_setxattr(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *buffer, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *buffer, + size_t size, int flags) { /* * Don't allow setting properly prefixed attributes @@ -880,7 +880,7 @@ static int hfsplus_osx_setxattr(const struct xattr_handler *handler, * creates), so we pass the name through unmodified (after * ensuring it doesn't conflict with another namespace). 
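* (The xattr_handler is now handed the inode directly, so the unused dentry argument no longer needs translating via d_inode().)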
*/ - return __hfsplus_setxattr(d_inode(dentry), name, buffer, size, flags); + return __hfsplus_setxattr(inode, name, buffer, size, flags); } const struct xattr_handler hfsplus_xattr_osx_handler = { diff --git a/fs/hfsplus/xattr.h b/fs/hfsplus/xattr.h index d04ba6f58df2..68f6b539371f 100644 --- a/fs/hfsplus/xattr.h +++ b/fs/hfsplus/xattr.h @@ -21,7 +21,7 @@ extern const struct xattr_handler *hfsplus_xattr_handlers[]; int __hfsplus_setxattr(struct inode *inode, const char *name, const void *value, size_t size, int flags); -int hfsplus_setxattr(struct dentry *dentry, const char *name, +int hfsplus_setxattr(struct inode *inode, const char *name, const void *value, size_t size, int flags, const char *prefix, size_t prefixlen); diff --git a/fs/hfsplus/xattr_security.c b/fs/hfsplus/xattr_security.c index ae2ca8c2e335..37b3efa733ef 100644 --- a/fs/hfsplus/xattr_security.c +++ b/fs/hfsplus/xattr_security.c @@ -23,10 +23,11 @@ static int hfsplus_security_getxattr(const struct xattr_handler *handler, } static int hfsplus_security_setxattr(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *buffer, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *buffer, + size_t size, int flags) { - return hfsplus_setxattr(dentry, name, buffer, size, flags, + return hfsplus_setxattr(inode, name, buffer, size, flags, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN); } diff --git a/fs/hfsplus/xattr_trusted.c b/fs/hfsplus/xattr_trusted.c index eae2947060aa..94519d6c627d 100644 --- a/fs/hfsplus/xattr_trusted.c +++ b/fs/hfsplus/xattr_trusted.c @@ -21,10 +21,11 @@ static int hfsplus_trusted_getxattr(const struct xattr_handler *handler, } static int hfsplus_trusted_setxattr(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *buffer, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *buffer, + size_t size, int flags) { - return hfsplus_setxattr(dentry, name, buffer, size, flags, + return hfsplus_setxattr(inode, name, buffer, size, flags, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN); } diff --git a/fs/hfsplus/xattr_user.c b/fs/hfsplus/xattr_user.c index 3c9eec3e4c7b..fae6c0ea0030 100644 --- a/fs/hfsplus/xattr_user.c +++ b/fs/hfsplus/xattr_user.c @@ -21,10 +21,11 @@ static int hfsplus_user_getxattr(const struct xattr_handler *handler, } static int hfsplus_user_setxattr(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *buffer, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *buffer, + size_t size, int flags) { - return hfsplus_setxattr(dentry, name, buffer, size, flags, + return hfsplus_setxattr(inode, name, buffer, size, flags, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); } diff --git a/fs/hpfs/dentry.c b/fs/hpfs/dentry.c index fa27980f2229..60e6d334d79a 100644 --- a/fs/hpfs/dentry.c +++ b/fs/hpfs/dentry.c @@ -26,7 +26,7 @@ static int hpfs_hash_dentry(const struct dentry *dentry, struct qstr *qstr) /*return -ENOENT;*/ x: - hash = init_name_hash(); + hash = init_name_hash(dentry); for (i = 0; i < l; i++) hash = partial_name_hash(hpfs_upcase(hpfs_sb(dentry->d_sb)->sb_cp_table,qstr->name[i]), hash); qstr->hash = end_name_hash(hash); diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index 458cf463047b..82067ca22f2b 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -15,6 +15,7 @@ #include <linux/sched.h> #include <linux/bitmap.h> #include 
<linux/slab.h> +#include <linux/seq_file.h> /* Mark the filesystem dirty, so that chkdsk checks it when os/2 booted */ @@ -453,10 +454,6 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data) int lowercase, eas, chk, errs, chkdsk, timeshift; int o; struct hpfs_sb_info *sbi = hpfs_sb(s); - char *new_opts = kstrdup(data, GFP_KERNEL); - - if (!new_opts) - return -ENOMEM; sync_filesystem(s); @@ -493,17 +490,44 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data) if (!(*flags & MS_RDONLY)) mark_dirty(s, 1); - replace_mount_options(s, new_opts); - hpfs_unlock(s); return 0; out_err: hpfs_unlock(s); - kfree(new_opts); return -EINVAL; } +static int hpfs_show_options(struct seq_file *seq, struct dentry *root) +{ + struct hpfs_sb_info *sbi = hpfs_sb(root->d_sb); + + seq_printf(seq, ",uid=%u", from_kuid_munged(&init_user_ns, sbi->sb_uid)); + seq_printf(seq, ",gid=%u", from_kgid_munged(&init_user_ns, sbi->sb_gid)); + seq_printf(seq, ",umask=%03o", (~sbi->sb_mode & 0777)); + if (sbi->sb_lowercase) + seq_printf(seq, ",case=lower"); + if (!sbi->sb_chk) + seq_printf(seq, ",check=none"); + if (sbi->sb_chk == 2) + seq_printf(seq, ",check=strict"); + if (!sbi->sb_err) + seq_printf(seq, ",errors=continue"); + if (sbi->sb_err == 2) + seq_printf(seq, ",errors=panic"); + if (!sbi->sb_chkdsk) + seq_printf(seq, ",chkdsk=no"); + if (sbi->sb_chkdsk == 2) + seq_printf(seq, ",chkdsk=always"); + if (!sbi->sb_eas) + seq_printf(seq, ",eas=no"); + if (sbi->sb_eas == 1) + seq_printf(seq, ",eas=ro"); + if (sbi->sb_timeshift) + seq_printf(seq, ",timeshift=%d", sbi->sb_timeshift); + return 0; +} + /* Super operations */ static const struct super_operations hpfs_sops = @@ -514,7 +538,7 @@ static const struct super_operations hpfs_sops = .put_super = hpfs_put_super, .statfs = hpfs_statfs, .remount_fs = hpfs_remount_fs, - .show_options = generic_show_options, + .show_options = hpfs_show_options, }; static int hpfs_fill_super(struct super_block *s, void *options, int silent) @@ -537,8 +561,6 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent) int o; - save_mount_options(s, options); - sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) { return -ENOMEM; diff --git a/fs/inode.c b/fs/inode.c index 4ccbc21b30ce..e171f7b5f9e4 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -365,6 +365,7 @@ void inode_init_once(struct inode *inode) INIT_HLIST_NODE(&inode->i_hash); INIT_LIST_HEAD(&inode->i_devices); INIT_LIST_HEAD(&inode->i_io_list); + INIT_LIST_HEAD(&inode->i_wb_list); INIT_LIST_HEAD(&inode->i_lru); address_space_init_once(&inode->i_data); i_size_ordered_init(inode); @@ -507,6 +508,7 @@ void clear_inode(struct inode *inode) BUG_ON(!list_empty(&inode->i_data.private_list)); BUG_ON(!(inode->i_state & I_FREEING)); BUG_ON(inode->i_state & I_CLEAR); + BUG_ON(!list_empty(&inode->i_wb_list)); /* don't need i_lock here, no concurrent mods to i_state */ inode->i_state = I_FREEING | I_CLEAR; } diff --git a/fs/internal.h b/fs/internal.h index b71deeecea17..cef0913e5d41 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -11,6 +11,7 @@ struct super_block; struct file_system_type; +struct iomap; struct linux_binprm; struct path; struct mount; @@ -39,6 +40,8 @@ static inline int __sync_blockdev(struct block_device *bdev, int wait) * buffer.c */ extern void guard_bio_eod(int rw, struct bio *bio); +extern int __block_write_begin_int(struct page *page, loff_t pos, unsigned len, + get_block_t *get_block, struct iomap *iomap); /* * char_dev.c @@ -130,6 +133,7 @@ extern int 
invalidate_inodes(struct super_block *, bool); extern struct dentry *__d_alloc(struct super_block *, const struct qstr *); extern int d_set_mounted(struct dentry *dentry); extern long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc); +extern struct dentry *d_alloc_cursor(struct dentry *); /* * read_write.c */ diff --git a/fs/iomap.c b/fs/iomap.c new file mode 100644 index 000000000000..48141b8eff5f --- /dev/null +++ b/fs/iomap.c @@ -0,0 +1,497 @@ +/* + * Copyright (C) 2010 Red Hat, Inc. + * Copyright (c) 2016 Christoph Hellwig. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#include <linux/module.h> +#include <linux/compiler.h> +#include <linux/fs.h> +#include <linux/iomap.h> +#include <linux/uaccess.h> +#include <linux/gfp.h> +#include <linux/mm.h> +#include <linux/swap.h> +#include <linux/pagemap.h> +#include <linux/file.h> +#include <linux/uio.h> +#include <linux/backing-dev.h> +#include <linux/buffer_head.h> +#include <linux/dax.h> +#include "internal.h" + +typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len, + void *data, struct iomap *iomap); + +/* + * Execute an iomap write on a segment of the mapping that spans a + * contiguous range of pages that have identical block mapping state. + * + * This avoids the need to map pages individually, do individual allocations + * for each page and most importantly avoids the need for filesystem specific + * locking per page. Instead, all the operations are amortised over the entire + * range of pages. It is assumed that the filesystems will lock whatever + * resources they require in the iomap_begin call, and release them in the + * iomap_end call. + */ +static loff_t +iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags, + struct iomap_ops *ops, void *data, iomap_actor_t actor) +{ + struct iomap iomap = { 0 }; + loff_t written = 0, ret; + + /* + * Need to map a range from start position for length bytes. This can + * span multiple pages - it is only guaranteed to return a range of a + * single type of pages (e.g. all into a hole, all mapped or all + * unwritten). Failure at this point has nothing to undo. + * + * If allocation is required for this range, reserve the space now so + * that the allocation is guaranteed to succeed later on. Once we copy + * the data into the page cache pages, then we cannot fail otherwise we + * expose transient stale data. If the reserve fails, we can safely + * back out at this point as there is nothing to undo. + */ + ret = ops->iomap_begin(inode, pos, length, flags, &iomap); + if (ret) + return ret; + if (WARN_ON(iomap.offset > pos)) + return -EIO; + + /* + * Cut down the length to the one actually provided by the filesystem, + * as it might not be able to give us the whole size that we requested. + */ + if (iomap.offset + iomap.length < pos + length) + length = iomap.offset + iomap.length - pos; + + /* + * Now that we have guaranteed that the space allocation will succeed, + * we can do the copy-in page by page without having to worry about + * failures exposing transient data.
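+ * The actor reports any shortfall in its return value, and iomap_end + * below is told exactly how much was written.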
+ */ + written = actor(inode, pos, length, data, &iomap); + + /* + * Now the data has been copied, commit the range we've copied. This + * should not fail unless the filesystem has had a fatal error. + */ + ret = ops->iomap_end(inode, pos, length, written > 0 ? written : 0, + flags, &iomap); + + return written ? written : ret; +} + +static void +iomap_write_failed(struct inode *inode, loff_t pos, unsigned len) +{ + loff_t i_size = i_size_read(inode); + + /* + * Only truncate newly allocated pages beyond EOF, even if the + * write started inside the existing inode size. + */ + if (pos + len > i_size) + truncate_pagecache_range(inode, max(pos, i_size), pos + len); +} + +static int +iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags, + struct page **pagep, struct iomap *iomap) +{ + pgoff_t index = pos >> PAGE_SHIFT; + struct page *page; + int status = 0; + + BUG_ON(pos + len > iomap->offset + iomap->length); + + page = grab_cache_page_write_begin(inode->i_mapping, index, flags); + if (!page) + return -ENOMEM; + + status = __block_write_begin_int(page, pos, len, NULL, iomap); + if (unlikely(status)) { + unlock_page(page); + put_page(page); + page = NULL; + + iomap_write_failed(inode, pos, len); + } + + *pagep = page; + return status; +} + +static int +iomap_write_end(struct inode *inode, loff_t pos, unsigned len, + unsigned copied, struct page *page) +{ + int ret; + + ret = generic_write_end(NULL, inode->i_mapping, pos, len, + copied, page, NULL); + if (ret < len) + iomap_write_failed(inode, pos, len); + return ret; +} + +static loff_t +iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data, + struct iomap *iomap) +{ + struct iov_iter *i = data; + long status = 0; + ssize_t written = 0; + unsigned int flags = AOP_FLAG_NOFS; + + /* + * Copies from kernel address space cannot fail (NFSD is a big user). + */ + if (!iter_is_iovec(i)) + flags |= AOP_FLAG_UNINTERRUPTIBLE; + + do { + struct page *page; + unsigned long offset; /* Offset into pagecache page */ + unsigned long bytes; /* Bytes to write to page */ + size_t copied; /* Bytes copied from user */ + + offset = (pos & (PAGE_SIZE - 1)); + bytes = min_t(unsigned long, PAGE_SIZE - offset, + iov_iter_count(i)); +again: + if (bytes > length) + bytes = length; + + /* + * Bring in the user page that we will copy from _first_. + * Otherwise there's a nasty deadlock on copying from the + * same page as we're writing to, without it being marked + * up-to-date. + * + * Not only is this an optimisation, but it is also required + * to check that the address is actually valid, when atomic + * usercopies are used, below. + */ + if (unlikely(iov_iter_fault_in_readable(i, bytes))) { + status = -EFAULT; + break; + } + + status = iomap_write_begin(inode, pos, bytes, flags, &page, + iomap); + if (unlikely(status)) + break; + + if (mapping_writably_mapped(inode->i_mapping)) + flush_dcache_page(page); + + pagefault_disable(); + copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); + pagefault_enable(); + + flush_dcache_page(page); + mark_page_accessed(page); + + status = iomap_write_end(inode, pos, bytes, copied, page); + if (unlikely(status < 0)) + break; + copied = status; + + cond_resched(); + + iov_iter_advance(i, copied); + if (unlikely(copied == 0)) { + /* + * If we were unable to copy any data at all, we must + * fall back to a single segment length write. + * + * If we didn't fall back here, we could livelock + * because not all segments in the iov can be copied at + * once without a pagefault.
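+ * Shrinking the next attempt to one segment guarantees forward + * progress once the fault-in above has made that segment readable.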
+ */ + bytes = min_t(unsigned long, PAGE_SIZE - offset, + iov_iter_single_seg_count(i)); + goto again; + } + pos += copied; + written += copied; + length -= copied; + + balance_dirty_pages_ratelimited(inode->i_mapping); + } while (iov_iter_count(i) && length); + + return written ? written : status; +} + +ssize_t +iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *iter, + struct iomap_ops *ops) +{ + struct inode *inode = iocb->ki_filp->f_mapping->host; + loff_t pos = iocb->ki_pos, ret = 0, written = 0; + + while (iov_iter_count(iter)) { + ret = iomap_apply(inode, pos, iov_iter_count(iter), + IOMAP_WRITE, ops, iter, iomap_write_actor); + if (ret <= 0) + break; + pos += ret; + written += ret; + } + + return written ? written : ret; +} +EXPORT_SYMBOL_GPL(iomap_file_buffered_write); + +static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset, + unsigned bytes, struct iomap *iomap) +{ + struct page *page; + int status; + + status = iomap_write_begin(inode, pos, bytes, + AOP_FLAG_UNINTERRUPTIBLE | AOP_FLAG_NOFS, &page, iomap); + if (status) + return status; + + zero_user(page, offset, bytes); + mark_page_accessed(page); + + return iomap_write_end(inode, pos, bytes, bytes, page); +} + +static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes, + struct iomap *iomap) +{ + sector_t sector = iomap->blkno + + (((pos & ~(PAGE_SIZE - 1)) - iomap->offset) >> 9); + + return __dax_zero_page_range(iomap->bdev, sector, offset, bytes); +} + +static loff_t +iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count, + void *data, struct iomap *iomap) +{ + bool *did_zero = data; + loff_t written = 0; + int status; + + /* already zeroed? we're done. */ + if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN) + return count; + + do { + unsigned offset, bytes; + + offset = pos & (PAGE_SIZE - 1); /* Within page */ + bytes = min_t(unsigned, PAGE_SIZE - offset, count); + + if (IS_DAX(inode)) + status = iomap_dax_zero(pos, offset, bytes, iomap); + else + status = iomap_zero(inode, pos, offset, bytes, iomap); + if (status < 0) + return status; + + pos += bytes; + count -= bytes; + written += bytes; + if (did_zero) + *did_zero = true; + } while (count > 0); + + return written; +} + +int +iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, + struct iomap_ops *ops) +{ + loff_t ret; + + while (len > 0) { + ret = iomap_apply(inode, pos, len, IOMAP_ZERO, + ops, did_zero, iomap_zero_range_actor); + if (ret <= 0) + return ret; + + pos += ret; + len -= ret; + } + + return 0; +} +EXPORT_SYMBOL_GPL(iomap_zero_range); + +int +iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, + struct iomap_ops *ops) +{ + unsigned blocksize = (1 << inode->i_blkbits); + unsigned off = pos & (blocksize - 1); + + /* Block boundary? 
Nothing to do */ + if (!off) + return 0; + return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops); +} +EXPORT_SYMBOL_GPL(iomap_truncate_page); + +static loff_t +iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length, + void *data, struct iomap *iomap) +{ + struct page *page = data; + int ret; + + ret = __block_write_begin_int(page, pos & ~PAGE_MASK, length, + NULL, iomap); + if (ret) + return ret; + + block_commit_write(page, 0, length); + return length; +} + +int iomap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, + struct iomap_ops *ops) +{ + struct page *page = vmf->page; + struct inode *inode = file_inode(vma->vm_file); + unsigned long length; + loff_t offset, size; + ssize_t ret; + + lock_page(page); + size = i_size_read(inode); + if ((page->mapping != inode->i_mapping) || + (page_offset(page) > size)) { + /* We overload EFAULT to mean page got truncated */ + ret = -EFAULT; + goto out_unlock; + } + + /* page is wholly or partially inside EOF */ + if (((page->index + 1) << PAGE_SHIFT) > size) + length = size & ~PAGE_MASK; + else + length = PAGE_SIZE; + + offset = page_offset(page); + while (length > 0) { + ret = iomap_apply(inode, offset, length, IOMAP_WRITE, + ops, page, iomap_page_mkwrite_actor); + if (unlikely(ret <= 0)) + goto out_unlock; + offset += ret; + length -= ret; + } + + set_page_dirty(page); + wait_for_stable_page(page); + return 0; +out_unlock: + unlock_page(page); + return ret; +} +EXPORT_SYMBOL_GPL(iomap_page_mkwrite); + +struct fiemap_ctx { + struct fiemap_extent_info *fi; + struct iomap prev; +}; + +static int iomap_to_fiemap(struct fiemap_extent_info *fi, + struct iomap *iomap, u32 flags) +{ + switch (iomap->type) { + case IOMAP_HOLE: + /* skip holes */ + return 0; + case IOMAP_DELALLOC: + flags |= FIEMAP_EXTENT_DELALLOC | FIEMAP_EXTENT_UNKNOWN; + break; + case IOMAP_UNWRITTEN: + flags |= FIEMAP_EXTENT_UNWRITTEN; + break; + case IOMAP_MAPPED: + break; + } + + return fiemap_fill_next_extent(fi, iomap->offset, + iomap->blkno != IOMAP_NULL_BLOCK ? 
iomap->blkno << 9: 0, + iomap->length, flags | FIEMAP_EXTENT_MERGED); + +} + +static loff_t +iomap_fiemap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, + struct iomap *iomap) +{ + struct fiemap_ctx *ctx = data; + loff_t ret = length; + + if (iomap->type == IOMAP_HOLE) + return length; + + ret = iomap_to_fiemap(ctx->fi, &ctx->prev, 0); + ctx->prev = *iomap; + switch (ret) { + case 0: /* success */ + return length; + case 1: /* extent array full */ + return 0; + default: + return ret; + } +} + +int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi, + loff_t start, loff_t len, struct iomap_ops *ops) +{ + struct fiemap_ctx ctx; + loff_t ret; + + memset(&ctx, 0, sizeof(ctx)); + ctx.fi = fi; + ctx.prev.type = IOMAP_HOLE; + + ret = fiemap_check_flags(fi, FIEMAP_FLAG_SYNC); + if (ret) + return ret; + + ret = filemap_write_and_wait(inode->i_mapping); + if (ret) + return ret; + + while (len > 0) { + ret = iomap_apply(inode, start, len, 0, ops, &ctx, + iomap_fiemap_actor); + if (ret < 0) + return ret; + if (ret == 0) + break; + + start += ret; + len -= ret; + } + + if (ctx.prev.type != IOMAP_HOLE) { + ret = iomap_to_fiemap(fi, &ctx.prev, FIEMAP_EXTENT_LAST); + if (ret < 0) + return ret; + } + + return 0; +} +EXPORT_SYMBOL_GPL(iomap_fiemap); diff --git a/fs/isofs/compress.c b/fs/isofs/compress.c index 2e4e834d1a98..44af14b2e916 100644 --- a/fs/isofs/compress.c +++ b/fs/isofs/compress.c @@ -81,7 +81,7 @@ static loff_t zisofs_uncompress_block(struct inode *inode, loff_t block_start, blocknum = block_start >> bufshift; memset(bhs, 0, (needblocks + 1) * sizeof(struct buffer_head *)); haveblocks = isofs_get_blocks(inode, blocknum, bhs, needblocks); - ll_rw_block(READ, haveblocks, bhs); + ll_rw_block(REQ_OP_READ, 0, haveblocks, bhs); curbh = 0; curpage = 0; @@ -361,7 +361,6 @@ static int zisofs_readpage(struct file *file, struct page *page) const struct address_space_operations zisofs_aops = { .readpage = zisofs_readpage, - /* No sync_page operation supported? */ /* No bmap operation supported */ }; diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 131dedc920d8..761fade7680f 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -174,7 +174,7 @@ struct iso9660_options{ * Compute the hash for the isofs name corresponding to the dentry. */ static int -isofs_hashi_common(struct qstr *qstr, int ms) +isofs_hashi_common(const struct dentry *dentry, struct qstr *qstr, int ms) { const char *name; int len; @@ -188,7 +188,7 @@ isofs_hashi_common(struct qstr *qstr, int ms) len--; } - hash = init_name_hash(); + hash = init_name_hash(dentry); while (len--) { c = tolower(*name++); hash = partial_name_hash(c, hash); @@ -231,7 +231,7 @@ static int isofs_dentry_cmp_common( static int isofs_hashi(const struct dentry *dentry, struct qstr *qstr) { - return isofs_hashi_common(qstr, 0); + return isofs_hashi_common(dentry, qstr, 0); } static int @@ -246,7 +246,7 @@ isofs_dentry_cmpi(const struct dentry *parent, const struct dentry *dentry, * Compute the hash for the isofs name corresponding to the dentry. 
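* The dentry is passed down so init_name_hash()/full_name_hash() can salt the hash per directory, matching the new signatures used throughout this series.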
*/ static int -isofs_hash_common(struct qstr *qstr, int ms) +isofs_hash_common(const struct dentry *dentry, struct qstr *qstr, int ms) { const char *name; int len; @@ -258,7 +258,7 @@ isofs_hash_common(struct qstr *qstr, int ms) len--; } - qstr->hash = full_name_hash(name, len); + qstr->hash = full_name_hash(dentry, name, len); return 0; } @@ -266,13 +266,13 @@ isofs_hash_common(struct qstr *qstr, int ms) static int isofs_hash_ms(const struct dentry *dentry, struct qstr *qstr) { - return isofs_hash_common(qstr, 1); + return isofs_hash_common(dentry, qstr, 1); } static int isofs_hashi_ms(const struct dentry *dentry, struct qstr *qstr) { - return isofs_hashi_common(qstr, 1); + return isofs_hashi_common(dentry, qstr, 1); } static int diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 2ad98d6e19f4..5bb565f9989c 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -124,7 +124,7 @@ static int journal_submit_commit_record(journal_t *journal, struct commit_header *tmp; struct buffer_head *bh; int ret; - struct timespec now = current_kernel_time(); + struct timespec64 now = current_kernel_time64(); *cbh = NULL; @@ -155,9 +155,9 @@ static int journal_submit_commit_record(journal_t *journal, if (journal->j_flags & JBD2_BARRIER && !jbd2_has_feature_async_commit(journal)) - ret = submit_bh(WRITE_SYNC | WRITE_FLUSH_FUA, bh); + ret = submit_bh(REQ_OP_WRITE, WRITE_SYNC | WRITE_FLUSH_FUA, bh); else - ret = submit_bh(WRITE_SYNC, bh); + ret = submit_bh(REQ_OP_WRITE, WRITE_SYNC, bh); *cbh = bh; return ret; @@ -219,6 +219,8 @@ static int journal_submit_data_buffers(journal_t *journal, spin_lock(&journal->j_list_lock); list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { + if (!(jinode->i_flags & JI_WRITE_DATA)) + continue; mapping = jinode->i_vfs_inode->i_mapping; jinode->i_flags |= JI_COMMIT_RUNNING; spin_unlock(&journal->j_list_lock); @@ -256,6 +258,8 @@ static int journal_finish_inode_data_buffers(journal_t *journal, /* For locking, see the comment in journal_submit_data_buffers() */ spin_lock(&journal->j_list_lock); list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { + if (!(jinode->i_flags & JI_WAIT_DATA)) + continue; jinode->i_flags |= JI_COMMIT_RUNNING; spin_unlock(&journal->j_list_lock); err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping); @@ -714,7 +718,7 @@ start_journal_io: clear_buffer_dirty(bh); set_buffer_uptodate(bh); bh->b_end_io = journal_end_buffer_io_sync; - submit_bh(WRITE_SYNC, bh); + submit_bh(REQ_OP_WRITE, WRITE_SYNC, bh); } cond_resched(); stats.run.rs_blocks_logged += bufs; diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 435f0b26ac20..46261a6f902d 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -94,7 +94,8 @@ EXPORT_SYMBOL(jbd2_journal_blocks_per_page); EXPORT_SYMBOL(jbd2_journal_invalidatepage); EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers); EXPORT_SYMBOL(jbd2_journal_force_commit); -EXPORT_SYMBOL(jbd2_journal_file_inode); +EXPORT_SYMBOL(jbd2_journal_inode_add_write); +EXPORT_SYMBOL(jbd2_journal_inode_add_wait); EXPORT_SYMBOL(jbd2_journal_init_jbd_inode); EXPORT_SYMBOL(jbd2_journal_release_jbd_inode); EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate); @@ -690,6 +691,7 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid) { int err = 0; + jbd2_might_wait_for_commit(journal); read_lock(&journal->j_state_lock); #ifdef CONFIG_JBD2_DEBUG if (!tid_geq(journal->j_commit_request, tid)) { @@ -1090,6 +1092,7 @@ static void jbd2_stats_proc_exit(journal_t *journal) static journal_t * journal_init_common (void) { + 
static struct lock_class_key jbd2_trans_commit_key; journal_t *journal; int err; @@ -1125,6 +1128,9 @@ static journal_t * journal_init_common (void) spin_lock_init(&journal->j_history_lock); + lockdep_init_map(&journal->j_trans_commit_map, "jbd2_handle", + &jbd2_trans_commit_key, 0); + return journal; } @@ -1345,15 +1351,15 @@ static int journal_reset(journal_t *journal) return jbd2_journal_start_thread(journal); } -static int jbd2_write_superblock(journal_t *journal, int write_op) +static int jbd2_write_superblock(journal_t *journal, int write_flags) { struct buffer_head *bh = journal->j_sb_buffer; journal_superblock_t *sb = journal->j_superblock; int ret; - trace_jbd2_write_superblock(journal, write_op); + trace_jbd2_write_superblock(journal, write_flags); if (!(journal->j_flags & JBD2_BARRIER)) - write_op &= ~(REQ_FUA | REQ_FLUSH); + write_flags &= ~(REQ_FUA | REQ_PREFLUSH); lock_buffer(bh); if (buffer_write_io_error(bh)) { /* @@ -1373,7 +1379,7 @@ static int jbd2_write_superblock(journal_t *journal, int write_op) jbd2_superblock_csum_set(journal, sb); get_bh(bh); bh->b_end_io = end_buffer_write_sync; - ret = submit_bh(write_op, bh); + ret = submit_bh(REQ_OP_WRITE, write_flags, bh); wait_on_buffer(bh); if (buffer_write_io_error(bh)) { clear_buffer_write_io_error(bh); @@ -1497,7 +1503,7 @@ static int journal_get_superblock(journal_t *journal) J_ASSERT(bh != NULL); if (!buffer_uptodate(bh)) { - ll_rw_block(READ, 1, &bh); + ll_rw_block(REQ_OP_READ, 0, 1, &bh); wait_on_buffer(bh); if (!buffer_uptodate(bh)) { printk(KERN_ERR @@ -2328,18 +2334,10 @@ void *jbd2_alloc(size_t size, gfp_t flags) BUG_ON(size & (size-1)); /* Must be a power of 2 */ - flags |= __GFP_REPEAT; - if (size == PAGE_SIZE) - ptr = (void *)__get_free_pages(flags, 0); - else if (size > PAGE_SIZE) { - int order = get_order(size); - - if (order < 3) - ptr = (void *)__get_free_pages(flags, order); - else - ptr = vmalloc(size); - } else + if (size < PAGE_SIZE) ptr = kmem_cache_alloc(get_slab(size), flags); + else + ptr = (void *)__get_free_pages(flags, get_order(size)); /* Check alignment; SLUB has gotten this wrong in the past, * and this can lead to user data corruption! 
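* (With the simplified scheme above, sub-page sizes come from dedicated power-of-two caches and everything else from the page allocator, so both paths are expected to hand back naturally aligned memory.)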
*/ @@ -2350,20 +2348,10 @@ void *jbd2_alloc(size_t size, gfp_t flags) void jbd2_free(void *ptr, size_t size) { - if (size == PAGE_SIZE) { - free_pages((unsigned long)ptr, 0); - return; - } - if (size > PAGE_SIZE) { - int order = get_order(size); - - if (order < 3) - free_pages((unsigned long)ptr, order); - else - vfree(ptr); - return; - } - kmem_cache_free(get_slab(size), ptr); + if (size < PAGE_SIZE) + kmem_cache_free(get_slab(size), ptr); + else + free_pages((unsigned long)ptr, get_order(size)); }; /* diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index 805bc6bcd8ab..02dd3360cb20 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -104,7 +104,7 @@ static int do_readahead(journal_t *journal, unsigned int start) if (!buffer_uptodate(bh) && !buffer_locked(bh)) { bufs[nbufs++] = bh; if (nbufs == MAXBUF) { - ll_rw_block(READ, nbufs, bufs); + ll_rw_block(REQ_OP_READ, 0, nbufs, bufs); journal_brelse_array(bufs, nbufs); nbufs = 0; } @@ -113,7 +113,7 @@ static int do_readahead(journal_t *journal, unsigned int start) } if (nbufs) - ll_rw_block(READ, nbufs, bufs); + ll_rw_block(REQ_OP_READ, 0, nbufs, bufs); err = 0; failed: diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 2c56c3e32194..b5bc3e249163 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -182,6 +182,8 @@ static int add_transaction_credits(journal_t *journal, int blocks, int needed; int total = blocks + rsv_blocks; + jbd2_might_wait_for_commit(journal); + /* * If the current transaction is locked down for commit, wait * for the lock to be released. @@ -382,13 +384,11 @@ repeat: read_unlock(&journal->j_state_lock); current->journal_info = handle; - lock_map_acquire(&handle->h_lockdep_map); + rwsem_acquire_read(&journal->j_trans_commit_map, 0, 0, _THIS_IP_); jbd2_journal_free_transaction(new_transaction); return 0; } -static struct lock_class_key jbd2_handle_key; - /* Allocate a new handle. This should probably be in a slab... 
*/ static handle_t *new_handle(int nblocks) { @@ -398,9 +398,6 @@ static handle_t *new_handle(int nblocks) handle->h_buffer_credits = nblocks; handle->h_ref = 1; - lockdep_init_map(&handle->h_lockdep_map, "jbd2_handle", - &jbd2_handle_key, 0); - return handle; } @@ -672,7 +669,7 @@ int jbd2__journal_restart(handle_t *handle, int nblocks, gfp_t gfp_mask) if (need_to_start) jbd2_log_start_commit(journal, tid); - lock_map_release(&handle->h_lockdep_map); + rwsem_release(&journal->j_trans_commit_map, 1, _THIS_IP_); handle->h_buffer_credits = nblocks; ret = start_this_handle(journal, handle, gfp_mask); return ret; @@ -700,6 +697,8 @@ void jbd2_journal_lock_updates(journal_t *journal) { DEFINE_WAIT(wait); + jbd2_might_wait_for_commit(journal); + write_lock(&journal->j_state_lock); ++journal->j_barrier_count; @@ -1750,11 +1749,11 @@ int jbd2_journal_stop(handle_t *handle) wake_up(&journal->j_wait_transaction_locked); } + rwsem_release(&journal->j_trans_commit_map, 1, _THIS_IP_); + if (wait_for_commit) err = jbd2_log_wait_commit(journal, tid); - lock_map_release(&handle->h_lockdep_map); - if (handle->h_rsv_handle) jbd2_journal_free_reserved(handle->h_rsv_handle); free_and_exit: @@ -2462,7 +2461,8 @@ void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh) /* * File inode in the inode list of the handle's transaction */ -int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode) +static int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode, + unsigned long flags) { transaction_t *transaction = handle->h_transaction; journal_t *journal; @@ -2487,12 +2487,14 @@ int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode) * and if jinode->i_next_transaction == transaction, commit code * will only file the inode where we want it. */ - if (jinode->i_transaction == transaction || - jinode->i_next_transaction == transaction) + if ((jinode->i_transaction == transaction || + jinode->i_next_transaction == transaction) && + (jinode->i_flags & flags) == flags) return 0; spin_lock(&journal->j_list_lock); - + jinode->i_flags |= flags; + /* Is inode already attached where we need it? */ if (jinode->i_transaction == transaction || jinode->i_next_transaction == transaction) goto done; @@ -2523,6 +2525,17 @@ done: return 0; } +int jbd2_journal_inode_add_write(handle_t *handle, struct jbd2_inode *jinode) +{ + return jbd2_journal_file_inode(handle, jinode, + JI_WRITE_DATA | JI_WAIT_DATA); +} + +int jbd2_journal_inode_add_wait(handle_t *handle, struct jbd2_inode *jinode) +{ + return jbd2_journal_file_inode(handle, jinode, JI_WAIT_DATA); +} + /* * File truncate and transaction commit interact with each other in a * non-trivial way. If a transaction writing data block A is diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index 84c4bf3631a2..30eb33ff8189 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -81,6 +81,7 @@ static struct dentry *jffs2_lookup(struct inode *dir_i, struct dentry *target, struct jffs2_full_dirent *fd = NULL, *fd_list; uint32_t ino = 0; struct inode *inode = NULL; + unsigned int nhash; jffs2_dbg(1, "jffs2_lookup()\n"); @@ -89,11 +90,14 @@ static struct dentry *jffs2_lookup(struct inode *dir_i, struct dentry *target, dir_f = JFFS2_INODE_INFO(dir_i); + /* The 'nhash' on the fd_list is not the same as the dentry hash */ + nhash = full_name_hash(NULL, target->d_name.name, target->d_name.len); + mutex_lock(&dir_f->sem); /* NB: The 2.2 backport will need to explicitly check for '.' and '..' 
here */ - for (fd_list = dir_f->dents; fd_list && fd_list->nhash <= target->d_name.hash; fd_list = fd_list->next) { - if (fd_list->nhash == target->d_name.hash && + for (fd_list = dir_f->dents; fd_list && fd_list->nhash <= nhash; fd_list = fd_list->next) { + if (fd_list->nhash == nhash && (!fd || fd_list->version > fd->version) && strlen(fd_list->name) == target->d_name.len && !strncmp(fd_list->name, target->d_name.name, target->d_name.len)) { diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c index bfebbf13698c..06a71dbd4833 100644 --- a/fs/jffs2/readinode.c +++ b/fs/jffs2/readinode.c @@ -674,7 +674,7 @@ static inline int read_direntry(struct jffs2_sb_info *c, struct jffs2_raw_node_r } } - fd->nhash = full_name_hash(fd->name, rd->nsize); + fd->nhash = full_name_hash(NULL, fd->name, rd->nsize); fd->next = NULL; fd->name[rd->nsize] = '\0'; diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c index 9ad5ba4b299b..90431dd613b8 100644 --- a/fs/jffs2/scan.c +++ b/fs/jffs2/scan.c @@ -1100,7 +1100,7 @@ static int jffs2_scan_dirent_node(struct jffs2_sb_info *c, struct jffs2_eraseblo fd->next = NULL; fd->version = je32_to_cpu(rd->version); fd->ino = je32_to_cpu(rd->ino); - fd->nhash = full_name_hash(fd->name, checkedlen); + fd->nhash = full_name_hash(NULL, fd->name, checkedlen); fd->type = rd->type; jffs2_add_fd_to_list(c, fd, &ic->scan_dents); diff --git a/fs/jffs2/security.c b/fs/jffs2/security.c index 3ed9a4b49778..c2332e30f218 100644 --- a/fs/jffs2/security.c +++ b/fs/jffs2/security.c @@ -57,10 +57,11 @@ static int jffs2_security_getxattr(const struct xattr_handler *handler, } static int jffs2_security_setxattr(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *buffer, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *buffer, + size_t size, int flags) { - return do_jffs2_setxattr(d_inode(dentry), JFFS2_XPREFIX_SECURITY, + return do_jffs2_setxattr(inode, JFFS2_XPREFIX_SECURITY, name, buffer, size, flags); } diff --git a/fs/jffs2/summary.c b/fs/jffs2/summary.c index bc5385471a6e..be7c8a6a5748 100644 --- a/fs/jffs2/summary.c +++ b/fs/jffs2/summary.c @@ -476,7 +476,7 @@ static int jffs2_sum_process_sum_data(struct jffs2_sb_info *c, struct jffs2_eras fd->next = NULL; fd->version = je32_to_cpu(spd->version); fd->ino = je32_to_cpu(spd->ino); - fd->nhash = full_name_hash(fd->name, checkedlen); + fd->nhash = full_name_hash(NULL, fd->name, checkedlen); fd->type = spd->type; jffs2_add_fd_to_list(c, fd, &ic->scan_dents); diff --git a/fs/jffs2/write.c b/fs/jffs2/write.c index 7fb187ab2682..cda9a361368e 100644 --- a/fs/jffs2/write.c +++ b/fs/jffs2/write.c @@ -245,7 +245,7 @@ struct jffs2_full_dirent *jffs2_write_dirent(struct jffs2_sb_info *c, struct jff fd->version = je32_to_cpu(rd->version); fd->ino = je32_to_cpu(rd->ino); - fd->nhash = full_name_hash(name, namelen); + fd->nhash = full_name_hash(NULL, name, namelen); fd->type = rd->type; memcpy(fd->name, name, namelen); fd->name[namelen]=0; @@ -598,7 +598,7 @@ int jffs2_do_unlink(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, jffs2_add_fd_to_list(c, fd, &dir_f->dents); mutex_unlock(&dir_f->sem); } else { - uint32_t nhash = full_name_hash(name, namelen); + uint32_t nhash = full_name_hash(NULL, name, namelen); fd = dir_f->dents; /* We don't actually want to reserve any space, but we do diff --git a/fs/jffs2/xattr_trusted.c b/fs/jffs2/xattr_trusted.c index 4ebecff1d922..5d6030826c52 100644 --- a/fs/jffs2/xattr_trusted.c +++ b/fs/jffs2/xattr_trusted.c @@ 
-25,10 +25,11 @@ static int jffs2_trusted_getxattr(const struct xattr_handler *handler, } static int jffs2_trusted_setxattr(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *buffer, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *buffer, + size_t size, int flags) { - return do_jffs2_setxattr(d_inode(dentry), JFFS2_XPREFIX_TRUSTED, + return do_jffs2_setxattr(inode, JFFS2_XPREFIX_TRUSTED, name, buffer, size, flags); } diff --git a/fs/jffs2/xattr_user.c b/fs/jffs2/xattr_user.c index bce249e1b277..9d027b4abcf9 100644 --- a/fs/jffs2/xattr_user.c +++ b/fs/jffs2/xattr_user.c @@ -25,10 +25,11 @@ static int jffs2_user_getxattr(const struct xattr_handler *handler, } static int jffs2_user_setxattr(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *buffer, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *buffer, + size_t size, int flags) { - return do_jffs2_setxattr(d_inode(dentry), JFFS2_XPREFIX_USER, + return do_jffs2_setxattr(inode, JFFS2_XPREFIX_USER, name, buffer, size, flags); } diff --git a/fs/jfs/jfs_debug.c b/fs/jfs/jfs_debug.c index dd824d9b0b1a..a37eb5f8cbc0 100644 --- a/fs/jfs/jfs_debug.c +++ b/fs/jfs/jfs_debug.c @@ -58,7 +58,6 @@ static ssize_t jfs_loglevel_proc_write(struct file *file, } static const struct file_operations jfs_loglevel_proc_fops = { - .owner = THIS_MODULE, .open = jfs_loglevel_proc_open, .read = seq_read, .llseek = seq_lseek, diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index 63759d723920..a21ea8b3e5fa 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -2002,12 +2002,13 @@ static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp) bio->bi_end_io = lbmIODone; bio->bi_private = bp; + bio_set_op_attrs(bio, REQ_OP_READ, READ_SYNC); /*check if journaling to disk has been disabled*/ if (log->no_integrity) { bio->bi_iter.bi_size = 0; lbmIODone(bio); } else { - submit_bio(READ_SYNC, bio); + submit_bio(bio); } wait_event(bp->l_ioevent, (bp->l_flag != lbmREAD)); @@ -2145,13 +2146,14 @@ static void lbmStartIO(struct lbuf * bp) bio->bi_end_io = lbmIODone; bio->bi_private = bp; + bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_SYNC); /* check if journaling to disk has been disabled */ if (log->no_integrity) { bio->bi_iter.bi_size = 0; lbmIODone(bio); } else { - submit_bio(WRITE_SYNC, bio); + submit_bio(bio); INCREMENT(lmStat.submitted); } } @@ -2515,7 +2517,6 @@ static int jfs_lmstats_proc_open(struct inode *inode, struct file *file) } const struct file_operations jfs_lmstats_proc_fops = { - .owner = THIS_MODULE, .open = jfs_lmstats_proc_open, .read = seq_read, .llseek = seq_lseek, diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c index b60e015cc757..489aaa1403e5 100644 --- a/fs/jfs/jfs_metapage.c +++ b/fs/jfs/jfs_metapage.c @@ -411,7 +411,7 @@ static int metapage_writepage(struct page *page, struct writeback_control *wbc) inc_io(page); if (!bio->bi_iter.bi_size) goto dump_bio; - submit_bio(WRITE, bio); + submit_bio(bio); nr_underway++; bio = NULL; } else @@ -434,6 +434,7 @@ static int metapage_writepage(struct page *page, struct writeback_control *wbc) bio->bi_iter.bi_sector = pblock << (inode->i_blkbits - 9); bio->bi_end_io = metapage_write_end_io; bio->bi_private = page; + bio_set_op_attrs(bio, REQ_OP_WRITE, 0); /* Don't call bio_add_page yet, we may add to this vec */ bio_offset = offset; @@ -448,7 +449,7 @@ static int metapage_writepage(struct page *page, 
struct writeback_control *wbc) if (!bio->bi_iter.bi_size) goto dump_bio; - submit_bio(WRITE, bio); + submit_bio(bio); nr_underway++; } if (redirty) @@ -506,7 +507,7 @@ static int metapage_readpage(struct file *fp, struct page *page) insert_metapage(page, NULL); inc_io(page); if (bio) - submit_bio(READ, bio); + submit_bio(bio); bio = bio_alloc(GFP_NOFS, 1); bio->bi_bdev = inode->i_sb->s_bdev; @@ -514,6 +515,7 @@ static int metapage_readpage(struct file *fp, struct page *page) pblock << (inode->i_blkbits - 9); bio->bi_end_io = metapage_read_end_io; bio->bi_private = page; + bio_set_op_attrs(bio, REQ_OP_READ, 0); len = xlen << inode->i_blkbits; offset = block_offset << inode->i_blkbits; if (bio_add_page(bio, page, len, offset) < len) @@ -523,7 +525,7 @@ static int metapage_readpage(struct file *fp, struct page *page) block_offset++; } if (bio) - submit_bio(READ, bio); + submit_bio(bio); else unlock_page(page); @@ -828,7 +830,6 @@ static int jfs_mpstat_proc_open(struct inode *inode, struct file *file) } const struct file_operations jfs_mpstat_proc_fops = { - .owner = THIS_MODULE, .open = jfs_mpstat_proc_open, .read = seq_read, .llseek = seq_lseek, diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c index eddf2b6eda85..2e58978d6f45 100644 --- a/fs/jfs/jfs_txnmgr.c +++ b/fs/jfs/jfs_txnmgr.c @@ -3040,7 +3040,6 @@ static int jfs_txanchor_proc_open(struct inode *inode, struct file *file) } const struct file_operations jfs_txanchor_proc_fops = { - .owner = THIS_MODULE, .open = jfs_txanchor_proc_open, .read = seq_read, .llseek = seq_lseek, @@ -3081,7 +3080,6 @@ static int jfs_txstats_proc_open(struct inode *inode, struct file *file) } const struct file_operations jfs_txstats_proc_fops = { - .owner = THIS_MODULE, .open = jfs_txstats_proc_open, .read = seq_read, .llseek = seq_lseek, diff --git a/fs/jfs/jfs_xtree.c b/fs/jfs/jfs_xtree.c index 5ad7748860ce..5cde6d2fcfca 100644 --- a/fs/jfs/jfs_xtree.c +++ b/fs/jfs/jfs_xtree.c @@ -3894,7 +3894,6 @@ static int jfs_xtstat_proc_open(struct inode *inode, struct file *file) } const struct file_operations jfs_xtstat_proc_fops = { - .owner = THIS_MODULE, .open = jfs_xtstat_proc_open, .read = seq_read, .llseek = seq_lseek, diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index 539deddecbb0..04baf0dfc40c 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -1564,7 +1564,7 @@ static int jfs_ci_hash(const struct dentry *dir, struct qstr *this) unsigned long hash; int i; - hash = init_name_hash(); + hash = init_name_hash(dir); for (i=0; i < this->len; i++) hash = partial_name_hash(tolower(this->name[i]), hash); this->hash = end_name_hash(hash); diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c index beb182b503b3..0bf3c33aedff 100644 --- a/fs/jfs/xattr.c +++ b/fs/jfs/xattr.c @@ -943,11 +943,10 @@ static int jfs_xattr_get(const struct xattr_handler *handler, } static int jfs_xattr_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { - struct inode *inode = d_inode(dentry); - name = xattr_full_name(handler, name); return __jfs_xattr_set(inode, name, value, size, flags); } @@ -962,11 +961,10 @@ static int jfs_xattr_get_os2(const struct xattr_handler *handler, } static int jfs_xattr_set_os2(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void 
*value, + size_t size, int flags) { - struct inode *inode = d_inode(dentry); - if (is_known_namespace(name)) return -EOPNOTSUPP; return __jfs_xattr_set(inode, name, value, size, flags); diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 68a44315d949..e57174d43683 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -336,11 +336,11 @@ struct kernfs_node *kernfs_get_parent(struct kernfs_node *kn) */ static unsigned int kernfs_name_hash(const char *name, const void *ns) { - unsigned long hash = init_name_hash(); + unsigned long hash = init_name_hash(ns); unsigned int len = strlen(name); while (len--) hash = partial_name_hash(*name++, hash); - hash = (end_name_hash(hash) ^ hash_ptr((void *)ns, 31)); + hash = end_name_hash(hash); hash &= 0x7fffffffU; /* Reserve hash numbers 0, 1 and INT_MAX for magic directory entries */ if (hash < 2) @@ -753,7 +753,8 @@ int kernfs_add_one(struct kernfs_node *kn) ps_iattr = parent->iattr; if (ps_iattr) { struct iattr *ps_iattrs = &ps_iattr->ia_iattr; - ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME; + ktime_get_real_ts(&ps_iattrs->ia_ctime); + ps_iattrs->ia_mtime = ps_iattrs->ia_ctime; } mutex_unlock(&kernfs_mutex); @@ -1279,8 +1280,9 @@ static void __kernfs_remove(struct kernfs_node *kn) /* update timestamps on the parent */ if (ps_iattr) { - ps_iattr->ia_iattr.ia_ctime = CURRENT_TIME; - ps_iattr->ia_iattr.ia_mtime = CURRENT_TIME; + ktime_get_real_ts(&ps_iattr->ia_iattr.ia_ctime); + ps_iattr->ia_iattr.ia_mtime = + ps_iattr->ia_iattr.ia_ctime; } kernfs_put(pos); diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index 7247252ee9b1..e1574008adc9 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -190,15 +190,16 @@ static ssize_t kernfs_file_direct_read(struct kernfs_open_file *of, char *buf; buf = of->prealloc_buf; - if (!buf) + if (buf) + mutex_lock(&of->prealloc_mutex); + else buf = kmalloc(len, GFP_KERNEL); if (!buf) return -ENOMEM; /* * @of->mutex nests outside active ref and is used both to ensure that - * the ops aren't called concurrently for the same open file, and - * to provide exclusive access to ->prealloc_buf (when that exists). + * the ops aren't called concurrently for the same open file. */ mutex_lock(&of->mutex); if (!kernfs_get_active(of->kn)) { @@ -214,21 +215,23 @@ static ssize_t kernfs_file_direct_read(struct kernfs_open_file *of, else len = -EINVAL; + kernfs_put_active(of->kn); + mutex_unlock(&of->mutex); + if (len < 0) - goto out_unlock; + goto out_free; if (copy_to_user(user_buf, buf, len)) { len = -EFAULT; - goto out_unlock; + goto out_free; } *ppos += len; - out_unlock: - kernfs_put_active(of->kn); - mutex_unlock(&of->mutex); out_free: - if (buf != of->prealloc_buf) + if (buf == of->prealloc_buf) + mutex_unlock(&of->prealloc_mutex); + else kfree(buf); return len; } @@ -284,15 +287,22 @@ static ssize_t kernfs_fop_write(struct file *file, const char __user *user_buf, } buf = of->prealloc_buf; - if (!buf) + if (buf) + mutex_lock(&of->prealloc_mutex); + else buf = kmalloc(len + 1, GFP_KERNEL); if (!buf) return -ENOMEM; + if (copy_from_user(buf, user_buf, len)) { + len = -EFAULT; + goto out_free; + } + buf[len] = '\0'; /* guarantee string termination */ + /* * @of->mutex nests outside active ref and is used both to ensure that - * the ops aren't called concurrently for the same open file, and - * to provide exclusive access to ->prealloc_buf (when that exists). + * the ops aren't called concurrently for the same open file. 
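+ * Exclusive access to ->prealloc_buf itself is now provided by the + * dedicated prealloc_mutex acquired before this point.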
*/ mutex_lock(&of->mutex); if (!kernfs_get_active(of->kn)) { @@ -301,26 +311,22 @@ static ssize_t kernfs_fop_write(struct file *file, const char __user *user_buf, goto out_free; } - if (copy_from_user(buf, user_buf, len)) { - len = -EFAULT; - goto out_unlock; - } - buf[len] = '\0'; /* guarantee string termination */ - ops = kernfs_ops(of->kn); if (ops->write) len = ops->write(of, buf, len, *ppos); else len = -EINVAL; + kernfs_put_active(of->kn); + mutex_unlock(&of->mutex); + if (len > 0) *ppos += len; -out_unlock: - kernfs_put_active(of->kn); - mutex_unlock(&of->mutex); out_free: - if (buf != of->prealloc_buf) + if (buf == of->prealloc_buf) + mutex_unlock(&of->prealloc_mutex); + else kfree(buf); return len; } @@ -687,6 +693,7 @@ static int kernfs_fop_open(struct inode *inode, struct file *file) error = -ENOMEM; if (!of->prealloc_buf) goto err_free; + mutex_init(&of->prealloc_mutex); } /* diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index b5247226732b..63b925d5ba1e 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -54,7 +54,10 @@ static struct kernfs_iattrs *kernfs_iattrs(struct kernfs_node *kn) iattrs->ia_mode = kn->mode; iattrs->ia_uid = GLOBAL_ROOT_UID; iattrs->ia_gid = GLOBAL_ROOT_GID; - iattrs->ia_atime = iattrs->ia_mtime = iattrs->ia_ctime = CURRENT_TIME; + + ktime_get_real_ts(&iattrs->ia_atime); + iattrs->ia_mtime = iattrs->ia_atime; + iattrs->ia_ctime = iattrs->ia_atime; simple_xattrs_init(&kn->iattr->xattrs); out_unlock: @@ -157,10 +160,11 @@ static int kernfs_node_setsecdata(struct kernfs_node *kn, void **secdata, return 0; } -int kernfs_iop_setxattr(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) +int kernfs_iop_setxattr(struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { - struct kernfs_node *kn = dentry->d_fsdata; + struct kernfs_node *kn = inode->i_private; struct kernfs_iattrs *attrs; void *secdata; int error; @@ -172,11 +176,11 @@ int kernfs_iop_setxattr(struct dentry *dentry, const char *name, if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) { const char *suffix = name + XATTR_SECURITY_PREFIX_LEN; - error = security_inode_setsecurity(d_inode(dentry), suffix, + error = security_inode_setsecurity(inode, suffix, value, size, flags); if (error) return error; - error = security_inode_getsecctx(d_inode(dentry), + error = security_inode_getsecctx(inode, &secdata, &secdata_len); if (error) return error; @@ -236,16 +240,18 @@ ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size) static inline void set_default_inode_attr(struct inode *inode, umode_t mode) { inode->i_mode = mode; - inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_atime = inode->i_mtime = + inode->i_ctime = current_fs_time(inode->i_sb); } static inline void set_inode_attr(struct inode *inode, struct iattr *iattr) { + struct super_block *sb = inode->i_sb; inode->i_uid = iattr->ia_uid; inode->i_gid = iattr->ia_gid; - inode->i_atime = iattr->ia_atime; - inode->i_mtime = iattr->ia_mtime; - inode->i_ctime = iattr->ia_ctime; + inode->i_atime = timespec_trunc(iattr->ia_atime, sb->s_time_gran); + inode->i_mtime = timespec_trunc(iattr->ia_mtime, sb->s_time_gran); + inode->i_ctime = timespec_trunc(iattr->ia_ctime, sb->s_time_gran); } static void kernfs_refresh_inode(struct kernfs_node *kn, struct inode *inode) diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index 45c9192c276e..37159235ac10 100644 --- 
a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h @@ -81,7 +81,8 @@ int kernfs_iop_permission(struct inode *inode, int mask); int kernfs_iop_setattr(struct dentry *dentry, struct iattr *iattr); int kernfs_iop_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); -int kernfs_iop_setxattr(struct dentry *dentry, const char *name, const void *value, +int kernfs_iop_setxattr(struct dentry *dentry, struct inode *inode, + const char *name, const void *value, size_t size, int flags); int kernfs_iop_removexattr(struct dentry *dentry, const char *name); ssize_t kernfs_iop_getxattr(struct dentry *dentry, struct inode *inode, diff --git a/fs/libfs.c b/fs/libfs.c index 8765ff1adc07..74dc8b9e7f53 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -71,9 +71,7 @@ EXPORT_SYMBOL(simple_lookup); int dcache_dir_open(struct inode *inode, struct file *file) { - static struct qstr cursor_name = QSTR_INIT(".", 1); - - file->private_data = d_alloc(file->f_path.dentry, &cursor_name); + file->private_data = d_alloc_cursor(file->f_path.dentry); return file->private_data ? 0 : -ENOMEM; } @@ -86,6 +84,61 @@ int dcache_dir_close(struct inode *inode, struct file *file) } EXPORT_SYMBOL(dcache_dir_close); +/* parent is locked at least shared */ +static struct dentry *next_positive(struct dentry *parent, + struct list_head *from, + int count) +{ + unsigned *seq = &parent->d_inode->i_dir_seq, n; + struct dentry *res; + struct list_head *p; + bool skipped; + int i; + +retry: + i = count; + skipped = false; + n = smp_load_acquire(seq) & ~1; + res = NULL; + rcu_read_lock(); + for (p = from->next; p != &parent->d_subdirs; p = p->next) { + struct dentry *d = list_entry(p, struct dentry, d_child); + if (!simple_positive(d)) { + skipped = true; + } else if (!--i) { + res = d; + break; + } + } + rcu_read_unlock(); + if (skipped) { + smp_rmb(); + if (unlikely(*seq != n)) + goto retry; + } + return res; +} + +static void move_cursor(struct dentry *cursor, struct list_head *after) +{ + struct dentry *parent = cursor->d_parent; + unsigned n, *seq = &parent->d_inode->i_dir_seq; + spin_lock(&parent->d_lock); + for (;;) { + n = *seq; + if (!(n & 1) && cmpxchg(seq, n, n + 1) == n) + break; + cpu_relax(); + } + __list_del(cursor->d_child.prev, cursor->d_child.next); + if (after) + list_add(&cursor->d_child, after); + else + list_add_tail(&cursor->d_child, &parent->d_subdirs); + smp_store_release(seq, n + 2); + spin_unlock(&parent->d_lock); +} + loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence) { struct dentry *dentry = file->f_path.dentry; @@ -101,25 +154,14 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence) if (offset != file->f_pos) { file->f_pos = offset; if (file->f_pos >= 2) { - struct list_head *p; struct dentry *cursor = file->private_data; + struct dentry *to; loff_t n = file->f_pos - 2; - spin_lock(&dentry->d_lock); - /* d_lock not required for cursor */ - list_del(&cursor->d_child); - p = dentry->d_subdirs.next; - while (n && p != &dentry->d_subdirs) { - struct dentry *next; - next = list_entry(p, struct dentry, d_child); - spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED); - if (simple_positive(next)) - n--; - spin_unlock(&next->d_lock); - p = p->next; - } - list_add_tail(&cursor->d_child, p); - spin_unlock(&dentry->d_lock); + inode_lock_shared(dentry->d_inode); + to = next_positive(dentry, &dentry->d_subdirs, n); + move_cursor(cursor, to ? 
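The next_positive()/move_cursor() pair above coordinates through i_dir_seq as a sequence counter: readers sample an even value, scan the child list under RCU, and retry if a concurrent cursor move bumped the counter; writers hold it odd for the duration of the relink. A standalone C11 sketch of that reader/writer protocol, using a toy array instead of a dentry list (the kernel version only retries when it actually skipped entries):

#include <stdatomic.h>
#include <stdio.h>

#define NENT 8

static _Atomic unsigned int dir_seq;  /* even = stable, odd = mutating */
static int entries[NENT];             /* toy directory: nonzero = positive */

/* Writer: make the sequence odd while changing the list, even after.
 * (move_cursor() uses a cmpxchg loop to tolerate several writers.) */
static void mutate(int idx, int val)
{
	unsigned int n = atomic_load(&dir_seq);

	atomic_store_explicit(&dir_seq, n + 1, memory_order_release);
	entries[idx] = val;
	atomic_store_explicit(&dir_seq, n + 2, memory_order_release);
}

/* Reader: sample an even sequence, scan, retry on any change.
 * next_positive() is smarter and only retries if it skipped entries. */
static int next_positive_model(int from)
{
	unsigned int n;
	int i;

retry:
	n = atomic_load_explicit(&dir_seq, memory_order_acquire) & ~1u;
	for (i = from; i < NENT; i++)
		if (entries[i])
			break;
	if (atomic_load_explicit(&dir_seq, memory_order_acquire) != n)
		goto retry;
	return i < NENT ? i : -1;
}

int main(void)
{
	mutate(3, 1);
	printf("first positive entry: %d\n", next_positive_model(0));
	return 0;
}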
&to->d_child : NULL); + inode_unlock_shared(dentry->d_inode); } } return offset; @@ -142,36 +184,25 @@ int dcache_readdir(struct file *file, struct dir_context *ctx) { struct dentry *dentry = file->f_path.dentry; struct dentry *cursor = file->private_data; - struct list_head *p, *q = &cursor->d_child; + struct list_head *p = &cursor->d_child; + struct dentry *next; + bool moved = false; if (!dir_emit_dots(file, ctx)) return 0; - spin_lock(&dentry->d_lock); - if (ctx->pos == 2) - list_move(q, &dentry->d_subdirs); - for (p = q->next; p != &dentry->d_subdirs; p = p->next) { - struct dentry *next = list_entry(p, struct dentry, d_child); - spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED); - if (!simple_positive(next)) { - spin_unlock(&next->d_lock); - continue; - } - - spin_unlock(&next->d_lock); - spin_unlock(&dentry->d_lock); + if (ctx->pos == 2) + p = &dentry->d_subdirs; + while ((next = next_positive(dentry, p, 1)) != NULL) { if (!dir_emit(ctx, next->d_name.name, next->d_name.len, d_inode(next)->i_ino, dt_type(d_inode(next)))) - return 0; - spin_lock(&dentry->d_lock); - spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED); - /* next is still alive */ - list_move(q, p); - spin_unlock(&next->d_lock); - p = q; + break; + moved = true; + p = &next->d_child; ctx->pos++; } - spin_unlock(&dentry->d_lock); + if (moved) + move_cursor(cursor, p); return 0; } EXPORT_SYMBOL(dcache_readdir); @@ -1118,8 +1149,9 @@ static int empty_dir_setattr(struct dentry *dentry, struct iattr *attr) return -EPERM; } -static int empty_dir_setxattr(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) +static int empty_dir_setxattr(struct dentry *dentry, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { return -EOPNOTSUPP; } diff --git a/fs/lockd/procfs.c b/fs/lockd/procfs.c index 2a0a98480e39..8f72cb237ef3 100644 --- a/fs/lockd/procfs.c +++ b/fs/lockd/procfs.c @@ -64,7 +64,6 @@ static const struct file_operations lockd_end_grace_operations = { .read = nlm_end_grace_read, .llseek = default_llseek, .release = simple_transaction_release, - .owner = THIS_MODULE, }; int __init diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 154a107cd376..fc4084ef4736 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -335,12 +335,17 @@ static struct notifier_block lockd_inet6addr_notifier = { }; #endif -static void lockd_svc_exit_thread(void) +static void lockd_unregister_notifiers(void) { unregister_inetaddr_notifier(&lockd_inetaddr_notifier); #if IS_ENABLED(CONFIG_IPV6) unregister_inet6addr_notifier(&lockd_inet6addr_notifier); #endif +} + +static void lockd_svc_exit_thread(void) +{ + lockd_unregister_notifiers(); svc_exit_thread(nlmsvc_rqst); } @@ -462,7 +467,7 @@ int lockd_up(struct net *net) * Note: svc_serv structures have an initial use count of 1, * so we exit through here on both success and failure. 
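The lockd_up() change that follows fixes an error path that previously returned with the address notifiers still registered; factoring out lockd_unregister_notifiers() lets the shutdown path and the failure path share the teardown. A compilable sketch of the resulting unwind pattern, with hypothetical stand-ins for the real setup and teardown calls:

#include <stdio.h>

/* Hypothetical stand-ins for the real steps in lockd_up(). */
static int register_notifiers(void) { return 0; }
static void unregister_notifiers(void) { puts("notifiers unregistered"); }
static int create_service(void) { return 0; }
static void destroy_service(void) { puts("service destroyed"); }
static int start_service(void) { return -1; }  /* pretend startup fails */
static void down_net(void) { puts("per-net state torn down"); }

static int bring_up(void)
{
	int err;

	err = register_notifiers();
	if (err)
		return err;
	err = create_service();
	if (err) {
		unregister_notifiers();
		return err;
	}
	err = start_service();
	if (err) {
		down_net();               /* like lockd_down_net() */
		unregister_notifiers();   /* the call the fix adds here */
		destroy_service();        /* like err_put: svc_destroy() */
		return err;
	}
	return 0;
}

int main(void)
{
	return bring_up() ? 1 : 0;
}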
*/ -err_net: +err_put: svc_destroy(serv); err_create: mutex_unlock(&nlmsvc_mutex); @@ -470,7 +475,9 @@ err_create: err_start: lockd_down_net(serv, net); - goto err_net; +err_net: + lockd_unregister_notifiers(); + goto err_put; } EXPORT_SYMBOL_GPL(lockd_up); diff --git a/fs/locks.c b/fs/locks.c index 7c5f91be9b65..ee1b15f6fc13 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1628,7 +1628,7 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr { struct file_lock *fl, *my_fl = NULL, *lease; struct dentry *dentry = filp->f_path.dentry; - struct inode *inode = dentry->d_inode; + struct inode *inode = file_inode(filp); struct file_lock_context *ctx; bool is_deleg = (*flp)->fl_flags & FL_DELEG; int error; diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c index cc26f8f215f5..a8329cc47dec 100644 --- a/fs/logfs/dev_bdev.c +++ b/fs/logfs/dev_bdev.c @@ -14,7 +14,7 @@ #define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1)) -static int sync_request(struct page *page, struct block_device *bdev, int rw) +static int sync_request(struct page *page, struct block_device *bdev, int op) { struct bio bio; struct bio_vec bio_vec; @@ -29,8 +29,9 @@ static int sync_request(struct page *page, struct block_device *bdev, int rw) bio.bi_bdev = bdev; bio.bi_iter.bi_sector = page->index * (PAGE_SIZE >> 9); bio.bi_iter.bi_size = PAGE_SIZE; + bio_set_op_attrs(&bio, op, 0); - return submit_bio_wait(rw, &bio); + return submit_bio_wait(&bio); } static int bdev_readpage(void *_sb, struct page *page) @@ -95,8 +96,9 @@ static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index, bio->bi_iter.bi_sector = ofs >> 9; bio->bi_private = sb; bio->bi_end_io = writeseg_end_io; + bio_set_op_attrs(bio, REQ_OP_WRITE, 0); atomic_inc(&super->s_pending_writes); - submit_bio(WRITE, bio); + submit_bio(bio); ofs += i * PAGE_SIZE; index += i; @@ -122,8 +124,9 @@ static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index, bio->bi_iter.bi_sector = ofs >> 9; bio->bi_private = sb; bio->bi_end_io = writeseg_end_io; + bio_set_op_attrs(bio, REQ_OP_WRITE, 0); atomic_inc(&super->s_pending_writes); - submit_bio(WRITE, bio); + submit_bio(bio); return 0; } @@ -185,8 +188,9 @@ static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index, bio->bi_iter.bi_sector = ofs >> 9; bio->bi_private = sb; bio->bi_end_io = erase_end_io; + bio_set_op_attrs(bio, REQ_OP_WRITE, 0); atomic_inc(&super->s_pending_writes); - submit_bio(WRITE, bio); + submit_bio(bio); ofs += i * PAGE_SIZE; index += i; @@ -206,8 +210,9 @@ static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index, bio->bi_iter.bi_sector = ofs >> 9; bio->bi_private = sb; bio->bi_end_io = erase_end_io; + bio_set_op_attrs(bio, REQ_OP_WRITE, 0); atomic_inc(&super->s_pending_writes); - submit_bio(WRITE, bio); + submit_bio(bio); return 0; } diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c index 2d5336bd4efd..bcd754d216bd 100644 --- a/fs/logfs/dir.c +++ b/fs/logfs/dir.c @@ -95,7 +95,7 @@ static int beyond_eof(struct inode *inode, loff_t bix) * of each character and pick a prime nearby, preferably a bit-sparse * one. 
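The logfs rename that follows is purely about namespacing: <linux/hash.h> now supplies a generic hash_32(), so logfs's private string hash takes a logfs_ prefix to avoid the collision. Its body is not shown in this hunk; the sketch below is only a representative seeded multiply-and-add string hash of the shape the comment describes, with a stand-in multiplier rather than the constant logfs actually uses:

#include <stdint.h>
#include <stdio.h>

/* Illustrative only: a seeded multiply-and-add string hash. The
 * multiplier is a made-up constant, not the one logfs uses. */
static uint32_t demo_hash_32(const char *s, int len, uint32_t seed)
{
	uint32_t hash = seed;
	int i;

	for (i = 0; i < len; i++)
		hash = hash * 0x01000193u + (unsigned char)s[i];
	return hash;
}

int main(void)
{
	printf("%08x\n", demo_hash_32("dirent", 6, 0));
	return 0;
}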
*/ -static u32 hash_32(const char *s, int len, u32 seed) +static u32 logfs_hash_32(const char *s, int len, u32 seed) { u32 hash = seed; int i; @@ -159,7 +159,7 @@ static struct page *logfs_get_dd_page(struct inode *dir, struct dentry *dentry) struct qstr *name = &dentry->d_name; struct page *page; struct logfs_disk_dentry *dd; - u32 hash = hash_32(name->name, name->len, 0); + u32 hash = logfs_hash_32(name->name, name->len, 0); pgoff_t index; int round; @@ -370,7 +370,7 @@ static int logfs_write_dir(struct inode *dir, struct dentry *dentry, { struct page *page; struct logfs_disk_dentry *dd; - u32 hash = hash_32(dentry->d_name.name, dentry->d_name.len, 0); + u32 hash = logfs_hash_32(dentry->d_name.name, dentry->d_name.len, 0); pgoff_t index; int round, err; diff --git a/fs/mpage.c b/fs/mpage.c index eedc644b78d7..2ca1f39c8cba 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -56,11 +56,12 @@ static void mpage_end_io(struct bio *bio) bio_put(bio); } -static struct bio *mpage_bio_submit(int rw, struct bio *bio) +static struct bio *mpage_bio_submit(int op, int op_flags, struct bio *bio) { bio->bi_end_io = mpage_end_io; - guard_bio_eod(rw, bio); - submit_bio(rw, bio); + bio_set_op_attrs(bio, op, op_flags); + guard_bio_eod(op, bio); + submit_bio(bio); return NULL; } @@ -71,6 +72,8 @@ mpage_alloc(struct block_device *bdev, { struct bio *bio; + /* Restrict the given (page cache) mask for slab allocations */ + gfp_flags &= GFP_KERNEL; bio = bio_alloc(gfp_flags, nr_vecs); if (bio == NULL && (current->flags & PF_MEMALLOC)) { @@ -269,7 +272,7 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages, * This page will go to BIO. Do we need to send this BIO off first? */ if (bio && (*last_block_in_bio != blocks[0] - 1)) - bio = mpage_bio_submit(READ, bio); + bio = mpage_bio_submit(REQ_OP_READ, 0, bio); alloc_new: if (bio == NULL) { @@ -286,7 +289,7 @@ alloc_new: length = first_hole << blkbits; if (bio_add_page(bio, page, length, 0) < length) { - bio = mpage_bio_submit(READ, bio); + bio = mpage_bio_submit(REQ_OP_READ, 0, bio); goto alloc_new; } @@ -294,7 +297,7 @@ alloc_new: nblocks = map_bh->b_size >> blkbits; if ((buffer_boundary(map_bh) && relative_block == nblocks) || (first_hole != blocks_per_page)) - bio = mpage_bio_submit(READ, bio); + bio = mpage_bio_submit(REQ_OP_READ, 0, bio); else *last_block_in_bio = blocks[blocks_per_page - 1]; out: @@ -302,7 +305,7 @@ out: confused: if (bio) - bio = mpage_bio_submit(READ, bio); + bio = mpage_bio_submit(REQ_OP_READ, 0, bio); if (!PageUptodate(page)) block_read_full_page(page, get_block); else @@ -362,7 +365,7 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages, sector_t last_block_in_bio = 0; struct buffer_head map_bh; unsigned long first_logical_block = 0; - gfp_t gfp = mapping_gfp_constraint(mapping, GFP_KERNEL); + gfp_t gfp = readahead_gfp_mask(mapping); map_bh.b_state = 0; map_bh.b_size = 0; @@ -384,7 +387,7 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages, } BUG_ON(!list_empty(pages)); if (bio) - mpage_bio_submit(READ, bio); + mpage_bio_submit(REQ_OP_READ, 0, bio); return 0; } EXPORT_SYMBOL(mpage_readpages); @@ -405,7 +408,7 @@ int mpage_readpage(struct page *page, get_block_t get_block) bio = do_mpage_readpage(bio, page, 1, &last_block_in_bio, &map_bh, &first_logical_block, get_block, gfp); if (bio) - mpage_bio_submit(READ, bio); + mpage_bio_submit(REQ_OP_READ, 0, bio); return 0; } EXPORT_SYMBOL(mpage_readpage); @@ -486,7 +489,7 @@ static int __mpage_writepage(struct page *page, struct 
writeback_control *wbc, struct buffer_head map_bh; loff_t i_size = i_size_read(inode); int ret = 0; - int wr = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE); + int op_flags = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : 0); if (page_has_buffers(page)) { struct buffer_head *head = page_buffers(page); @@ -595,7 +598,7 @@ page_is_mapped: * This page will go to BIO. Do we need to send this BIO off first? */ if (bio && mpd->last_block_in_bio != blocks[0] - 1) - bio = mpage_bio_submit(wr, bio); + bio = mpage_bio_submit(REQ_OP_WRITE, op_flags, bio); alloc_new: if (bio == NULL) { @@ -622,7 +625,7 @@ alloc_new: wbc_account_io(wbc, page, PAGE_SIZE); length = first_unmapped << blkbits; if (bio_add_page(bio, page, length, 0) < length) { - bio = mpage_bio_submit(wr, bio); + bio = mpage_bio_submit(REQ_OP_WRITE, op_flags, bio); goto alloc_new; } @@ -632,7 +635,7 @@ alloc_new: set_page_writeback(page); unlock_page(page); if (boundary || (first_unmapped != blocks_per_page)) { - bio = mpage_bio_submit(wr, bio); + bio = mpage_bio_submit(REQ_OP_WRITE, op_flags, bio); if (boundary_block) { write_boundary_block(boundary_bdev, boundary_block, 1 << blkbits); @@ -644,7 +647,7 @@ alloc_new: confused: if (bio) - bio = mpage_bio_submit(wr, bio); + bio = mpage_bio_submit(REQ_OP_WRITE, op_flags, bio); if (mpd->use_writepage) { ret = mapping->a_ops->writepage(page, wbc); @@ -701,9 +704,9 @@ mpage_writepages(struct address_space *mapping, ret = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd); if (mpd.bio) { - int wr = (wbc->sync_mode == WB_SYNC_ALL ? - WRITE_SYNC : WRITE); - mpage_bio_submit(wr, mpd.bio); + int op_flags = (wbc->sync_mode == WB_SYNC_ALL ? + WRITE_SYNC : 0); + mpage_bio_submit(REQ_OP_WRITE, op_flags, mpd.bio); } } blk_finish_plug(&plug); @@ -722,9 +725,9 @@ int mpage_writepage(struct page *page, get_block_t get_block, }; int ret = __mpage_writepage(page, wbc, &mpd); if (mpd.bio) { - int wr = (wbc->sync_mode == WB_SYNC_ALL ? - WRITE_SYNC : WRITE); - mpage_bio_submit(wr, mpd.bio); + int op_flags = (wbc->sync_mode == WB_SYNC_ALL ? + WRITE_SYNC : 0); + mpage_bio_submit(REQ_OP_WRITE, op_flags, mpd.bio); } return ret; } diff --git a/fs/namei.c b/fs/namei.c index 9d193d336c9f..ecb0b439747e 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -35,6 +35,7 @@ #include <linux/fs_struct.h> #include <linux/posix_acl.h> #include <linux/hash.h> +#include <linux/bitops.h> #include <asm/uaccess.h> #include "internal.h" @@ -1415,21 +1416,28 @@ static void follow_mount(struct path *path) } } +static int path_parent_directory(struct path *path) +{ + struct dentry *old = path->dentry; + /* rare case of legitimate dget_parent()... */ + path->dentry = dget_parent(path->dentry); + dput(old); + if (unlikely(!path_connected(path))) + return -ENOENT; + return 0; +} + static int follow_dotdot(struct nameidata *nd) { while(1) { - struct dentry *old = nd->path.dentry; - if (nd->path.dentry == nd->root.dentry && nd->path.mnt == nd->root.mnt) { break; } if (nd->path.dentry != nd->path.mnt->mnt_root) { - /* rare case of legitimate dget_parent()... */ - nd->path.dentry = dget_parent(nd->path.dentry); - dput(old); - if (unlikely(!path_connected(&nd->path))) - return -ENOENT; + int ret = path_parent_directory(&nd->path); + if (ret) + return ret; break; } if (!follow_up(&nd->path)) @@ -1441,9 +1449,8 @@ static int follow_dotdot(struct nameidata *nd) } /* - * This looks up the name in dcache, possibly revalidates the old dentry and - * allocates a new one if not found or not valid. 
In the need_lookup argument - * returns whether i_op->lookup is necessary. + * This looks up the name in dcache and possibly revalidates the found dentry. + * NULL is returned if the dentry does not exist in the cache. */ static struct dentry *lookup_dcache(const struct qstr *name, struct dentry *dir, @@ -1797,107 +1804,200 @@ static int walk_component(struct nameidata *nd, int flags) #include <asm/word-at-a-time.h> -#ifdef CONFIG_64BIT +#ifdef HASH_MIX -static inline unsigned int fold_hash(unsigned long hash) -{ - return hash_64(hash, 32); -} +/* Architecture provides HASH_MIX and fold_hash() in <asm/hash.h> */ + +#elif defined(CONFIG_64BIT) +/* + * Register pressure in the mixing function is an issue, particularly + * on 32-bit x86, but almost any function requires one state value and + * one temporary. Instead, use a function designed for two state values + * and no temporaries. + * + * This function cannot create a collision in only two iterations, so + * we have two iterations to achieve avalanche. In those two iterations, + * we have six layers of mixing, which is enough to spread one bit's + * influence out to 2^6 = 64 state bits. + * + * Rotate constants are scored by considering either 64 one-bit input + * deltas or 64*63/2 = 2016 two-bit input deltas, and finding the + * probability of that delta causing a change to each of the 128 output + * bits, using a sample of random initial states. + * + * The Shannon entropy of the computed probabilities is then summed + * to produce a score. Ideally, any input change has a 50% chance of + * toggling any given output bit. + * + * Mixing scores (in bits) for (12,45): + * Input delta: 1-bit 2-bit + * 1 round: 713.3 42542.6 + * 2 rounds: 2753.7 140389.8 + * 3 rounds: 5954.1 233458.2 + * 4 rounds: 7862.6 256672.2 + * Perfect: 8192 258048 + * (64*128) (64*63/2 * 128) + */ +#define HASH_MIX(x, y, a) \ + ( x ^= (a), \ + y ^= x, x = rol64(x,12),\ + x += y, y = rol64(y,45),\ + y *= 9 ) /* - * This is George Marsaglia's XORSHIFT generator. - * It implements a maximum-period LFSR in only a few - * instructions. It also has the property (required - * by hash_name()) that mix_hash(0) = 0. + * Fold two longs into one 32-bit hash value. This must be fast, but + * latency isn't quite as critical, as there is a fair bit of additional + * work done before the hash value is used. */ -static inline unsigned long mix_hash(unsigned long hash) +static inline unsigned int fold_hash(unsigned long x, unsigned long y) { - hash ^= hash << 13; - hash ^= hash >> 7; - hash ^= hash << 17; - return hash; + y ^= x * GOLDEN_RATIO_64; + y *= GOLDEN_RATIO_64; + return y >> 32; } #else /* 32-bit case */ -#define fold_hash(x) (x) +/* + * Mixing scores (in bits) for (7,20): + * Input delta: 1-bit 2-bit + * 1 round: 330.3 9201.6 + * 2 rounds: 1246.4 25475.4 + * 3 rounds: 1907.1 31295.1 + * 4 rounds: 2042.3 31718.6 + * Perfect: 2048 31744 + * (32*64) (32*31/2 * 64) + */ +#define HASH_MIX(x, y, a) \ + ( x ^= (a), \ + y ^= x, x = rol32(x, 7),\ + x += y, y = rol32(y,20),\ + y *= 9 ) -static inline unsigned long mix_hash(unsigned long hash) +static inline unsigned int fold_hash(unsigned long x, unsigned long y) { - hash ^= hash << 13; - hash ^= hash >> 17; - hash ^= hash << 5; - return hash; + /* Use arch-optimized multiply if one exists */ + return __hash_32(y ^ __hash_32(x)); } #endif -unsigned int full_name_hash(const unsigned char *name, unsigned int len) +/* + * Return the hash of a string of known length. 
This is carefully + designed to match hash_name(), which is the more critical function. + * In particular, we must end by hashing a final word containing 0..7 + * payload bytes, to match the way that hash_name() iterates until it + * finds the delimiter after the name. + */ +unsigned int full_name_hash(const void *salt, const char *name, unsigned int len) { - unsigned long a, hash = 0; + unsigned long a, x = 0, y = (unsigned long)salt; for (;;) { + if (!len) + goto done; a = load_unaligned_zeropad(name); if (len < sizeof(unsigned long)) break; - hash = mix_hash(hash + a); + HASH_MIX(x, y, a); name += sizeof(unsigned long); len -= sizeof(unsigned long); - if (!len) - goto done; } - hash += a & bytemask_from_count(len); + x ^= a & bytemask_from_count(len); done: - return fold_hash(hash); + return fold_hash(x, y); } EXPORT_SYMBOL(full_name_hash); +/* Return the "hash_len" (hash and length) of a null-terminated string */ +u64 hashlen_string(const void *salt, const char *name) +{ + unsigned long a = 0, x = 0, y = (unsigned long)salt; + unsigned long adata, mask, len; + const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS; + + len = 0; + goto inside; + + do { + HASH_MIX(x, y, a); + len += sizeof(unsigned long); +inside: + a = load_unaligned_zeropad(name+len); + } while (!has_zero(a, &adata, &constants)); + + adata = prep_zero_mask(a, adata, &constants); + mask = create_zero_mask(adata); + x ^= a & zero_bytemask(mask); + + return hashlen_create(fold_hash(x, y), len + find_zero(mask)); +} +EXPORT_SYMBOL(hashlen_string); + /* * Calculate the length and hash of the path component, and * return the "hash_len" as the result. */ -static inline u64 hash_name(const char *name) +static inline u64 hash_name(const void *salt, const char *name) { - unsigned long a, b, adata, bdata, mask, hash, len; + unsigned long a = 0, b, x = 0, y = (unsigned long)salt; + unsigned long adata, bdata, mask, len; const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS; - hash = a = 0; - len = -sizeof(unsigned long); + len = 0; + goto inside; + do { - hash = mix_hash(hash + a); + HASH_MIX(x, y, a); len += sizeof(unsigned long); +inside: a = load_unaligned_zeropad(name+len); b = a ^ REPEAT_BYTE('/'); } while (!(has_zero(a, &adata, &constants) | has_zero(b, &bdata, &constants))); adata = prep_zero_mask(a, adata, &constants); bdata = prep_zero_mask(b, bdata, &constants); - mask = create_zero_mask(adata | bdata); + x ^= a & zero_bytemask(mask); - hash += a & zero_bytemask(mask); - len += find_zero(mask); - return hashlen_create(fold_hash(hash), len); + return hashlen_create(fold_hash(x, y), len + find_zero(mask)); } -#else +#else /* !CONFIG_DCACHE_WORD_ACCESS: Slow, byte-at-a-time version */ -unsigned int full_name_hash(const unsigned char *name, unsigned int len) +/* Return the hash of a string of known length */ +unsigned int full_name_hash(const void *salt, const char *name, unsigned int len) { - unsigned long hash = init_name_hash(); + unsigned long hash = init_name_hash(salt); while (len--) - hash = partial_name_hash(*name++, hash); + hash = partial_name_hash((unsigned char)*name++, hash); return end_name_hash(hash); } EXPORT_SYMBOL(full_name_hash); +/* Return the "hash_len" (hash and length) of a null-terminated string */ +u64 hashlen_string(const void *salt, const char *name) +{ + unsigned long hash = init_name_hash(salt); + unsigned long len = 0, c; + + c = (unsigned char)*name; + while (c) { + len++; + hash = partial_name_hash(c, hash); + c = (unsigned char)name[len]; + } + return
hashlen_create(end_name_hash(hash), len); +} +EXPORT_SYMBOL(hashlen_string); + /* * We know there's a real path component here of at least * one character. */ -static inline u64 hash_name(const char *name) +static inline u64 hash_name(const void *salt, const char *name) { - unsigned long hash = init_name_hash(); + unsigned long hash = init_name_hash(salt); unsigned long len = 0, c; c = (unsigned char)*name; @@ -1934,10 +2034,10 @@ static int link_path_walk(const char *name, struct nameidata *nd) int type; err = may_lookup(nd); - if (err) + if (err) return err; - hash_len = hash_name(name); + hash_len = hash_name(nd->path.dentry, name); type = LAST_NORM; if (name[0] == '.') switch (hashlen_len(hash_len)) { @@ -2343,7 +2443,7 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) this.name = name; this.len = len; - this.hash = full_name_hash(name, len); + this.hash = full_name_hash(base, name, len); if (!len) return ERR_PTR(-EACCES); @@ -2396,7 +2496,7 @@ struct dentry *lookup_one_len_unlocked(const char *name, this.name = name; this.len = len; - this.hash = full_name_hash(name, len); + this.hash = full_name_hash(base, name, len); if (!len) return ERR_PTR(-EACCES); @@ -2428,6 +2528,34 @@ struct dentry *lookup_one_len_unlocked(const char *name, } EXPORT_SYMBOL(lookup_one_len_unlocked); +#ifdef CONFIG_UNIX98_PTYS +int path_pts(struct path *path) +{ + /* Find something mounted on "pts" in the same directory as + * the input path. + */ + struct dentry *child, *parent; + struct qstr this; + int ret; + + ret = path_parent_directory(path); + if (ret) + return ret; + + parent = path->dentry; + this.name = "pts"; + this.len = 3; + child = d_hash_and_lookup(parent, &this); + if (!child) + return -ENOENT; + + path->dentry = child; + dput(parent); + follow_mount(path); + return 0; +} +#endif + int user_path_at_empty(int dfd, const char __user *name, unsigned flags, struct path *path, int *empty) { @@ -2909,9 +3037,13 @@ static int atomic_open(struct nameidata *nd, struct dentry *dentry, } if (*opened & FILE_CREATED) fsnotify_create(dir, dentry); - path->dentry = dentry; - path->mnt = nd->path.mnt; - return 1; + if (unlikely(d_is_negative(dentry))) { + error = -ENOENT; + } else { + path->dentry = dentry; + path->mnt = nd->path.mnt; + return 1; + } } } dput(dentry); @@ -3080,9 +3212,7 @@ static int do_last(struct nameidata *nd, int acc_mode = op->acc_mode; unsigned seq; struct inode *inode; - struct path save_parent = { .dentry = NULL, .mnt = NULL }; struct path path; - bool retried = false; int error; nd->flags &= ~LOOKUP_PARENT; @@ -3125,7 +3255,6 @@ static int do_last(struct nameidata *nd, return -EISDIR; } -retry_lookup: if (open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) { error = mnt_want_write(nd->path.mnt); if (!error) @@ -3177,6 +3306,10 @@ retry_lookup: got_write = false; } + error = follow_managed(&path, nd); + if (unlikely(error < 0)) + return error; + if (unlikely(d_is_negative(path.dentry))) { path_to_nameidata(&path, nd); return -ENOENT; @@ -3192,10 +3325,6 @@ retry_lookup: return -EEXIST; } - error = follow_managed(&path, nd); - if (unlikely(error < 0)) - return error; - seq = 0; /* out of RCU mode, so the value doesn't matter */ inode = d_backing_inode(path.dentry); finish_lookup: @@ -3206,23 +3335,14 @@ finish_lookup: if (unlikely(error)) return error; - if ((nd->flags & LOOKUP_RCU) || nd->path.mnt != path.mnt) { - path_to_nameidata(&path, nd); - } else { - save_parent.dentry = nd->path.dentry; - save_parent.mnt = mntget(path.mnt); - nd->path.dentry = 
path.dentry; - - } + path_to_nameidata(&path, nd); nd->inode = inode; nd->seq = seq; /* Why this, you ask? _Now_ we might have grown LOOKUP_JUMPED... */ finish_open: error = complete_walk(nd); - if (error) { - path_put(&save_parent); + if (error) return error; - } audit_inode(nd->name, nd->path.dentry, 0); error = -EISDIR; if ((open_flag & O_CREAT) && d_is_dir(nd->path.dentry)) @@ -3245,13 +3365,9 @@ finish_open_created: goto out; BUG_ON(*opened & FILE_OPENED); /* once it's opened, it's opened */ error = vfs_open(&nd->path, file, current_cred()); - if (!error) { - *opened |= FILE_OPENED; - } else { - if (error == -EOPENSTALE) - goto stale_open; + if (error) goto out; - } + *opened |= FILE_OPENED; opened: error = open_check_o_direct(file); if (!error) @@ -3267,26 +3383,7 @@ out: } if (got_write) mnt_drop_write(nd->path.mnt); - path_put(&save_parent); return error; - -stale_open: - /* If no saved parent or already retried then can't retry */ - if (!save_parent.dentry || retried) - goto out; - - BUG_ON(save_parent.dentry != dir); - path_put(&nd->path); - nd->path = save_parent; - nd->inode = dir->d_inode; - save_parent.mnt = NULL; - save_parent.dentry = NULL; - if (got_write) { - mnt_drop_write(nd->path.mnt); - got_write = false; - } - retried = true; - goto retry_lookup; } static int do_tmpfile(struct nameidata *nd, unsigned flags, @@ -3627,6 +3724,8 @@ retry: switch (mode & S_IFMT) { case 0: case S_IFREG: error = vfs_create(path.dentry->d_inode,dentry,mode,true); + if (!error) + ima_post_path_mknod(dentry); break; case S_IFCHR: case S_IFBLK: error = vfs_mknod(path.dentry->d_inode,dentry,mode, @@ -4236,7 +4335,7 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, * Check source == target. * On overlayfs need to look at underlying inodes. */ - if (vfs_select_inode(old_dentry, 0) == vfs_select_inode(new_dentry, 0)) + if (d_real_inode(old_dentry) == d_real_inode(new_dentry)) return 0; error = may_delete(old_dir, old_dentry, is_dir); @@ -4540,7 +4639,6 @@ int readlink_copy(char __user *buffer, int buflen, const char *link) out: return len; } -EXPORT_SYMBOL(readlink_copy); /* * A helper for ->readlink(). This should be used *ONLY* for symlinks that diff --git a/fs/namespace.c b/fs/namespace.c index 4fb1691b4355..419f746d851d 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1562,6 +1562,7 @@ void __detach_mounts(struct dentry *dentry) goto out_unlock; lock_mount_hash(); + event++; while (!hlist_empty(&mp->m_list)) { mnt = hlist_entry(mp->m_list.first, struct mount, mnt_mp_list); if (mnt->mnt.mnt_flags & MNT_UMOUNT) { @@ -2409,8 +2410,10 @@ static int do_new_mount(struct path *path, const char *fstype, int flags, mnt_flags |= MNT_NODEV | MNT_LOCK_NODEV; } if (type->fs_flags & FS_USERNS_VISIBLE) { - if (!fs_fully_visible(type, &mnt_flags)) + if (!fs_fully_visible(type, &mnt_flags)) { + put_filesystem(type); return -EPERM; + } } } @@ -3245,6 +3248,10 @@ static bool fs_fully_visible(struct file_system_type *type, int *new_mnt_flags) if (mnt->mnt.mnt_sb->s_iflags & SB_I_NOEXEC) mnt_flags &= ~(MNT_LOCK_NOSUID | MNT_LOCK_NOEXEC); + /* Don't miss readonly hidden in the superblock flags */ + if (mnt->mnt.mnt_sb->s_flags & MS_RDONLY) + mnt_flags |= MNT_LOCK_READONLY; + /* Verify the mount flags are equal to or more permissive * than the proposed new mount. 
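The fs_fully_visible() fixes in this namespace.c hunk close two holes: a superblock mounted read-only must be treated as MNT_LOCK_READONLY before comparing restrictions, and the locked-mount test must look at each child's flags rather than the parent's. A small model of the first check (toy flag values, hypothetical names):

#include <stdbool.h>
#include <stdio.h>

#define MS_RDONLY_TOY         0x01  /* superblock-level readonly */
#define MNT_LOCK_READONLY_TOY 0x02  /* per-mount lock flag */

struct mount_model {
	unsigned int sb_flags;
	unsigned int mnt_flags;
};

/* A superblock mounted readonly must count as a locked-readonly mount
 * before deciding whether the proposed mount is at least as strict. */
static bool visible_enough(const struct mount_model *m,
			   unsigned int new_mnt_flags)
{
	unsigned int locked = m->mnt_flags;

	if (m->sb_flags & MS_RDONLY_TOY)
		locked |= MNT_LOCK_READONLY_TOY;

	/* every restriction on the existing mount must be proposed too */
	return (locked & ~new_mnt_flags) == 0;
}

int main(void)
{
	struct mount_model m = { .sb_flags = MS_RDONLY_TOY, .mnt_flags = 0 };

	printf("%d\n", visible_enough(&m, 0));  /* 0: rw request refused */
	return 0;
}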
*/ @@ -3271,7 +3278,7 @@ static bool fs_fully_visible(struct file_system_type *type, int *new_mnt_flags) list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) { struct inode *inode = child->mnt_mountpoint->d_inode; /* Only worry about locked mounts */ - if (!(mnt_flags & MNT_LOCKED)) + if (!(child->mnt.mnt_flags & MNT_LOCKED)) continue; /* Is the directory permanently empty? */ if (!is_empty_dir_inode(inode)) diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c index bfdad003ee56..9add7ab747a5 100644 --- a/fs/ncpfs/dir.c +++ b/fs/ncpfs/dir.c @@ -139,7 +139,7 @@ ncp_hash_dentry(const struct dentry *dentry, struct qstr *this) int i; t = NCP_IO_TABLE(sb); - hash = init_name_hash(); + hash = init_name_hash(dentry); for (i=0; i<this->len ; i++) hash = partial_name_hash(ncp_tolower(t, this->name[i]), hash); diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 17a42e4eb872..f55a4e756047 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -102,14 +102,15 @@ static inline void put_parallel(struct parallel_io *p) } static struct bio * -bl_submit_bio(int rw, struct bio *bio) +bl_submit_bio(struct bio *bio) { if (bio) { get_parallel(bio->bi_private); dprintk("%s submitting %s bio %u@%llu\n", __func__, - rw == READ ? "read" : "write", bio->bi_iter.bi_size, + bio_op(bio) == READ ? "read" : "write", + bio->bi_iter.bi_size, (unsigned long long)bio->bi_iter.bi_sector); - submit_bio(rw, bio); + submit_bio(bio); } return NULL; } @@ -158,7 +159,7 @@ do_add_page_to_bio(struct bio *bio, int npg, int rw, sector_t isect, if (disk_addr < map->start || disk_addr >= map->start + map->len) { if (!dev->map(dev, disk_addr, map)) return ERR_PTR(-EIO); - bio = bl_submit_bio(rw, bio); + bio = bl_submit_bio(bio); } disk_addr += map->disk_offset; disk_addr -= map->start; @@ -174,9 +175,10 @@ retry: disk_addr >> SECTOR_SHIFT, end_io, par); if (!bio) return ERR_PTR(-ENOMEM); + bio_set_op_attrs(bio, rw, 0); } if (bio_add_page(bio, page, *len, offset) < *len) { - bio = bl_submit_bio(rw, bio); + bio = bl_submit_bio(bio); goto retry; } return bio; } @@ -252,7 +254,7 @@ bl_read_pagelist(struct nfs_pgio_header *header) for (i = pg_index; i < header->page_array.npages; i++) { if (extent_length <= 0) { /* We've used up the previous extent */ - bio = bl_submit_bio(READ, bio); + bio = bl_submit_bio(bio); /* Get the next one */ if (!ext_tree_lookup(bl, isect, &be, false)) { @@ -273,7 +275,7 @@ bl_read_pagelist(struct nfs_pgio_header *header) } if (is_hole(&be)) { - bio = bl_submit_bio(READ, bio); + bio = bl_submit_bio(bio); /* Fill hole w/ zeroes w/o accessing device */ dprintk("%s Zeroing page for hole\n", __func__); zero_user_segment(pages[i], pg_offset, pg_len); @@ -306,7 +308,7 @@ bl_read_pagelist(struct nfs_pgio_header *header) header->res.count = (isect << SECTOR_SHIFT) - header->args.offset; } out: - bl_submit_bio(READ, bio); + bl_submit_bio(bio); blk_finish_plug(&plug); put_parallel(par); return PNFS_ATTEMPTED; @@ -398,7 +400,7 @@ bl_write_pagelist(struct nfs_pgio_header *header, int sync) for (i = pg_index; i < header->page_array.npages; i++) { if (extent_length <= 0) { /* We've used up the previous extent */ - bio = bl_submit_bio(WRITE, bio); + bio = bl_submit_bio(bio); /* Get the next one */ if (!ext_tree_lookup(bl, isect, &be, true)) { header->pnfs_error = -EINVAL; @@ -427,7 +429,7 @@ bl_write_pagelist(struct nfs_pgio_header *header, int sync) header->res.count = header->args.count; out: - bl_submit_bio(WRITE, bio); + bl_submit_bio(bio); blk_finish_plug(&plug);
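As in the logfs and mpage conversions earlier, the block layout driver now records the request direction in the bio itself (bio_set_op_attrs() at allocation time), so bl_submit_bio() takes only the bio and recovers the direction with bio_op(). A toy userspace model of carrying the op inside the request structure:

#include <stdio.h>

enum req_op_model { OP_READ, OP_WRITE };

struct bio_model {              /* toy stand-in for struct bio */
	enum req_op_model op;   /* the direction now rides in the bio */
	unsigned int size;
};

static void set_op_attrs(struct bio_model *bio, enum req_op_model op)
{
	bio->op = op;           /* models bio_set_op_attrs() */
}

static void submit(struct bio_model *bio)
{
	/* models submit_bio(bio): no separate rw argument any more */
	printf("submitting %s bio, %u bytes\n",
	       bio->op == OP_READ ? "read" : "write", bio->size);
}

int main(void)
{
	struct bio_model b = { .size = 4096 };

	set_op_attrs(&b, OP_WRITE);
	submit(&b);
	return 0;
}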
put_parallel(par); return PNFS_ATTEMPTED; diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 618ced381a14..aaa2e8d3df6f 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -217,7 +217,8 @@ static u32 initiate_file_draining(struct nfs_client *clp, } if (pnfs_mark_matching_lsegs_return(lo, &free_me_list, - &args->cbl_range)) { + &args->cbl_range, + be32_to_cpu(args->cbl_stateid.seqid))) { rv = NFS4_OK; goto unlock; } @@ -500,8 +501,10 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args, cps->slot = slot; /* The ca_maxresponsesize_cached is 0 with no DRC */ - if (args->csa_cachethis != 0) - return htonl(NFS4ERR_REP_TOO_BIG_TO_CACHE); + if (args->csa_cachethis != 0) { + status = htonl(NFS4ERR_REP_TOO_BIG_TO_CACHE); + goto out_unlock; + } /* * Check for pending referring calls. If a match is found, a diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index 976c90608e56..d81f96aacd51 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -146,10 +146,16 @@ static __be32 decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) p = read_buf(xdr, NFS4_STATEID_SIZE); if (unlikely(p == NULL)) return htonl(NFS4ERR_RESOURCE); - memcpy(stateid, p, NFS4_STATEID_SIZE); + memcpy(stateid->data, p, NFS4_STATEID_SIZE); return 0; } +static __be32 decode_delegation_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) +{ + stateid->type = NFS4_DELEGATION_STATEID_TYPE; + return decode_stateid(xdr, stateid); +} + static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound_hdr_arg *hdr) { __be32 *p; @@ -211,7 +217,7 @@ static __be32 decode_recall_args(struct svc_rqst *rqstp, struct xdr_stream *xdr, __be32 *p; __be32 status; - status = decode_stateid(xdr, &args->stateid); + status = decode_delegation_stateid(xdr, &args->stateid); if (unlikely(status != 0)) goto out; p = read_buf(xdr, 4); @@ -227,6 +233,11 @@ out: } #if defined(CONFIG_NFS_V4_1) +static __be32 decode_layout_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) +{ + stateid->type = NFS4_LAYOUT_STATEID_TYPE; + return decode_stateid(xdr, stateid); +} static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp, struct xdr_stream *xdr, @@ -263,7 +274,7 @@ static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp, } p = xdr_decode_hyper(p, &args->cbl_range.offset); p = xdr_decode_hyper(p, &args->cbl_range.length); - status = decode_stateid(xdr, &args->cbl_stateid); + status = decode_layout_stateid(xdr, &args->cbl_stateid); if (unlikely(status != 0)) goto out; } else if (args->cbl_recall_type == RETURN_FSID) { diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 0c96528db94a..487c5607d52f 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -1102,7 +1102,6 @@ static const struct file_operations nfs_server_list_fops = { .read = seq_read, .llseek = seq_lseek, .release = seq_release_net, - .owner = THIS_MODULE, }; static int nfs_volume_list_open(struct inode *inode, struct file *file); @@ -1123,7 +1122,6 @@ static const struct file_operations nfs_volume_list_fops = { .read = seq_read, .llseek = seq_lseek, .release = seq_release_net, - .owner = THIS_MODULE, }; /* diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 5166adcfc0fb..322c2585bc34 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -875,15 +875,16 @@ int nfs_delegations_present(struct nfs_client *clp) /** * nfs4_copy_delegation_stateid - Copy inode's state ID information - * @dst: stateid data structure to fill in * @inode: inode to check * @flags: delegation type requirement + * @dst: 
stateid data structure to fill in + * @cred: optional argument to retrieve credential * * Returns "true" and fills in "dst->data" * if inode had a delegation, * otherwise "false" is returned. */ -bool nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode, - fmode_t flags) +bool nfs4_copy_delegation_stateid(struct inode *inode, fmode_t flags, + nfs4_stateid *dst, struct rpc_cred **cred) { struct nfs_inode *nfsi = NFS_I(inode); struct nfs_delegation *delegation; @@ -896,6 +897,8 @@ bool nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode, if (ret) { nfs4_stateid_copy(dst, &delegation->stateid); nfs_mark_delegation_referenced(delegation); + if (cred) + *cred = get_rpccred(delegation->cred); } rcu_read_unlock(); return ret; diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h index 333063e032f0..64724d252a79 100644 --- a/fs/nfs/delegation.h +++ b/fs/nfs/delegation.h @@ -56,7 +56,7 @@ void nfs_delegation_reap_unclaimed(struct nfs_client *clp); int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid, int issync); int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid, fmode_t type); int nfs4_lock_delegation_recall(struct file_lock *fl, struct nfs4_state *state, const nfs4_stateid *stateid); -bool nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode, fmode_t flags); +bool nfs4_copy_delegation_stateid(struct inode *inode, fmode_t flags, nfs4_stateid *dst, struct rpc_cred **cred); void nfs_mark_delegation_referenced(struct nfs_delegation *delegation); int nfs4_have_delegation(struct inode *inode, fmode_t flags); diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index aaf7bd0cbae2..baaa38859899 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -232,7 +232,7 @@ int nfs_readdir_make_qstr(struct qstr *string, const char *name, unsigned int le * in a page cache page which kmemleak does not scan. 
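nfs4_copy_delegation_stateid() now optionally hands back a referenced rpc_cred alongside the stateid, using a nullable out parameter so callers that don't need the cred simply pass NULL. A compact sketch of that convention (toy types, hypothetical names; the caller owns the returned reference):

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct cred_model { int refs; };

static struct cred_model *get_ref(struct cred_model *c)
{
	c->refs++;
	return c;
}

struct delegation_model {
	int stateid;
	struct cred_model *cred;
};

/* Copy the stateid and, only if the caller passed a slot for it, hand
 * back a referenced cred the caller must later drop. */
static bool copy_stateid(const struct delegation_model *d, int *dst,
			 struct cred_model **credp)
{
	if (!d)
		return false;
	*dst = d->stateid;
	if (credp)
		*credp = get_ref(d->cred);
	return true;
}

int main(void)
{
	struct cred_model c = { .refs = 1 };
	struct delegation_model d = { .stateid = 42, .cred = &c };
	struct cred_model *cp = NULL;
	int sid;

	if (copy_stateid(&d, &sid, &cp))
		printf("stateid %d, cred refs now %d\n", sid, cp->refs);
	return 0;
}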
*/ kmemleak_not_leak(string->name); - string->hash = full_name_hash(name, len); + string->hash = full_name_hash(NULL, name, len); return 0; } @@ -424,12 +424,17 @@ static int xdr_decode(nfs_readdir_descriptor_t *desc, static int nfs_same_file(struct dentry *dentry, struct nfs_entry *entry) { + struct inode *inode; struct nfs_inode *nfsi; if (d_really_is_negative(dentry)) return 0; - nfsi = NFS_I(d_inode(dentry)); + inode = d_inode(dentry); + if (is_bad_inode(inode) || NFS_STALE(inode)) + return 0; + + nfsi = NFS_I(inode); if (entry->fattr->fileid == nfsi->fileid) return 1; if (nfs_compare_fh(entry->fh, &nfsi->fh) == 0) @@ -497,7 +502,7 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry) if (filename.len == 2 && filename.name[1] == '.') return; } - filename.hash = full_name_hash(filename.name, filename.len); + filename.hash = full_name_hash(parent, filename.name, filename.len); dentry = d_lookup(parent, &filename); again: @@ -729,7 +734,7 @@ struct page *get_cache_page(nfs_readdir_descriptor_t *desc) struct page *page; for (;;) { - page = read_cache_page(file_inode(desc->file)->i_mapping, + page = read_cache_page(desc->file->f_mapping, desc->page_index, (filler_t *)nfs_readdir_filler, desc); if (IS_ERR(page) || grab_page(page)) break; @@ -1363,7 +1368,6 @@ EXPORT_SYMBOL_GPL(nfs_dentry_operations); struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags) { struct dentry *res; - struct dentry *parent; struct inode *inode = NULL; struct nfs_fh *fhandle = NULL; struct nfs_fattr *fattr = NULL; @@ -1393,20 +1397,18 @@ struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned in if (IS_ERR(label)) goto out; - parent = dentry->d_parent; - /* Protect against concurrent sillydeletes */ trace_nfs_lookup_enter(dir, dentry, flags); error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, label); if (error == -ENOENT) goto no_entry; if (error < 0) { res = ERR_PTR(error); - goto out_unblock_sillyrename; + goto out_label; } inode = nfs_fhget(dentry->d_sb, fhandle, fattr, label); res = ERR_CAST(inode); if (IS_ERR(res)) - goto out_unblock_sillyrename; + goto out_label; /* Success: notify readdir to use READDIRPLUS */ nfs_advise_use_readdirplus(dir); @@ -1415,11 +1417,11 @@ no_entry: res = d_splice_alias(inode, dentry); if (res != NULL) { if (IS_ERR(res)) - goto out_unblock_sillyrename; + goto out_label; dentry = res; } nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); -out_unblock_sillyrename: +out_label: trace_nfs_lookup_exit(dir, dentry, flags, error); nfs4_label_free(label); out: @@ -1482,11 +1484,13 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry, struct file *file, unsigned open_flags, umode_t mode, int *opened) { + DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); struct nfs_open_context *ctx; struct dentry *res; struct iattr attr = { .ia_valid = ATTR_OPEN }; struct inode *inode; unsigned int lookup_flags = 0; + bool switched = false; int err; /* Expect a negative dentry */ @@ -1501,7 +1505,7 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry, /* NFS only supports OPEN on regular files */ if ((open_flags & O_DIRECTORY)) { - if (!d_unhashed(dentry)) { + if (!d_in_lookup(dentry)) { /* * Hashed negative dentry with O_DIRECTORY: dentry was * revalidated and is fine, no need to perform lookup @@ -1525,6 +1529,17 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry, attr.ia_size = 0; } + if (!(open_flags & O_CREAT) && !d_in_lookup(dentry)) { + d_drop(dentry); + switched = true; + dentry = 
d_alloc_parallel(dentry->d_parent, + &dentry->d_name, &wq); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + if (unlikely(!d_in_lookup(dentry))) + return finish_no_open(file, dentry); + } + ctx = create_nfs_open_context(dentry, open_flags); err = PTR_ERR(ctx); if (IS_ERR(ctx)) @@ -1536,9 +1551,9 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry, err = PTR_ERR(inode); trace_nfs_atomic_open_exit(dir, ctx, open_flags, err); put_nfs_open_context(ctx); + d_drop(dentry); switch (err) { case -ENOENT: - d_drop(dentry); d_add(dentry, NULL); nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); break; @@ -1560,14 +1575,23 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry, trace_nfs_atomic_open_exit(dir, ctx, open_flags, err); put_nfs_open_context(ctx); out: + if (unlikely(switched)) { + d_lookup_done(dentry); + dput(dentry); + } return err; no_open: res = nfs_lookup(dir, dentry, lookup_flags); - err = PTR_ERR(res); + if (switched) { + d_lookup_done(dentry); + if (!res) + res = dentry; + else + dput(dentry); + } if (IS_ERR(res)) - goto out; - + return PTR_ERR(res); return finish_no_open(file, res); } EXPORT_SYMBOL_GPL(nfs_atomic_open); diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 741a92c470bb..e6210ead71d0 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -87,6 +87,7 @@ struct nfs_direct_req { int mirror_count; ssize_t count, /* bytes actually processed */ + max_count, /* max expected count */ bytes_left, /* bytes left to be sent */ io_start, /* start of IO */ error; /* any reported error */ @@ -123,6 +124,8 @@ nfs_direct_good_bytes(struct nfs_direct_req *dreq, struct nfs_pgio_header *hdr) int i; ssize_t count; + WARN_ON_ONCE(dreq->count >= dreq->max_count); + if (dreq->mirror_count == 1) { dreq->mirrors[hdr->pgio_mirror_idx].count += hdr->good_bytes; dreq->count += hdr->good_bytes; @@ -241,9 +244,7 @@ static int nfs_direct_cmp_commit_data_verf(struct nfs_direct_req *dreq, /** * nfs_direct_IO - NFS address space operation for direct I/O * @iocb: target I/O control block - * @iov: array of vectors that define I/O buffer - * @pos: offset in file to begin the operation - * @nr_segs: size of iovec array + * @iter: I/O buffer * * The presence of this routine in the address space ops vector means * the NFS client supports direct I/O. 
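The nfs_atomic_open() change above drops a hashed-but-uncreated dentry and re-enters in parallel-lookup mode: d_alloc_parallel() either makes this caller the one performing the lookup (d_in_lookup() is true) or hands back the result another walker produced, and d_lookup_done() wakes the waiters. A loose userspace model of that first-caller-wins handshake (pthreads; all names hypothetical):

#include <pthread.h>
#include <stdbool.h>

struct dentry_model {
	pthread_mutex_t lock;
	pthread_cond_t done;
	bool in_lookup;     /* models d_in_lookup() */
};

/* First caller wins and performs the lookup; later callers sleep until
 * it finishes and then reuse the result. */
static bool begin_lookup(struct dentry_model *d)
{
	pthread_mutex_lock(&d->lock);
	if (!d->in_lookup) {
		d->in_lookup = true;        /* we are the lookup owner */
		pthread_mutex_unlock(&d->lock);
		return true;
	}
	while (d->in_lookup)                /* someone else is on it */
		pthread_cond_wait(&d->done, &d->lock);
	pthread_mutex_unlock(&d->lock);
	return false;
}

static void end_lookup(struct dentry_model *d)   /* models d_lookup_done() */
{
	pthread_mutex_lock(&d->lock);
	d->in_lookup = false;
	pthread_cond_broadcast(&d->done);
	pthread_mutex_unlock(&d->lock);
}

int main(void)
{
	struct dentry_model d = {
		.lock = PTHREAD_MUTEX_INITIALIZER,
		.done = PTHREAD_COND_INITIALIZER,
	};

	if (begin_lookup(&d)) {
		/* ... do the actual lookup here ... */
		end_lookup(&d);
	}
	return 0;
}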
However, for most direct IO, we @@ -275,7 +276,7 @@ static void nfs_direct_release_pages(struct page **pages, unsigned int npages) void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo, struct nfs_direct_req *dreq) { - cinfo->lock = &dreq->inode->i_lock; + cinfo->inode = dreq->inode; cinfo->mds = &dreq->mds_cinfo; cinfo->ds = &dreq->ds_cinfo; cinfo->dreq = dreq; @@ -350,10 +351,12 @@ static ssize_t nfs_direct_wait(struct nfs_direct_req *dreq) result = wait_for_completion_killable(&dreq->completion); + if (!result) { + result = dreq->count; + WARN_ON_ONCE(dreq->count < 0); + } if (!result) result = dreq->error; - if (!result) - result = dreq->count; out: return (ssize_t) result; @@ -383,8 +386,10 @@ static void nfs_direct_complete(struct nfs_direct_req *dreq, bool write) if (dreq->iocb) { long res = (long) dreq->error; - if (!res) + if (dreq->count != 0) { res = (long) dreq->count; + WARN_ON_ONCE(dreq->count < 0); + } dreq->iocb->ki_complete(dreq->iocb, res, 0); } @@ -591,7 +596,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter) goto out_unlock; dreq->inode = inode; - dreq->bytes_left = count; + dreq->bytes_left = dreq->max_count = count; dreq->io_start = iocb->ki_pos; dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); l_ctx = nfs_get_lock_context(dreq->ctx); @@ -630,13 +635,13 @@ nfs_direct_write_scan_commit_list(struct inode *inode, struct list_head *list, struct nfs_commit_info *cinfo) { - spin_lock(cinfo->lock); + spin_lock(&cinfo->inode->i_lock); #ifdef CONFIG_NFS_V4_1 if (cinfo->ds != NULL && cinfo->ds->nwritten != 0) NFS_SERVER(inode)->pnfs_curr_ld->recover_commit_reqs(list, cinfo); #endif nfs_scan_commit_list(&cinfo->mds->list, list, cinfo, 0); - spin_unlock(cinfo->lock); + spin_unlock(&cinfo->inode->i_lock); } static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) @@ -671,13 +676,13 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) if (!nfs_pageio_add_request(&desc, req)) { nfs_list_remove_request(req); nfs_list_add_request(req, &failed); - spin_lock(cinfo.lock); + spin_lock(&cinfo.inode->i_lock); dreq->flags = 0; if (desc.pg_error < 0) dreq->error = desc.pg_error; else dreq->error = -EIO; - spin_unlock(cinfo.lock); + spin_unlock(&cinfo.inode->i_lock); } nfs_release_request(req); } @@ -1023,7 +1028,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) goto out_unlock; dreq->inode = inode; - dreq->bytes_left = iov_iter_count(iter); + dreq->bytes_left = dreq->max_count = iov_iter_count(iter); dreq->io_start = pos; dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); l_ctx = nfs_get_lock_context(dreq->ctx); diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index 3384dc8e6683..aa59757389dc 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -795,7 +795,7 @@ filelayout_alloc_commit_info(struct pnfs_layout_segment *lseg, buckets[i].direct_verf.committed = NFS_INVALID_STABLE_HOW; } - spin_lock(cinfo->lock); + spin_lock(&cinfo->inode->i_lock); if (cinfo->ds->nbuckets >= size) goto out; for (i = 0; i < cinfo->ds->nbuckets; i++) { @@ -811,7 +811,7 @@ filelayout_alloc_commit_info(struct pnfs_layout_segment *lseg, swap(cinfo->ds->buckets, buckets); cinfo->ds->nbuckets = size; out: - spin_unlock(cinfo->lock); + spin_unlock(&cinfo->inode->i_lock); kfree(buckets); return 0; } @@ -890,6 +890,7 @@ filelayout_pg_init_read(struct nfs_pageio_descriptor *pgio, 0, NFS4_MAX_UINT64, IOMODE_READ, + false, GFP_KERNEL); 
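pnfs_update_layout() grows a strict_iomode argument (the new false in the calls above), and ff_layout_pg_get_read() further down uses it to retry once, strictly demanding IOMODE_READ, when the first answer is an RW segment the server wants kept away from reads. A minimal sketch of that retry-with-a-stricter-request shape (toy types; not the pNFS API):

#include <stdbool.h>
#include <stddef.h>

struct layout_model { bool rw_and_avoid_reads; };

/* Toy stand-in for pnfs_update_layout(): a strict request always
 * yields a read-suitable segment here. */
static struct layout_model *fetch_layout(bool strict_read)
{
	static struct layout_model rw = { .rw_and_avoid_reads = true };
	static struct layout_model ro = { .rw_and_avoid_reads = false };

	return strict_read ? &ro : &rw;
}

/* Take whatever segment comes back first; if it is an RW segment the
 * server wants kept away from reads, retry once strictly for READ. */
static struct layout_model *get_read_layout(void)
{
	bool strict = false;
	struct layout_model *l;

retry:
	l = fetch_layout(strict);
	if (l && !strict && l->rw_and_avoid_reads) {
		strict = true;
		goto retry;
	}
	return l;
}

int main(void)
{
	return get_read_layout()->rw_and_avoid_reads ? 1 : 0;
}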
if (IS_ERR(pgio->pg_lseg)) { pgio->pg_error = PTR_ERR(pgio->pg_lseg); @@ -915,6 +916,7 @@ filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio, 0, NFS4_MAX_UINT64, IOMODE_RW, + false, GFP_NOFS); if (IS_ERR(pgio->pg_lseg)) { pgio->pg_error = PTR_ERR(pgio->pg_lseg); diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 0cb1abd535e3..0e8018bc9880 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -26,6 +26,8 @@ #define FF_LAYOUT_POLL_RETRY_MAX (15*HZ) +static struct group_info *ff_zero_group; + static struct pnfs_layout_hdr * ff_layout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags) { @@ -53,14 +55,15 @@ ff_layout_free_layout_hdr(struct pnfs_layout_hdr *lo) kfree(FF_LAYOUT_FROM_HDR(lo)); } -static int decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) +static int decode_pnfs_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) { __be32 *p; p = xdr_inline_decode(xdr, NFS4_STATEID_SIZE); if (unlikely(p == NULL)) return -ENOBUFS; - memcpy(stateid, p, NFS4_STATEID_SIZE); + stateid->type = NFS4_PNFS_DS_STATEID_TYPE; + memcpy(stateid->data, p, NFS4_STATEID_SIZE); dprintk("%s: stateid id= [%x%x%x%x]\n", __func__, p[0], p[1], p[2], p[3]); return 0; @@ -211,10 +214,16 @@ static struct nfs4_ff_layout_mirror *ff_layout_alloc_mirror(gfp_t gfp_flags) static void ff_layout_free_mirror(struct nfs4_ff_layout_mirror *mirror) { + struct rpc_cred *cred; + ff_layout_remove_mirror(mirror); kfree(mirror->fh_versions); - if (mirror->cred) - put_rpccred(mirror->cred); + cred = rcu_access_pointer(mirror->ro_cred); + if (cred) + put_rpccred(cred); + cred = rcu_access_pointer(mirror->rw_cred); + if (cred) + put_rpccred(cred); nfs4_ff_layout_put_deviceid(mirror->mirror_ds); kfree(mirror); } @@ -290,6 +299,8 @@ ff_lseg_merge(struct pnfs_layout_segment *new, { u64 new_end, old_end; + if (test_bit(NFS_LSEG_LAYOUTRETURN, &old->pls_flags)) + return false; if (new->pls_range.iomode != old->pls_range.iomode) return false; old_end = pnfs_calc_offset_end(old->pls_range.offset, @@ -310,8 +321,6 @@ ff_lseg_merge(struct pnfs_layout_segment *new, new_end); if (test_bit(NFS_LSEG_ROC, &old->pls_flags)) set_bit(NFS_LSEG_ROC, &new->pls_flags); - if (test_bit(NFS_LSEG_LAYOUTRETURN, &old->pls_flags)) - set_bit(NFS_LSEG_LAYOUTRETURN, &new->pls_flags); return true; } @@ -407,8 +416,9 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, struct nfs4_ff_layout_mirror *mirror; struct nfs4_deviceid devid; struct nfs4_deviceid_node *idnode; - u32 ds_count; - u32 fh_count; + struct auth_cred acred = { .group_info = ff_zero_group }; + struct rpc_cred __rcu *cred; + u32 ds_count, fh_count, id; int j; rc = -EIO; @@ -456,7 +466,7 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, fls->mirror_array[i]->efficiency = be32_to_cpup(p); /* stateid */ - rc = decode_stateid(&stream, &fls->mirror_array[i]->stateid); + rc = decode_pnfs_stateid(&stream, &fls->mirror_array[i]->stateid); if (rc) goto out_err_free; @@ -484,24 +494,49 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, fls->mirror_array[i]->fh_versions_cnt = fh_count; /* user */ - rc = decode_name(&stream, &fls->mirror_array[i]->uid); + rc = decode_name(&stream, &id); if (rc) goto out_err_free; + acred.uid = make_kuid(&init_user_ns, id); + /* group */ - rc = decode_name(&stream, &fls->mirror_array[i]->gid); + rc = decode_name(&stream, &id); if (rc) goto out_err_free; + acred.gid = make_kgid(&init_user_ns, id); + + /* find the cred for it */ + rcu_assign_pointer(cred, 
rpc_lookup_generic_cred(&acred, 0, gfp_flags)); + if (IS_ERR(cred)) { + rc = PTR_ERR(cred); + goto out_err_free; + } + + if (lgr->range.iomode == IOMODE_READ) + rcu_assign_pointer(fls->mirror_array[i]->ro_cred, cred); + else + rcu_assign_pointer(fls->mirror_array[i]->rw_cred, cred); + mirror = ff_layout_add_mirror(lh, fls->mirror_array[i]); if (mirror != fls->mirror_array[i]) { + /* swap cred ptrs so free_mirror will clean up old */ + if (lgr->range.iomode == IOMODE_READ) { + cred = xchg(&mirror->ro_cred, cred); + rcu_assign_pointer(fls->mirror_array[i]->ro_cred, cred); + } else { + cred = xchg(&mirror->rw_cred, cred); + rcu_assign_pointer(fls->mirror_array[i]->rw_cred, cred); + } ff_layout_free_mirror(fls->mirror_array[i]); fls->mirror_array[i] = mirror; } - dprintk("%s: uid %d gid %d\n", __func__, - fls->mirror_array[i]->uid, - fls->mirror_array[i]->gid); + dprintk("%s: iomode %s uid %u gid %u\n", __func__, + lgr->range.iomode == IOMODE_READ ? "READ" : "RW", + from_kuid(&init_user_ns, acred.uid), + from_kgid(&init_user_ns, acred.gid)); } p = xdr_inline_decode(&stream, 4); @@ -745,7 +780,7 @@ ff_layout_alloc_commit_info(struct pnfs_layout_segment *lseg, else { int i; - spin_lock(cinfo->lock); + spin_lock(&cinfo->inode->i_lock); if (cinfo->ds->nbuckets != 0) kfree(buckets); else { @@ -759,7 +794,7 @@ ff_layout_alloc_commit_info(struct pnfs_layout_segment *lseg, NFS_INVALID_STABLE_HOW; } } - spin_unlock(cinfo->lock); + spin_unlock(&cinfo->inode->i_lock); return 0; } } @@ -786,6 +821,36 @@ ff_layout_choose_best_ds_for_read(struct pnfs_layout_segment *lseg, } static void +ff_layout_pg_get_read(struct nfs_pageio_descriptor *pgio, + struct nfs_page *req, + bool strict_iomode) +{ +retry_strict: + pnfs_put_lseg(pgio->pg_lseg); + pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, + req->wb_context, + 0, + NFS4_MAX_UINT64, + IOMODE_READ, + strict_iomode, + GFP_KERNEL); + if (IS_ERR(pgio->pg_lseg)) { + pgio->pg_error = PTR_ERR(pgio->pg_lseg); + pgio->pg_lseg = NULL; + } + + /* If we don't have checking, do get a IOMODE_RW + * segment, and the server wants to avoid READs + * there, then retry! 
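With the mirror's uid/gid fields replaced by per-iomode rpc_cred pointers, each mirror now carries one credential for READ segments and one for RW, and the I/O paths further down take a reference they drop with put_rpccred() once the RPC is sent. A toy model of that select-and-reference pattern (the kernel version additionally involves RCU):

#include <stdbool.h>
#include <stdio.h>

struct cred_model { int refs; };

static struct cred_model *get_ref(struct cred_model *c)
{
	if (c)
		c->refs++;
	return c;
}

static void put_ref(struct cred_model *c)
{
	if (c)
		c->refs--;
}

struct mirror_model {
	struct cred_model *ro_cred;   /* from a READ layout segment */
	struct cred_model *rw_cred;   /* from an RW layout segment */
};

/* Pick the credential matching the I/O direction and take a reference
 * the caller drops once the RPC has been sent. */
static struct cred_model *mirror_cred(struct mirror_model *m, bool writing)
{
	return get_ref(writing ? m->rw_cred : m->ro_cred);
}

int main(void)
{
	struct cred_model ro = { .refs = 1 }, rw = { .refs = 1 };
	struct mirror_model m = { .ro_cred = &ro, .rw_cred = &rw };
	struct cred_model *c = mirror_cred(&m, true);

	/* ... issue the write RPC using c ... */
	put_ref(c);
	printf("rw cred refs back to %d\n", rw.refs);
	return 0;
}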
+ */ + if (pgio->pg_lseg && !strict_iomode && + ff_layout_avoid_read_on_rw(pgio->pg_lseg)) { + strict_iomode = true; + goto retry_strict; + } +} + +static void ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) { @@ -795,26 +860,23 @@ ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio, int ds_idx; /* Use full layout for now */ - if (!pgio->pg_lseg) { - pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, - req->wb_context, - 0, - NFS4_MAX_UINT64, - IOMODE_READ, - GFP_KERNEL); - if (IS_ERR(pgio->pg_lseg)) { - pgio->pg_error = PTR_ERR(pgio->pg_lseg); - pgio->pg_lseg = NULL; - return; - } - } + if (!pgio->pg_lseg) + ff_layout_pg_get_read(pgio, req, false); + else if (ff_layout_avoid_read_on_rw(pgio->pg_lseg)) + ff_layout_pg_get_read(pgio, req, true); + /* If no lseg, fall back to read through mds */ if (pgio->pg_lseg == NULL) goto out_mds; ds = ff_layout_choose_best_ds_for_read(pgio->pg_lseg, 0, &ds_idx); - if (!ds) - goto out_mds; + if (!ds) { + if (ff_layout_no_fallback_to_mds(pgio->pg_lseg)) + goto out_pnfs; + else + goto out_mds; + } + mirror = FF_LAYOUT_COMP(pgio->pg_lseg, ds_idx); pgio->pg_mirror_idx = ds_idx; @@ -828,6 +890,12 @@ out_mds: pnfs_put_lseg(pgio->pg_lseg); pgio->pg_lseg = NULL; nfs_pageio_reset_read_mds(pgio); + return; + +out_pnfs: + pnfs_set_lo_fail(pgio->pg_lseg); + pnfs_put_lseg(pgio->pg_lseg); + pgio->pg_lseg = NULL; } static void @@ -847,6 +915,7 @@ ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio, 0, NFS4_MAX_UINT64, IOMODE_RW, + false, GFP_NOFS); if (IS_ERR(pgio->pg_lseg)) { pgio->pg_error = PTR_ERR(pgio->pg_lseg); @@ -870,8 +939,12 @@ ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio, for (i = 0; i < pgio->pg_mirror_count; i++) { ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, i, true); - if (!ds) - goto out_mds; + if (!ds) { + if (ff_layout_no_fallback_to_mds(pgio->pg_lseg)) + goto out_pnfs; + else + goto out_mds; + } pgm = &pgio->pg_mirrors[i]; mirror = FF_LAYOUT_COMP(pgio->pg_lseg, i); pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].wsize; @@ -883,6 +956,12 @@ out_mds: pnfs_put_lseg(pgio->pg_lseg); pgio->pg_lseg = NULL; nfs_pageio_reset_write_mds(pgio); + return; + +out_pnfs: + pnfs_set_lo_fail(pgio->pg_lseg); + pnfs_put_lseg(pgio->pg_lseg); + pgio->pg_lseg = NULL; } static unsigned int @@ -895,6 +974,7 @@ ff_layout_pg_get_mirror_count_write(struct nfs_pageio_descriptor *pgio, 0, NFS4_MAX_UINT64, IOMODE_RW, + false, GFP_NOFS); if (IS_ERR(pgio->pg_lseg)) { pgio->pg_error = PTR_ERR(pgio->pg_lseg); @@ -1067,8 +1147,7 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task, rpc_wake_up(&tbl->slot_tbl_waitq); /* fall through */ default: - if (ff_layout_no_fallback_to_mds(lseg) || - ff_layout_has_available_ds(lseg)) + if (ff_layout_avoid_mds_available_ds(lseg)) return -NFS4ERR_RESET_TO_PNFS; reset: dprintk("%s Retry through MDS. 
Error %d\n", __func__, @@ -1215,8 +1294,6 @@ static int ff_layout_read_done_cb(struct rpc_task *task, hdr->pgio_mirror_idx + 1, &hdr->pgio_mirror_idx)) goto out_eagain; - set_bit(NFS_LAYOUT_RETURN_REQUESTED, - &hdr->lseg->pls_layout->plh_flags); pnfs_read_resend_pnfs(hdr); return task->tk_status; case -NFS4ERR_RESET_TO_MDS: @@ -1260,7 +1337,7 @@ ff_layout_set_layoutcommit(struct nfs_pgio_header *hdr) } static bool -ff_layout_reset_to_mds(struct pnfs_layout_segment *lseg, int idx) +ff_layout_device_unavailable(struct pnfs_layout_segment *lseg, int idx) { /* No mirroring for now */ struct nfs4_deviceid_node *node = FF_LAYOUT_DEVID_NODE(lseg, idx); @@ -1297,16 +1374,10 @@ static int ff_layout_read_prepare_common(struct rpc_task *task, rpc_exit(task, -EIO); return -EIO; } - if (ff_layout_reset_to_mds(hdr->lseg, hdr->pgio_mirror_idx)) { - dprintk("%s task %u reset io to MDS\n", __func__, task->tk_pid); - if (ff_layout_has_available_ds(hdr->lseg)) - pnfs_read_resend_pnfs(hdr); - else - ff_layout_reset_read(hdr); - rpc_exit(task, 0); + if (ff_layout_device_unavailable(hdr->lseg, hdr->pgio_mirror_idx)) { + rpc_exit(task, -EHOSTDOWN); return -EAGAIN; } - hdr->pgio_done_cb = ff_layout_read_done_cb; ff_layout_read_record_layoutstats_start(task, hdr); return 0; @@ -1496,14 +1567,8 @@ static int ff_layout_write_prepare_common(struct rpc_task *task, return -EIO; } - if (ff_layout_reset_to_mds(hdr->lseg, hdr->pgio_mirror_idx)) { - bool retry_pnfs; - - retry_pnfs = ff_layout_has_available_ds(hdr->lseg); - dprintk("%s task %u reset io to %s\n", __func__, - task->tk_pid, retry_pnfs ? "pNFS" : "MDS"); - ff_layout_reset_write(hdr, retry_pnfs); - rpc_exit(task, 0); + if (ff_layout_device_unavailable(hdr->lseg, hdr->pgio_mirror_idx)) { + rpc_exit(task, -EHOSTDOWN); return -EAGAIN; } @@ -1712,7 +1777,7 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr) goto out_failed; ds_cred = ff_layout_get_ds_cred(lseg, idx, hdr->cred); - if (IS_ERR(ds_cred)) + if (!ds_cred) goto out_failed; vers = nfs4_ff_layout_ds_version(lseg, idx); @@ -1720,6 +1785,7 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr) dprintk("%s USE DS: %s cl_count %d vers %d\n", __func__, ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count), vers); + hdr->pgio_done_cb = ff_layout_read_done_cb; atomic_inc(&ds->ds_clp->cl_count); hdr->ds_clp = ds->ds_clp; fh = nfs4_ff_layout_select_ds_fh(lseg, idx); @@ -1737,11 +1803,11 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr) vers == 3 ? &ff_layout_read_call_ops_v3 : &ff_layout_read_call_ops_v4, 0, RPC_TASK_SOFTCONN); - + put_rpccred(ds_cred); return PNFS_ATTEMPTED; out_failed: - if (ff_layout_has_available_ds(lseg)) + if (ff_layout_avoid_mds_available_ds(lseg)) return PNFS_TRY_AGAIN; return PNFS_NOT_ATTEMPTED; } @@ -1769,7 +1835,7 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync) return PNFS_NOT_ATTEMPTED; ds_cred = ff_layout_get_ds_cred(lseg, idx, hdr->cred); - if (IS_ERR(ds_cred)) + if (!ds_cred) return PNFS_NOT_ATTEMPTED; vers = nfs4_ff_layout_ds_version(lseg, idx); @@ -1798,6 +1864,7 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync) vers == 3 ? 
&ff_layout_write_call_ops_v3 : &ff_layout_write_call_ops_v4, sync, RPC_TASK_SOFTCONN); + put_rpccred(ds_cred); return PNFS_ATTEMPTED; } @@ -1824,7 +1891,7 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how) struct rpc_clnt *ds_clnt; struct rpc_cred *ds_cred; u32 idx; - int vers; + int vers, ret; struct nfs_fh *fh; idx = calc_ds_index_from_commit(lseg, data->ds_commit_index); @@ -1838,7 +1905,7 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how) goto out_err; ds_cred = ff_layout_get_ds_cred(lseg, idx, data->cred); - if (IS_ERR(ds_cred)) + if (!ds_cred) goto out_err; vers = nfs4_ff_layout_ds_version(lseg, idx); @@ -1854,10 +1921,12 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how) if (fh) data->args.fh = fh; - return nfs_initiate_commit(ds_clnt, data, ds->ds_clp->rpc_ops, + ret = nfs_initiate_commit(ds_clnt, data, ds->ds_clp->rpc_ops, vers == 3 ? &ff_layout_commit_call_ops_v3 : &ff_layout_commit_call_ops_v4, how, RPC_TASK_SOFTCONN); + put_rpccred(ds_cred); + return ret; out_err: pnfs_generic_prepare_to_resend_writes(data); pnfs_generic_commit_release(data); @@ -2223,6 +2292,11 @@ static int __init nfs4flexfilelayout_init(void) { printk(KERN_INFO "%s: NFSv4 Flexfile Layout Driver Registering...\n", __func__); + if (!ff_zero_group) { + ff_zero_group = groups_alloc(0); + if (!ff_zero_group) + return -ENOMEM; + } return pnfs_register_layoutdriver(&flexfilelayout_type); } @@ -2231,6 +2305,10 @@ static void __exit nfs4flexfilelayout_exit(void) printk(KERN_INFO "%s: NFSv4 Flexfile Layout Driver Unregistering...\n", __func__); pnfs_unregister_layoutdriver(&flexfilelayout_type); + if (ff_zero_group) { + put_group_info(ff_zero_group); + ff_zero_group = NULL; + } } MODULE_ALIAS("nfs-layouttype4-4"); diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h index dd353bb7dc0a..1bcdb15d0c41 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.h +++ b/fs/nfs/flexfilelayout/flexfilelayout.h @@ -10,7 +10,8 @@ #define FS_NFS_NFS4FLEXFILELAYOUT_H #define FF_FLAGS_NO_LAYOUTCOMMIT 1 -#define FF_FLAGS_NO_IO_THRU_MDS 2 +#define FF_FLAGS_NO_IO_THRU_MDS 2 +#define FF_FLAGS_NO_READ_IO 4 #include "../pnfs.h" @@ -76,9 +77,8 @@ struct nfs4_ff_layout_mirror { u32 fh_versions_cnt; struct nfs_fh *fh_versions; nfs4_stateid stateid; - u32 uid; - u32 gid; - struct rpc_cred *cred; + struct rpc_cred __rcu *ro_cred; + struct rpc_cred __rcu *rw_cred; atomic_t ref; spinlock_t lock; struct nfs4_ff_layoutstat read_stat; @@ -154,6 +154,12 @@ ff_layout_no_fallback_to_mds(struct pnfs_layout_segment *lseg) } static inline bool +ff_layout_no_read_on_rw(struct pnfs_layout_segment *lseg) +{ + return FF_LAYOUT_LSEG(lseg)->flags & FF_FLAGS_NO_READ_IO; +} + +static inline bool ff_layout_test_devid_unavailable(struct nfs4_deviceid_node *node) { return nfs4_test_deviceid_unavailable(node); @@ -192,4 +198,7 @@ nfs4_ff_find_or_create_ds_client(struct pnfs_layout_segment *lseg, struct rpc_cred *ff_layout_get_ds_cred(struct pnfs_layout_segment *lseg, u32 ds_idx, struct rpc_cred *mdscred); bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg); +bool ff_layout_avoid_mds_available_ds(struct pnfs_layout_segment *lseg); +bool ff_layout_avoid_read_on_rw(struct pnfs_layout_segment *lseg); + #endif /* FS_NFS_NFS4FLEXFILELAYOUT_H */ diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c index add0e5a70bd6..0aa36be71fce 100644 --- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c +++ 
b/fs/nfs/flexfilelayout/flexfilelayoutdev.c @@ -228,7 +228,8 @@ ff_ds_error_match(const struct nfs4_ff_layout_ds_err *e1, return e1->opnum < e2->opnum ? -1 : 1; if (e1->status != e2->status) return e1->status < e2->status ? -1 : 1; - ret = memcmp(&e1->stateid, &e2->stateid, sizeof(e1->stateid)); + ret = memcmp(e1->stateid.data, e2->stateid.data, + sizeof(e1->stateid.data)); if (ret != 0) return ret; ret = memcmp(&e1->deviceid, &e2->deviceid, sizeof(e1->deviceid)); @@ -302,40 +303,26 @@ int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo, return 0; } -/* currently we only support AUTH_NONE and AUTH_SYS */ -static rpc_authflavor_t -nfs4_ff_layout_choose_authflavor(struct nfs4_ff_layout_mirror *mirror) +static struct rpc_cred * +ff_layout_get_mirror_cred(struct nfs4_ff_layout_mirror *mirror, u32 iomode) { - if (mirror->uid == (u32)-1) - return RPC_AUTH_NULL; - return RPC_AUTH_UNIX; -} + struct rpc_cred *cred, __rcu **pcred; -/* fetch cred for NFSv3 DS */ -static int ff_layout_update_mirror_cred(struct nfs4_ff_layout_mirror *mirror, - struct nfs4_pnfs_ds *ds) -{ - if (ds->ds_clp && !mirror->cred && - mirror->mirror_ds->ds_versions[0].version == 3) { - struct rpc_auth *auth = ds->ds_clp->cl_rpcclient->cl_auth; - struct rpc_cred *cred; - struct auth_cred acred = { - .uid = make_kuid(&init_user_ns, mirror->uid), - .gid = make_kgid(&init_user_ns, mirror->gid), - }; - - /* AUTH_NULL ignores acred */ - cred = auth->au_ops->lookup_cred(auth, &acred, 0); - if (IS_ERR(cred)) { - dprintk("%s: lookup_cred failed with %ld\n", - __func__, PTR_ERR(cred)); - return PTR_ERR(cred); - } else { - if (cmpxchg(&mirror->cred, NULL, cred)) - put_rpccred(cred); - } - } - return 0; + if (iomode == IOMODE_READ) + pcred = &mirror->ro_cred; + else + pcred = &mirror->rw_cred; + + rcu_read_lock(); + do { + cred = rcu_dereference(*pcred); + if (!cred) + break; + + cred = get_rpccred_rcu(cred); + } while(!cred); + rcu_read_unlock(); + return cred; } struct nfs_fh * @@ -356,7 +343,23 @@ out: return fh; } -/* Upon return, either ds is connected, or ds is NULL */ +/** + * nfs4_ff_layout_prepare_ds - prepare a DS connection for an RPC call + * @lseg: the layout segment we're operating on + * @ds_idx: index of the DS to use + * @fail_return: return layout on connect failure? + * + * Try to prepare a DS connection to accept an RPC call. This involves + * selecting a mirror to use and connecting the client to it if it's not + * already connected. + * + * Since we only need a single functioning mirror to satisfy a read, we don't + * want to return the layout if there is one. For writes though, any down + * mirror should result in a LAYOUTRETURN. @fail_return is how we distinguish + * between the two cases. + * + * Returns a pointer to a connected DS object on success or NULL on failure. 
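+ *
+ * A sketch of the call pattern implied above (the read side reaches
+ * this through ff_layout_choose_best_ds_for_read(), the write side
+ * calls it directly for each mirror):
+ *
+ *	reads:  ds = nfs4_ff_layout_prepare_ds(lseg, ds_idx, false);
+ *	writes: ds = nfs4_ff_layout_prepare_ds(lseg, ds_idx, true);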
+ */ struct nfs4_pnfs_ds * nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx, bool fail_return) @@ -367,7 +370,6 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx, struct inode *ino = lseg->pls_layout->plh_inode; struct nfs_server *s = NFS_SERVER(ino); unsigned int max_payload; - rpc_authflavor_t flavor; if (!ff_layout_mirror_valid(lseg, mirror)) { pr_err_ratelimited("NFS: %s: No data server for offset index %d\n", @@ -383,9 +385,7 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx, /* matching smp_wmb() in _nfs4_pnfs_v3/4_ds_connect */ smp_rmb(); if (ds->ds_clp) - goto out_update_creds; - - flavor = nfs4_ff_layout_choose_authflavor(mirror); + goto out; /* FIXME: For now we assume the server sent only one version of NFS * to use for the DS. @@ -394,7 +394,7 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx, dataserver_retrans, mirror->mirror_ds->ds_versions[0].version, mirror->mirror_ds->ds_versions[0].minor_version, - flavor); + RPC_AUTH_UNIX); /* connect success, check rsize/wsize limit */ if (ds->ds_clp) { @@ -410,20 +410,10 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx, mirror, lseg->pls_range.offset, lseg->pls_range.length, NFS4ERR_NXIO, OP_ILLEGAL, GFP_NOIO); - if (!fail_return) { - if (ff_layout_has_available_ds(lseg)) - set_bit(NFS_LAYOUT_RETURN_REQUESTED, - &lseg->pls_layout->plh_flags); - else - pnfs_error_mark_layout_for_return(ino, lseg); - } else + if (fail_return || !ff_layout_has_available_ds(lseg)) pnfs_error_mark_layout_for_return(ino, lseg); ds = NULL; - goto out; } -out_update_creds: - if (ff_layout_update_mirror_cred(mirror, ds)) - ds = NULL; out: return ds; } @@ -433,16 +423,15 @@ ff_layout_get_ds_cred(struct pnfs_layout_segment *lseg, u32 ds_idx, struct rpc_cred *mdscred) { struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx); - struct rpc_cred *cred = ERR_PTR(-EINVAL); - - if (!nfs4_ff_layout_prepare_ds(lseg, ds_idx, true)) - goto out; + struct rpc_cred *cred; - if (mirror && mirror->cred) - cred = mirror->cred; - else - cred = mdscred; -out: + if (mirror) { + cred = ff_layout_get_mirror_cred(mirror, lseg->pls_range.iomode); + if (!cred) + cred = get_rpccred(mdscred); + } else { + cred = get_rpccred(mdscred); + } return cred; } @@ -562,6 +551,18 @@ bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg) return ff_rw_layout_has_available_ds(lseg); } +bool ff_layout_avoid_mds_available_ds(struct pnfs_layout_segment *lseg) +{ + return ff_layout_no_fallback_to_mds(lseg) || + ff_layout_has_available_ds(lseg); +} + +bool ff_layout_avoid_read_on_rw(struct pnfs_layout_segment *lseg) +{ + return lseg->pls_range.iomode == IOMODE_RW && + ff_layout_no_read_on_rw(lseg); +} + module_param(dataserver_retrans, uint, 0644); MODULE_PARM_DESC(dataserver_retrans, "The number of times the NFSv4.1 client " "retries a request before it attempts further " diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 52e7d6869e3b..dda689d7a8a7 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -282,6 +282,7 @@ nfs_init_locked(struct inode *inode, void *opaque) struct nfs_fattr *fattr = desc->fattr; set_nfs_fileid(inode, fattr->fileid); + inode->i_mode = fattr->mode; nfs_copy_fh(NFS_FH(inode), desc->fh); return 0; } diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index f1d1d2c472e9..5154fa65a2f2 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -477,6 +477,7 @@ void nfs_mark_request_commit(struct nfs_page *req, u32 ds_commit_idx); int 
nfs_write_need_commit(struct nfs_pgio_header *); void nfs_writeback_update_inode(struct nfs_pgio_header *hdr); +int nfs_commit_file(struct file *file, struct nfs_write_verifier *verf); int nfs_generic_commit_list(struct inode *inode, struct list_head *head, int how, struct nfs_commit_info *cinfo); void nfs_retry_commit(struct list_head *page_list, diff --git a/fs/nfs/nfs42.h b/fs/nfs/nfs42.h index b587ccd31083..b6cd15314bab 100644 --- a/fs/nfs/nfs42.h +++ b/fs/nfs/nfs42.h @@ -13,6 +13,7 @@ /* nfs4.2proc.c */ int nfs42_proc_allocate(struct file *, loff_t, loff_t); +ssize_t nfs42_proc_copy(struct file *, loff_t, struct file *, loff_t, size_t); int nfs42_proc_deallocate(struct file *, loff_t, loff_t); loff_t nfs42_proc_llseek(struct file *, loff_t, int); int nfs42_proc_layoutstats_generic(struct nfs_server *, diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index dff83460e5a6..aa03ed09ba06 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -126,6 +126,111 @@ int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len) return err; } +static ssize_t _nfs42_proc_copy(struct file *src, loff_t pos_src, + struct nfs_lock_context *src_lock, + struct file *dst, loff_t pos_dst, + struct nfs_lock_context *dst_lock, + size_t count) +{ + struct nfs42_copy_args args = { + .src_fh = NFS_FH(file_inode(src)), + .src_pos = pos_src, + .dst_fh = NFS_FH(file_inode(dst)), + .dst_pos = pos_dst, + .count = count, + }; + struct nfs42_copy_res res; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COPY], + .rpc_argp = &args, + .rpc_resp = &res, + }; + struct inode *dst_inode = file_inode(dst); + struct nfs_server *server = NFS_SERVER(dst_inode); + int status; + + status = nfs4_set_rw_stateid(&args.src_stateid, src_lock->open_context, + src_lock, FMODE_READ); + if (status) + return status; + + status = nfs4_set_rw_stateid(&args.dst_stateid, dst_lock->open_context, + dst_lock, FMODE_WRITE); + if (status) + return status; + + status = nfs4_call_sync(server->client, server, &msg, + &args.seq_args, &res.seq_res, 0); + if (status == -ENOTSUPP) + server->caps &= ~NFS_CAP_COPY; + if (status) + return status; + + if (res.write_res.verifier.committed != NFS_FILE_SYNC) { + status = nfs_commit_file(dst, &res.write_res.verifier.verifier); + if (status) + return status; + } + + truncate_pagecache_range(dst_inode, pos_dst, + pos_dst + res.write_res.count); + + return res.write_res.count; +} + +ssize_t nfs42_proc_copy(struct file *src, loff_t pos_src, + struct file *dst, loff_t pos_dst, + size_t count) +{ + struct nfs_server *server = NFS_SERVER(file_inode(dst)); + struct nfs_lock_context *src_lock; + struct nfs_lock_context *dst_lock; + struct nfs4_exception src_exception = { }; + struct nfs4_exception dst_exception = { }; + ssize_t err, err2; + + if (!nfs_server_capable(file_inode(dst), NFS_CAP_COPY)) + return -EOPNOTSUPP; + + src_lock = nfs_get_lock_context(nfs_file_open_context(src)); + if (IS_ERR(src_lock)) + return PTR_ERR(src_lock); + + src_exception.inode = file_inode(src); + src_exception.state = src_lock->open_context->state; + + dst_lock = nfs_get_lock_context(nfs_file_open_context(dst)); + if (IS_ERR(dst_lock)) { + err = PTR_ERR(dst_lock); + goto out_put_src_lock; + } + + dst_exception.inode = file_inode(dst); + dst_exception.state = dst_lock->open_context->state; + + do { + inode_lock(file_inode(dst)); + err = _nfs42_proc_copy(src, pos_src, src_lock, + dst, pos_dst, dst_lock, count); + inode_unlock(file_inode(dst)); + + if (err == -ENOTSUPP) { + err = -EOPNOTSUPP; + 
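+ /*
+  * _nfs42_proc_copy() has already cleared NFS_CAP_COPY for this
+  * server, so later callers fail fast in the capability check
+  * above instead of issuing another COPY.
+  */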
break; + } + + err2 = nfs4_handle_exception(server, err, &src_exception); + err = nfs4_handle_exception(server, err, &dst_exception); + if (!err) + err = err2; + } while (src_exception.retry || dst_exception.retry); + + nfs_put_lock_context(dst_lock); +out_put_src_lock: + nfs_put_lock_context(src_lock); + return err; +} + static loff_t _nfs42_proc_llseek(struct file *filep, struct nfs_lock_context *lock, loff_t offset, int whence) { @@ -232,7 +337,7 @@ nfs42_layoutstat_done(struct rpc_task *task, void *calldata) * with the current stateid. */ set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); - pnfs_mark_matching_lsegs_invalid(lo, &head, NULL); + pnfs_mark_matching_lsegs_invalid(lo, &head, NULL, 0); spin_unlock(&inode->i_lock); pnfs_free_lseg_list(&head); } else diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index 0ca482a51e53..6dc6f2aea0d6 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -9,9 +9,22 @@ #define encode_fallocate_maxsz (encode_stateid_maxsz + \ 2 /* offset */ + \ 2 /* length */) +#define NFS42_WRITE_RES_SIZE (1 /* wr_callback_id size */ +\ + XDR_QUADLEN(NFS4_STATEID_SIZE) + \ + 2 /* wr_count */ + \ + 1 /* wr_committed */ + \ + XDR_QUADLEN(NFS4_VERIFIER_SIZE)) #define encode_allocate_maxsz (op_encode_hdr_maxsz + \ encode_fallocate_maxsz) #define decode_allocate_maxsz (op_decode_hdr_maxsz) +#define encode_copy_maxsz (op_encode_hdr_maxsz + \ + XDR_QUADLEN(NFS4_STATEID_SIZE) + \ + XDR_QUADLEN(NFS4_STATEID_SIZE) + \ + 2 + 2 + 2 + 1 + 1 + 1) +#define decode_copy_maxsz (op_decode_hdr_maxsz + \ + NFS42_WRITE_RES_SIZE + \ + 1 /* cr_consecutive */ + \ + 1 /* cr_synchronous */) #define encode_deallocate_maxsz (op_encode_hdr_maxsz + \ encode_fallocate_maxsz) #define decode_deallocate_maxsz (op_decode_hdr_maxsz) @@ -49,6 +62,16 @@ decode_putfh_maxsz + \ decode_allocate_maxsz + \ decode_getattr_maxsz) +#define NFS4_enc_copy_sz (compound_encode_hdr_maxsz + \ + encode_putfh_maxsz + \ + encode_savefh_maxsz + \ + encode_putfh_maxsz + \ + encode_copy_maxsz) +#define NFS4_dec_copy_sz (compound_decode_hdr_maxsz + \ + decode_putfh_maxsz + \ + decode_savefh_maxsz + \ + decode_putfh_maxsz + \ + decode_copy_maxsz) #define NFS4_enc_deallocate_sz (compound_encode_hdr_maxsz + \ encode_putfh_maxsz + \ encode_deallocate_maxsz + \ @@ -102,6 +125,23 @@ static void encode_allocate(struct xdr_stream *xdr, encode_fallocate(xdr, args); } +static void encode_copy(struct xdr_stream *xdr, + struct nfs42_copy_args *args, + struct compound_hdr *hdr) +{ + encode_op_hdr(xdr, OP_COPY, decode_copy_maxsz, hdr); + encode_nfs4_stateid(xdr, &args->src_stateid); + encode_nfs4_stateid(xdr, &args->dst_stateid); + + encode_uint64(xdr, args->src_pos); + encode_uint64(xdr, args->dst_pos); + encode_uint64(xdr, args->count); + + encode_uint32(xdr, 1); /* consecutive = true */ + encode_uint32(xdr, 1); /* synchronous = true */ + encode_uint32(xdr, 0); /* src server list */ +} + static void encode_deallocate(struct xdr_stream *xdr, struct nfs42_falloc_args *args, struct compound_hdr *hdr) @@ -182,6 +222,26 @@ static void nfs4_xdr_enc_allocate(struct rpc_rqst *req, } /* + * Encode COPY request + */ +static void nfs4_xdr_enc_copy(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs42_copy_args *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->src_fh, &hdr); + encode_savefh(xdr, &hdr); + encode_putfh(xdr, args->dst_fh, &hdr); + encode_copy(xdr, args, 
&hdr); + encode_nops(&hdr); +} + +/* * Encode DEALLOCATE request */ static void nfs4_xdr_enc_deallocate(struct rpc_rqst *req, @@ -266,6 +326,62 @@ static int decode_allocate(struct xdr_stream *xdr, struct nfs42_falloc_res *res) return decode_op_hdr(xdr, OP_ALLOCATE); } +static int decode_write_response(struct xdr_stream *xdr, + struct nfs42_write_res *res) +{ + __be32 *p; + int stateids; + + p = xdr_inline_decode(xdr, 4 + 8 + 4); + if (unlikely(!p)) + goto out_overflow; + + stateids = be32_to_cpup(p++); + p = xdr_decode_hyper(p, &res->count); + res->verifier.committed = be32_to_cpup(p); + return decode_verifier(xdr, &res->verifier.verifier); + +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_copy_requirements(struct xdr_stream *xdr, + struct nfs42_copy_res *res) { + __be32 *p; + + p = xdr_inline_decode(xdr, 4 + 4); + if (unlikely(!p)) + goto out_overflow; + + res->consecutive = be32_to_cpup(p++); + res->synchronous = be32_to_cpup(p++); + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_copy(struct xdr_stream *xdr, struct nfs42_copy_res *res) +{ + int status; + + status = decode_op_hdr(xdr, OP_COPY); + if (status == NFS4ERR_OFFLOAD_NO_REQS) { + status = decode_copy_requirements(xdr, res); + if (status) + return status; + return NFS4ERR_OFFLOAD_NO_REQS; + } else if (status) + return status; + + status = decode_write_response(xdr, &res->write_res); + if (status) + return status; + + return decode_copy_requirements(xdr, res); +} + static int decode_deallocate(struct xdr_stream *xdr, struct nfs42_falloc_res *res) { return decode_op_hdr(xdr, OP_DEALLOCATE); @@ -331,6 +447,36 @@ out: } /* + * Decode COPY response + */ +static int nfs4_xdr_dec_copy(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs42_copy_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_savefh(xdr); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_copy(xdr, res); +out: + return status; +} + +/* * Decode DEALLOCATE request */ static int nfs4_xdr_dec_deallocate(struct rpc_rqst *rqstp, diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 4afdee420d25..768456fa1b17 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -438,8 +438,9 @@ extern void nfs41_handle_server_scope(struct nfs_client *, struct nfs41_server_scope **); extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp); extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl); -extern int nfs4_select_rw_stateid(nfs4_stateid *, struct nfs4_state *, - fmode_t, const struct nfs_lockowner *); +extern int nfs4_select_rw_stateid(struct nfs4_state *, fmode_t, + const struct nfs_lockowner *, nfs4_stateid *, + struct rpc_cred **); extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask); extern int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task); @@ -496,12 +497,15 @@ extern struct svc_version nfs4_callback_version4; static inline void nfs4_stateid_copy(nfs4_stateid *dst, const nfs4_stateid *src) { - memcpy(dst, src, sizeof(*dst)); + memcpy(dst->data, src->data, sizeof(dst->data)); + dst->type = src->type; } static inline bool nfs4_stateid_match(const nfs4_stateid *dst, const nfs4_stateid *src) { - return memcmp(dst, 
src, sizeof(*dst)) == 0; + if (dst->type != src->type) + return false; + return memcmp(dst->data, src->data, sizeof(dst->data)) == 0; } static inline bool nfs4_stateid_match_other(const nfs4_stateid *dst, const nfs4_stateid *src) diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index d0390516467c..014b0e41ace5 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -129,6 +129,28 @@ nfs4_file_flush(struct file *file, fl_owner_t id) } #ifdef CONFIG_NFS_V4_2 +static ssize_t nfs4_copy_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + size_t count, unsigned int flags) +{ + struct inode *in_inode = file_inode(file_in); + struct inode *out_inode = file_inode(file_out); + int ret; + + if (in_inode == out_inode) + return -EINVAL; + + /* flush any pending writes */ + ret = nfs_sync_inode(in_inode); + if (ret) + return ret; + ret = nfs_sync_inode(out_inode); + if (ret) + return ret; + + return nfs42_proc_copy(file_in, pos_in, file_out, pos_out, count); +} + static loff_t nfs4_file_llseek(struct file *filep, loff_t offset, int whence) { loff_t ret; @@ -243,6 +265,7 @@ const struct file_operations nfs4_file_operations = { .check_flags = nfs_check_flags, .setlease = simple_nosetlease, #ifdef CONFIG_NFS_V4_2 + .copy_file_range = nfs4_copy_file_range, .llseek = nfs4_file_llseek, .fallocate = nfs42_fallocate, .clone_file_range = nfs42_clone_file_range, diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c index 5ba22c6b0ffa..c444285bb1b1 100644 --- a/fs/nfs/nfs4idmap.c +++ b/fs/nfs/nfs4idmap.c @@ -201,7 +201,7 @@ int nfs_idmap_init(void) GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred, (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_VIEW | KEY_USR_READ, - KEY_ALLOC_NOT_IN_QUOTA, NULL); + KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL); if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); goto failed_put_cred; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 084e8570da18..ff416d0e24bc 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -74,6 +74,17 @@ #define NFS4_POLL_RETRY_MIN (HZ/10) #define NFS4_POLL_RETRY_MAX (15*HZ) +/* file attributes which can be mapped to nfs attributes */ +#define NFS4_VALID_ATTRS (ATTR_MODE \ + | ATTR_UID \ + | ATTR_GID \ + | ATTR_SIZE \ + | ATTR_ATIME \ + | ATTR_MTIME \ + | ATTR_CTIME \ + | ATTR_ATIME_SET \ + | ATTR_MTIME_SET) + struct nfs4_opendata; static int _nfs4_proc_open(struct nfs4_opendata *data); static int _nfs4_recover_proc_open(struct nfs4_opendata *data); @@ -416,6 +427,7 @@ static int nfs4_do_handle_exception(struct nfs_server *server, case -NFS4ERR_DELAY: nfs_inc_server_stats(server, NFSIOS_DELAY); case -NFS4ERR_GRACE: + case -NFS4ERR_RECALLCONFLICT: exception->delay = 1; return 0; @@ -2558,15 +2570,20 @@ static int _nfs4_do_open(struct inode *dir, if ((opendata->o_arg.open_flags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL) && (opendata->o_arg.createmode != NFS4_CREATE_GUARDED)) { nfs4_exclusive_attrset(opendata, sattr, &label); - - nfs_fattr_init(opendata->o_res.f_attr); - status = nfs4_do_setattr(state->inode, cred, - opendata->o_res.f_attr, sattr, - state, label, olabel); - if (status == 0) { - nfs_setattr_update_inode(state->inode, sattr, - opendata->o_res.f_attr); - nfs_setsecurity(state->inode, opendata->o_res.f_attr, olabel); + /* + * send create attributes which was not set by open + * with an extra setattr. 
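+ * If ia_valid carries none of the NFS4_VALID_ATTRS bits, there is
+ * nothing the extra SETATTR could convey, so skip the round trip.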
+ */ + if (sattr->ia_valid & NFS4_VALID_ATTRS) { + nfs_fattr_init(opendata->o_res.f_attr); + status = nfs4_do_setattr(state->inode, cred, + opendata->o_res.f_attr, sattr, + state, label, olabel); + if (status == 0) { + nfs_setattr_update_inode(state->inode, sattr, + opendata->o_res.f_attr); + nfs_setsecurity(state->inode, opendata->o_res.f_attr, olabel); + } } } if (opened && opendata->file_created) @@ -2676,6 +2693,7 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, .rpc_resp = &res, .rpc_cred = cred, }; + struct rpc_cred *delegation_cred = NULL; unsigned long timestamp = jiffies; fmode_t fmode; bool truncate; @@ -2691,7 +2709,7 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, truncate = (sattr->ia_valid & ATTR_SIZE) ? true : false; fmode = truncate ? FMODE_WRITE : FMODE_READ; - if (nfs4_copy_delegation_stateid(&arg.stateid, inode, fmode)) { + if (nfs4_copy_delegation_stateid(inode, fmode, &arg.stateid, &delegation_cred)) { /* Use that stateid */ } else if (truncate && state != NULL) { struct nfs_lockowner lockowner = { @@ -2700,13 +2718,17 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, }; if (!nfs4_valid_open_stateid(state)) return -EBADF; - if (nfs4_select_rw_stateid(&arg.stateid, state, FMODE_WRITE, - &lockowner) == -EIO) + if (nfs4_select_rw_stateid(state, FMODE_WRITE, &lockowner, + &arg.stateid, &delegation_cred) == -EIO) return -EBADF; } else nfs4_stateid_copy(&arg.stateid, &zero_stateid); + if (delegation_cred) + msg.rpc_cred = delegation_cred; status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); + + put_rpccred(delegation_cred); if (status == 0 && state != NULL) renew_lease(server, timestamp); trace_nfs4_setattr(inode, &arg.stateid, status); @@ -2860,12 +2882,11 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) call_close |= is_wronly; else if (is_wronly) calldata->arg.fmode |= FMODE_WRITE; + if (calldata->arg.fmode != (FMODE_READ|FMODE_WRITE)) + call_close |= is_rdwr; } else if (is_rdwr) calldata->arg.fmode |= FMODE_READ|FMODE_WRITE; - if (calldata->arg.fmode == 0) - call_close |= is_rdwr; - if (!nfs4_valid_open_stateid(state)) call_close = 0; spin_unlock(&state->owner->so_lock); @@ -4285,7 +4306,7 @@ int nfs4_set_rw_stateid(nfs4_stateid *stateid, if (l_ctx != NULL) lockowner = &l_ctx->lockowner; - return nfs4_select_rw_stateid(stateid, ctx->state, fmode, lockowner); + return nfs4_select_rw_stateid(ctx->state, fmode, lockowner, stateid, NULL); } EXPORT_SYMBOL_GPL(nfs4_set_rw_stateid); @@ -4993,12 +5014,11 @@ static int nfs4_do_set_security_label(struct inode *inode, } static int -nfs4_set_security_label(struct dentry *dentry, const void *buf, size_t buflen) +nfs4_set_security_label(struct inode *inode, const void *buf, size_t buflen) { struct nfs4_label ilabel, *olabel = NULL; struct nfs_fattr fattr; struct rpc_cred *cred; - struct inode *inode = d_inode(dentry); int status; if (!nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL)) @@ -6054,6 +6074,7 @@ static int nfs41_lock_expired(struct nfs4_state *state, struct file_lock *reques static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request) { struct nfs_inode *nfsi = NFS_I(state->inode); + struct nfs4_state_owner *sp = state->owner; unsigned char fl_flags = request->fl_flags; int status = -ENOLCK; @@ -6068,6 +6089,7 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock status = do_vfs_lock(state->inode, request); if (status < 0) goto out; + 
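+ /*
+  * so_delegreturn_mutex keeps a concurrent delegation return from
+  * racing with the NFS_DELEGATED_STATE check below, which decides
+  * whether this lock may be cached locally.
+  */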
mutex_lock(&sp->so_delegreturn_mutex); down_read(&nfsi->rwsem); if (test_bit(NFS_DELEGATED_STATE, &state->flags)) { /* Yes: cache locks! */ @@ -6075,9 +6097,11 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock request->fl_flags = fl_flags & ~FL_SLEEP; status = do_vfs_lock(state->inode, request); up_read(&nfsi->rwsem); + mutex_unlock(&sp->so_delegreturn_mutex); goto out; } up_read(&nfsi->rwsem); + mutex_unlock(&sp->so_delegreturn_mutex); status = _nfs4_do_setlk(state, cmd, request, NFS_LOCK_NEW); out: request->fl_flags = fl_flags; @@ -6255,11 +6279,11 @@ nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_state *lsp) #define XATTR_NAME_NFSV4_ACL "system.nfs4_acl" static int nfs4_xattr_set_nfs4_acl(const struct xattr_handler *handler, - struct dentry *dentry, const char *key, - const void *buf, size_t buflen, - int flags) + struct dentry *unused, struct inode *inode, + const char *key, const void *buf, + size_t buflen, int flags) { - return nfs4_proc_set_acl(d_inode(dentry), buf, buflen); + return nfs4_proc_set_acl(inode, buf, buflen); } static int nfs4_xattr_get_nfs4_acl(const struct xattr_handler *handler, @@ -6277,12 +6301,12 @@ static bool nfs4_xattr_list_nfs4_acl(struct dentry *dentry) #ifdef CONFIG_NFS_V4_SECURITY_LABEL static int nfs4_xattr_set_nfs4_label(const struct xattr_handler *handler, - struct dentry *dentry, const char *key, - const void *buf, size_t buflen, - int flags) + struct dentry *unused, struct inode *inode, + const char *key, const void *buf, + size_t buflen, int flags) { if (security_ismaclabel(key)) - return nfs4_set_security_label(dentry, buf, buflen); + return nfs4_set_security_label(inode, buf, buflen); return -EOPNOTSUPP; } @@ -7351,9 +7375,11 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo) * always set csa_cachethis to FALSE because the current implementation * of the back channel DRC only supports caching the CB_SEQUENCE operation. 
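 *
 * The back channel request/response sizes are taken from
 * rpc_max_bc_payload(), so a transport may advertise a backchannel
 * payload limit other than the previously hard-coded PAGE_SIZE.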
*/ -static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args) +static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args, + struct rpc_clnt *clnt) { unsigned int max_rqst_sz, max_resp_sz; + unsigned int max_bc_payload = rpc_max_bc_payload(clnt); max_rqst_sz = NFS_MAX_FILE_IO_SIZE + nfs41_maxwrite_overhead; max_resp_sz = NFS_MAX_FILE_IO_SIZE + nfs41_maxread_overhead; @@ -7371,8 +7397,8 @@ static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args) args->fc_attrs.max_ops, args->fc_attrs.max_reqs); /* Back channel attributes */ - args->bc_attrs.max_rqst_sz = PAGE_SIZE; - args->bc_attrs.max_resp_sz = PAGE_SIZE; + args->bc_attrs.max_rqst_sz = max_bc_payload; + args->bc_attrs.max_resp_sz = max_bc_payload; args->bc_attrs.max_resp_sz_cached = 0; args->bc_attrs.max_ops = NFS4_MAX_BACK_CHANNEL_OPS; args->bc_attrs.max_reqs = NFS41_BC_MAX_CALLBACKS; @@ -7476,7 +7502,7 @@ static int _nfs4_proc_create_session(struct nfs_client *clp, }; int status; - nfs4_init_channel_attrs(&args); + nfs4_init_channel_attrs(&args, clp->cl_rpcclient); args.flags = (SESSION4_PERSIST | SESSION4_BACK_CHAN); status = rpc_call_sync(session->clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); @@ -7820,40 +7846,34 @@ nfs4_layoutget_prepare(struct rpc_task *task, void *calldata) struct nfs4_layoutget *lgp = calldata; struct nfs_server *server = NFS_SERVER(lgp->args.inode); struct nfs4_session *session = nfs4_get_session(server); - int ret; dprintk("--> %s\n", __func__); - /* Note the is a race here, where a CB_LAYOUTRECALL can come in - * right now covering the LAYOUTGET we are about to send. - * However, that is not so catastrophic, and there seems - * to be no way to prevent it completely. - */ - if (nfs41_setup_sequence(session, &lgp->args.seq_args, - &lgp->res.seq_res, task)) - return; - ret = pnfs_choose_layoutget_stateid(&lgp->args.stateid, - NFS_I(lgp->args.inode)->layout, - &lgp->args.range, - lgp->args.ctx->state); - if (ret < 0) - rpc_exit(task, ret); + nfs41_setup_sequence(session, &lgp->args.seq_args, + &lgp->res.seq_res, task); + dprintk("<-- %s\n", __func__); } static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) { struct nfs4_layoutget *lgp = calldata; + + dprintk("--> %s\n", __func__); + nfs41_sequence_done(task, &lgp->res.seq_res); + dprintk("<-- %s\n", __func__); +} + +static int +nfs4_layoutget_handle_exception(struct rpc_task *task, + struct nfs4_layoutget *lgp, struct nfs4_exception *exception) +{ struct inode *inode = lgp->args.inode; struct nfs_server *server = NFS_SERVER(inode); struct pnfs_layout_hdr *lo; - struct nfs4_state *state = NULL; - unsigned long timeo, now, giveup; + int status = task->tk_status; dprintk("--> %s tk_status => %d\n", __func__, -task->tk_status); - if (!nfs41_sequence_done(task, &lgp->res.seq_res)) - goto out; - - switch (task->tk_status) { + switch (status) { case 0: goto out; @@ -7863,62 +7883,48 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) * retry go inband. */ case -NFS4ERR_LAYOUTUNAVAILABLE: - task->tk_status = -ENODATA; + status = -ENODATA; goto out; /* * NFS4ERR_BADLAYOUT means the MDS cannot return a layout of * length lgp->args.minlength != 0 (see RFC5661 section 18.43.3). */ case -NFS4ERR_BADLAYOUT: - goto out_overflow; + status = -EOVERFLOW; + goto out; /* * NFS4ERR_LAYOUTTRYLATER is a conflict with another client * (or clients) writing to the same RAID stripe except when * the minlength argument is 0 (see RFC5661 section 18.43.3). 
+ * + * Treat it like we would RECALLCONFLICT -- we retry for a little + * while, and then eventually give up. */ case -NFS4ERR_LAYOUTTRYLATER: - if (lgp->args.minlength == 0) - goto out_overflow; - /* - * NFS4ERR_RECALLCONFLICT is when conflict with self (must recall - * existing layout before getting a new one). - */ - case -NFS4ERR_RECALLCONFLICT: - timeo = rpc_get_timeout(task->tk_client); - giveup = lgp->args.timestamp + timeo; - now = jiffies; - if (time_after(giveup, now)) { - unsigned long delay; - - /* Delay for: - * - Not less then NFS4_POLL_RETRY_MIN. - * - One last time a jiffie before we give up - * - exponential backoff (time_now minus start_attempt) - */ - delay = max_t(unsigned long, NFS4_POLL_RETRY_MIN, - min((giveup - now - 1), - now - lgp->args.timestamp)); - - dprintk("%s: NFS4ERR_RECALLCONFLICT waiting %lu\n", - __func__, delay); - rpc_delay(task, delay); - /* Do not call nfs4_async_handle_error() */ - goto out_restart; + if (lgp->args.minlength == 0) { + status = -EOVERFLOW; + goto out; } - break; + /* Fallthrough */ + case -NFS4ERR_RECALLCONFLICT: + nfs4_handle_exception(server, -NFS4ERR_RECALLCONFLICT, + exception); + status = -ERECALLCONFLICT; + goto out; case -NFS4ERR_EXPIRED: case -NFS4ERR_BAD_STATEID: + exception->timeout = 0; spin_lock(&inode->i_lock); if (nfs4_stateid_match(&lgp->args.stateid, &lgp->args.ctx->state->stateid)) { spin_unlock(&inode->i_lock); /* If the open stateid was bad, then recover it. */ - state = lgp->args.ctx->state; + exception->state = lgp->args.ctx->state; break; } lo = NFS_I(inode)->layout; - if (lo && nfs4_stateid_match(&lgp->args.stateid, - &lo->plh_stateid)) { + if (lo && !test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags) && + nfs4_stateid_match_other(&lgp->args.stateid, &lo->plh_stateid)) { LIST_HEAD(head); /* @@ -7926,25 +7932,21 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) * with the current stateid. 
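 *
 * Setting NFS_LAYOUT_INVALID_STID below also steers the retried
 * pnfs_update_layout() to the open stateid for its next LAYOUTGET
 * instead of reusing the rejected layout stateid.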
*/ set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); - pnfs_mark_matching_lsegs_invalid(lo, &head, NULL); + pnfs_mark_matching_lsegs_invalid(lo, &head, NULL, 0); spin_unlock(&inode->i_lock); pnfs_free_lseg_list(&head); + status = -EAGAIN; + goto out; } else spin_unlock(&inode->i_lock); - goto out_restart; } - if (nfs4_async_handle_error(task, server, state, &lgp->timeout) == -EAGAIN) - goto out_restart; + + status = nfs4_handle_exception(server, status, exception); + if (exception->retry) + status = -EAGAIN; out: dprintk("<-- %s\n", __func__); - return; -out_restart: - task->tk_status = 0; - rpc_restart_call_prepare(task); - return; -out_overflow: - task->tk_status = -EOVERFLOW; - goto out; + return status; } static size_t max_response_pages(struct nfs_server *server) @@ -8013,7 +8015,7 @@ static const struct rpc_call_ops nfs4_layoutget_call_ops = { }; struct pnfs_layout_segment * -nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags) +nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout, gfp_t gfp_flags) { struct inode *inode = lgp->args.inode; struct nfs_server *server = NFS_SERVER(inode); @@ -8033,6 +8035,10 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags) .flags = RPC_TASK_ASYNC, }; struct pnfs_layout_segment *lseg = NULL; + struct nfs4_exception exception = { + .inode = inode, + .timeout = *timeout, + }; int status = 0; dprintk("--> %s\n", __func__); @@ -8046,7 +8052,6 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags) return ERR_PTR(-ENOMEM); } lgp->args.layout.pglen = max_pages * PAGE_SIZE; - lgp->args.timestamp = jiffies; lgp->res.layoutp = &lgp->args.layout; lgp->res.seq_res.sr_slot = NULL; @@ -8056,13 +8061,17 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags) if (IS_ERR(task)) return ERR_CAST(task); status = nfs4_wait_for_completion_rpc_task(task); - if (status == 0) - status = task->tk_status; + if (status == 0) { + status = nfs4_layoutget_handle_exception(task, lgp, &exception); + *timeout = exception.timeout; + } + trace_nfs4_layoutget(lgp->args.ctx, &lgp->args.range, &lgp->res.range, &lgp->res.stateid, status); + /* if layoutp->len is 0, nfs4_layoutget_prepare called rpc_exit */ if (status == 0 && lgp->res.layoutp->len) lseg = pnfs_layout_process(lgp); @@ -8118,7 +8127,8 @@ static void nfs4_layoutreturn_release(void *calldata) dprintk("--> %s\n", __func__); spin_lock(&lo->plh_inode->i_lock); - pnfs_mark_matching_lsegs_invalid(lo, &freeme, &lrp->args.range); + pnfs_mark_matching_lsegs_invalid(lo, &freeme, &lrp->args.range, + be32_to_cpu(lrp->args.stateid.seqid)); pnfs_mark_layout_returned_if_empty(lo); if (lrp->res.lrs_present) pnfs_set_layout_stateid(lo, &lrp->res.stateid, true); @@ -8653,6 +8663,9 @@ nfs41_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp) static bool nfs41_match_stateid(const nfs4_stateid *s1, const nfs4_stateid *s2) { + if (s1->type != s2->type) + return false; + if (memcmp(s1->other, s2->other, sizeof(s1->other)) != 0) return false; @@ -8793,6 +8806,7 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = { | NFS_CAP_STATEID_NFSV41 | NFS_CAP_ATOMIC_OPEN_V1 | NFS_CAP_ALLOCATE + | NFS_CAP_COPY | NFS_CAP_DEALLOCATE | NFS_CAP_SEEK | NFS_CAP_LAYOUTSTATS diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index d854693a15b0..834b875900d6 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -65,7 +65,10 @@ #define OPENOWNER_POOL_SIZE 8 -const nfs4_stateid zero_stateid; +const nfs4_stateid zero_stateid = { + { .data = { 0 } }, + .type = 
NFS4_SPECIAL_STATEID_TYPE, +}; static DEFINE_MUTEX(nfs_clid_init_mutex); int nfs4_init_clientid(struct nfs_client *clp, struct rpc_cred *cred) @@ -985,15 +988,20 @@ static void nfs4_copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state) * Byte-range lock aware utility to initialize the stateid of read/write * requests. */ -int nfs4_select_rw_stateid(nfs4_stateid *dst, struct nfs4_state *state, - fmode_t fmode, const struct nfs_lockowner *lockowner) +int nfs4_select_rw_stateid(struct nfs4_state *state, + fmode_t fmode, const struct nfs_lockowner *lockowner, + nfs4_stateid *dst, struct rpc_cred **cred) { - int ret = nfs4_copy_lock_stateid(dst, state, lockowner); + int ret; + + if (cred != NULL) + *cred = NULL; + ret = nfs4_copy_lock_stateid(dst, state, lockowner); if (ret == -EIO) /* A lost lock - don't even consider delegations */ goto out; /* returns true if delegation stateid found and copied */ - if (nfs4_copy_delegation_stateid(dst, state->inode, fmode)) { + if (nfs4_copy_delegation_stateid(state->inode, fmode, dst, cred)) { ret = 0; goto out; } @@ -1480,9 +1488,9 @@ restart: } spin_unlock(&state->state_lock); } - nfs4_put_open_state(state); clear_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags); + nfs4_put_open_state(state); spin_lock(&sp->so_lock); goto restart; } diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h index 2c8d05dae5b1..9c150b153782 100644 --- a/fs/nfs/nfs4trace.h +++ b/fs/nfs/nfs4trace.h @@ -1520,6 +1520,8 @@ DEFINE_NFS4_INODE_EVENT(nfs4_layoutreturn_on_close); { PNFS_UPDATE_LAYOUT_FOUND_CACHED, "found cached" }, \ { PNFS_UPDATE_LAYOUT_RETURN, "layoutreturn" }, \ { PNFS_UPDATE_LAYOUT_BLOCKED, "layouts blocked" }, \ + { PNFS_UPDATE_LAYOUT_INVALID_OPEN, "invalid open" }, \ + { PNFS_UPDATE_LAYOUT_RETRY, "retrying" }, \ { PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET, "sent layoutget" }) TRACE_EVENT(pnfs_update_layout, @@ -1528,9 +1530,10 @@ TRACE_EVENT(pnfs_update_layout, u64 count, enum pnfs_iomode iomode, struct pnfs_layout_hdr *lo, + struct pnfs_layout_segment *lseg, enum pnfs_update_layout_reason reason ), - TP_ARGS(inode, pos, count, iomode, lo, reason), + TP_ARGS(inode, pos, count, iomode, lo, lseg, reason), TP_STRUCT__entry( __field(dev_t, dev) __field(u64, fileid) @@ -1540,6 +1543,7 @@ TRACE_EVENT(pnfs_update_layout, __field(enum pnfs_iomode, iomode) __field(int, layoutstateid_seq) __field(u32, layoutstateid_hash) + __field(long, lseg) __field(enum pnfs_update_layout_reason, reason) ), TP_fast_assign( @@ -1559,11 +1563,12 @@ TRACE_EVENT(pnfs_update_layout, __entry->layoutstateid_seq = 0; __entry->layoutstateid_hash = 0; } + __entry->lseg = (long)lseg; ), TP_printk( "fileid=%02x:%02x:%llu fhandle=0x%08x " "iomode=%s pos=%llu count=%llu " - "layoutstateid=%d:0x%08x (%s)", + "layoutstateid=%d:0x%08x lseg=0x%lx (%s)", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, __entry->fhandle, @@ -1571,6 +1576,7 @@ TRACE_EVENT(pnfs_update_layout, (unsigned long long)__entry->pos, (unsigned long long)__entry->count, __entry->layoutstateid_seq, __entry->layoutstateid_hash, + __entry->lseg, show_pnfs_update_layout_reason(__entry->reason) ) ); diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 88474a4fc669..661e753fe1c9 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -4270,6 +4270,24 @@ static int decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) return decode_opaque_fixed(xdr, stateid, NFS4_STATEID_SIZE); } +static int decode_open_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) +{ + stateid->type = NFS4_OPEN_STATEID_TYPE; + 
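+ /*
+  * The type is client-side annotation only: decode_stateid() still
+  * consumes the same NFS4_STATEID_SIZE opaque, and the tag exists
+  * so nfs4_stateid_match() can reject stateids of the wrong class.
+  */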
return decode_stateid(xdr, stateid); +} + +static int decode_lock_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) +{ + stateid->type = NFS4_LOCK_STATEID_TYPE; + return decode_stateid(xdr, stateid); +} + +static int decode_delegation_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) +{ + stateid->type = NFS4_DELEGATION_STATEID_TYPE; + return decode_stateid(xdr, stateid); +} + static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res) { int status; @@ -4278,7 +4296,7 @@ static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res) if (status != -EIO) nfs_increment_open_seqid(status, res->seqid); if (!status) - status = decode_stateid(xdr, &res->stateid); + status = decode_open_stateid(xdr, &res->stateid); return status; } @@ -4937,7 +4955,7 @@ static int decode_lock(struct xdr_stream *xdr, struct nfs_lock_res *res) if (status == -EIO) goto out; if (status == 0) { - status = decode_stateid(xdr, &res->stateid); + status = decode_lock_stateid(xdr, &res->stateid); if (unlikely(status)) goto out; } else if (status == -NFS4ERR_DENIED) @@ -4966,7 +4984,7 @@ static int decode_locku(struct xdr_stream *xdr, struct nfs_locku_res *res) if (status != -EIO) nfs_increment_lock_seqid(status, res->seqid); if (status == 0) - status = decode_stateid(xdr, &res->stateid); + status = decode_lock_stateid(xdr, &res->stateid); return status; } @@ -5016,7 +5034,7 @@ static int decode_rw_delegation(struct xdr_stream *xdr, __be32 *p; int status; - status = decode_stateid(xdr, &res->delegation); + status = decode_delegation_stateid(xdr, &res->delegation); if (unlikely(status)) return status; p = xdr_inline_decode(xdr, 4); @@ -5096,7 +5114,7 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res) nfs_increment_open_seqid(status, res->seqid); if (status) return status; - status = decode_stateid(xdr, &res->stateid); + status = decode_open_stateid(xdr, &res->stateid); if (unlikely(status)) return status; @@ -5136,7 +5154,7 @@ static int decode_open_confirm(struct xdr_stream *xdr, struct nfs_open_confirmre if (status != -EIO) nfs_increment_open_seqid(status, res->seqid); if (!status) - status = decode_stateid(xdr, &res->stateid); + status = decode_open_stateid(xdr, &res->stateid); return status; } @@ -5148,7 +5166,7 @@ static int decode_open_downgrade(struct xdr_stream *xdr, struct nfs_closeres *re if (status != -EIO) nfs_increment_open_seqid(status, res->seqid); if (!status) - status = decode_stateid(xdr, &res->stateid); + status = decode_open_stateid(xdr, &res->stateid); return status; } @@ -5838,6 +5856,12 @@ out_overflow: } #if defined(CONFIG_NFS_V4_1) +static int decode_layout_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) +{ + stateid->type = NFS4_LAYOUT_STATEID_TYPE; + return decode_stateid(xdr, stateid); +} + static int decode_getdeviceinfo(struct xdr_stream *xdr, struct nfs4_getdeviceinfo_res *res) { @@ -5919,7 +5943,7 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req, if (unlikely(!p)) goto out_overflow; res->return_on_close = be32_to_cpup(p); - decode_stateid(xdr, &res->stateid); + decode_layout_stateid(xdr, &res->stateid); p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) goto out_overflow; @@ -5985,7 +6009,7 @@ static int decode_layoutreturn(struct xdr_stream *xdr, goto out_overflow; res->lrs_present = be32_to_cpup(p); if (res->lrs_present) - status = decode_stateid(xdr, &res->stateid); + status = decode_layout_stateid(xdr, &res->stateid); return status; out_overflow: print_overflow_msg(__func__, xdr); @@ -7515,6 +7539,7 
@@ struct rpc_procinfo nfs4_procedures[] = { PROC(DEALLOCATE, enc_deallocate, dec_deallocate), PROC(LAYOUTSTATS, enc_layoutstats, dec_layoutstats), PROC(CLONE, enc_clone, dec_clone), + PROC(COPY, enc_copy, dec_copy), #endif /* CONFIG_NFS_V4_2 */ }; diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 1f6db4231057..174dd4cf5747 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -341,8 +341,10 @@ nfs_create_request(struct nfs_open_context *ctx, struct page *page, * long write-back delay. This will be adjusted in * update_nfs_request below if the region is not locked. */ req->wb_page = page; - req->wb_index = page_file_index(page); - get_page(page); + if (page) { + req->wb_index = page_file_index(page); + get_page(page); + } req->wb_offset = offset; req->wb_pgbase = offset; req->wb_bytes = count; diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 89a5ef4df08a..0fbe734cc38c 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -270,7 +270,7 @@ pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo, }; set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); - return pnfs_mark_matching_lsegs_invalid(lo, lseg_list, &range); + return pnfs_mark_matching_lsegs_invalid(lo, lseg_list, &range, 0); } static int @@ -308,7 +308,7 @@ pnfs_layout_io_set_failed(struct pnfs_layout_hdr *lo, u32 iomode) spin_lock(&inode->i_lock); pnfs_layout_set_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode)); - pnfs_mark_matching_lsegs_invalid(lo, &head, &range); + pnfs_mark_matching_lsegs_invalid(lo, &head, &range, 0); spin_unlock(&inode->i_lock); pnfs_free_lseg_list(&head); dprintk("%s Setting layout IOMODE_%s fail bit\n", __func__, @@ -361,8 +361,10 @@ pnfs_layout_remove_lseg(struct pnfs_layout_hdr *lo, list_del_init(&lseg->pls_list); /* Matched by pnfs_get_layout_hdr in pnfs_layout_insert_lseg */ atomic_dec(&lo->plh_refcount); - if (list_empty(&lo->plh_segs)) + if (list_empty(&lo->plh_segs)) { + set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); clear_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags); + } rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq); } @@ -522,13 +524,35 @@ static int mark_lseg_invalid(struct pnfs_layout_segment *lseg, return rv; } -/* Returns count of number of matching invalid lsegs remaining in list - * after call. +/* + * Compare 2 layout stateid sequence ids, to see which is newer, + * taking into account wraparound issues. + */ +static bool pnfs_seqid_is_newer(u32 s1, u32 s2) +{ + return (s32)(s1 - s2) > 0; +} + +/** + * pnfs_mark_matching_lsegs_invalid - tear down lsegs or mark them for later + * @lo: layout header containing the lsegs + * @tmp_list: list head where doomed lsegs should go + * @recall_range: optional recall range argument to match (may be NULL) + * @seq: only invalidate lsegs obtained prior to this sequence (may be 0) + * + * Walk the list of lsegs in the layout header, and tear down any that should + * be destroyed. If "recall_range" is specified then the segment must match + * that range. If "seq" is non-zero, then only match segments that were handed + * out at or before that sequence. + * + * Returns number of matching invalid lsegs remaining in list after scanning + * it and purging them. 
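+ *
+ * The @seq test uses pnfs_seqid_is_newer(), a signed 32-bit
+ * difference, so it stays correct across seqid wraparound: e.g.
+ * pnfs_seqid_is_newer(1, 0xffffffff) is true.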
*/ int pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, struct list_head *tmp_list, - const struct pnfs_layout_range *recall_range) + const struct pnfs_layout_range *recall_range, + u32 seq) { struct pnfs_layout_segment *lseg, *next; int remaining = 0; @@ -540,10 +564,12 @@ pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) if (!recall_range || should_free_lseg(&lseg->pls_range, recall_range)) { - dprintk("%s: freeing lseg %p iomode %d " + if (seq && pnfs_seqid_is_newer(lseg->pls_seq, seq)) + continue; + dprintk("%s: freeing lseg %p iomode %d seq %u" "offset %llu length %llu\n", __func__, - lseg, lseg->pls_range.iomode, lseg->pls_range.offset, - lseg->pls_range.length); + lseg, lseg->pls_range.iomode, lseg->pls_seq, + lseg->pls_range.offset, lseg->pls_range.length); if (!mark_lseg_invalid(lseg, tmp_list)) remaining++; } @@ -730,15 +756,6 @@ pnfs_destroy_all_layouts(struct nfs_client *clp) pnfs_destroy_layouts_byclid(clp, false); } -/* - * Compare 2 layout stateid sequence ids, to see which is newer, - * taking into account wraparound issues. - */ -static bool pnfs_seqid_is_newer(u32 s1, u32 s2) -{ - return (s32)(s1 - s2) > 0; -} - /* update lo->plh_stateid with new if is more recent */ void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new, @@ -781,50 +798,22 @@ pnfs_layoutgets_blocked(const struct pnfs_layout_hdr *lo) test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags); } -int -pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo, - const struct pnfs_layout_range *range, - struct nfs4_state *open_state) -{ - int status = 0; - - dprintk("--> %s\n", __func__); - spin_lock(&lo->plh_inode->i_lock); - if (pnfs_layoutgets_blocked(lo)) { - status = -EAGAIN; - } else if (!nfs4_valid_open_stateid(open_state)) { - status = -EBADF; - } else if (list_empty(&lo->plh_segs) || - test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags)) { - int seq; - - do { - seq = read_seqbegin(&open_state->seqlock); - nfs4_stateid_copy(dst, &open_state->stateid); - } while (read_seqretry(&open_state->seqlock, seq)); - } else - nfs4_stateid_copy(dst, &lo->plh_stateid); - spin_unlock(&lo->plh_inode->i_lock); - dprintk("<-- %s\n", __func__); - return status; -} - /* -* Get layout from server. -* for now, assume that whole file layouts are requested. -* arg->offset: 0 -* arg->length: all ones -*/ + * Get layout from server. + * for now, assume that whole file layouts are requested. + * arg->offset: 0 + * arg->length: all ones + */ static struct pnfs_layout_segment * send_layoutget(struct pnfs_layout_hdr *lo, struct nfs_open_context *ctx, + nfs4_stateid *stateid, const struct pnfs_layout_range *range, - gfp_t gfp_flags) + long *timeout, gfp_t gfp_flags) { struct inode *ino = lo->plh_inode; struct nfs_server *server = NFS_SERVER(ino); struct nfs4_layoutget *lgp; - struct pnfs_layout_segment *lseg; loff_t i_size; dprintk("--> %s\n", __func__); @@ -834,40 +823,31 @@ send_layoutget(struct pnfs_layout_hdr *lo, * store in lseg. If we race with a concurrent seqid morphing * op, then re-send the LAYOUTGET. 
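 *
 * The re-send is driven by the caller: pnfs_update_layout() retries
 * on -EAGAIN, while send_layoutget() itself issues one LAYOUTGET and
 * returns any failure as an ERR_PTR.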
*/ - do { - lgp = kzalloc(sizeof(*lgp), gfp_flags); - if (lgp == NULL) - return NULL; - - i_size = i_size_read(ino); - - lgp->args.minlength = PAGE_SIZE; - if (lgp->args.minlength > range->length) - lgp->args.minlength = range->length; - if (range->iomode == IOMODE_READ) { - if (range->offset >= i_size) - lgp->args.minlength = 0; - else if (i_size - range->offset < lgp->args.minlength) - lgp->args.minlength = i_size - range->offset; - } - lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE; - pnfs_copy_range(&lgp->args.range, range); - lgp->args.type = server->pnfs_curr_ld->id; - lgp->args.inode = ino; - lgp->args.ctx = get_nfs_open_context(ctx); - lgp->gfp_flags = gfp_flags; - lgp->cred = lo->plh_lc_cred; - - lseg = nfs4_proc_layoutget(lgp, gfp_flags); - } while (lseg == ERR_PTR(-EAGAIN)); - - if (IS_ERR(lseg) && !nfs_error_is_fatal(PTR_ERR(lseg))) - lseg = NULL; - else - pnfs_layout_clear_fail_bit(lo, - pnfs_iomode_to_fail_bit(range->iomode)); + lgp = kzalloc(sizeof(*lgp), gfp_flags); + if (lgp == NULL) + return ERR_PTR(-ENOMEM); - return lseg; + i_size = i_size_read(ino); + + lgp->args.minlength = PAGE_SIZE; + if (lgp->args.minlength > range->length) + lgp->args.minlength = range->length; + if (range->iomode == IOMODE_READ) { + if (range->offset >= i_size) + lgp->args.minlength = 0; + else if (i_size - range->offset < lgp->args.minlength) + lgp->args.minlength = i_size - range->offset; + } + lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE; + pnfs_copy_range(&lgp->args.range, range); + lgp->args.type = server->pnfs_curr_ld->id; + lgp->args.inode = ino; + lgp->args.ctx = get_nfs_open_context(ctx); + nfs4_stateid_copy(&lgp->args.stateid, stateid); + lgp->gfp_flags = gfp_flags; + lgp->cred = lo->plh_lc_cred; + + return nfs4_proc_layoutget(lgp, timeout, gfp_flags); } static void pnfs_clear_layoutcommit(struct inode *inode, @@ -899,6 +879,7 @@ pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo) if (test_and_set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) return false; lo->plh_return_iomode = 0; + lo->plh_return_seq = 0; pnfs_get_layout_hdr(lo); clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags); return true; @@ -969,6 +950,7 @@ static void pnfs_layoutreturn_before_put_layout_hdr(struct pnfs_layout_hdr *lo) bool send; nfs4_stateid_copy(&stateid, &lo->plh_stateid); + stateid.seqid = cpu_to_be32(lo->plh_return_seq); iomode = lo->plh_return_iomode; send = pnfs_prepare_layoutreturn(lo); spin_unlock(&inode->i_lock); @@ -1012,7 +994,7 @@ _pnfs_return_layout(struct inode *ino) pnfs_get_layout_hdr(lo); empty = list_empty(&lo->plh_segs); pnfs_clear_layoutcommit(ino, &tmp_list); - pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL); + pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL, 0); if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) { struct pnfs_layout_range range = { @@ -1310,6 +1292,7 @@ alloc_init_layout_hdr(struct inode *ino, INIT_LIST_HEAD(&lo->plh_bulk_destroy); lo->plh_inode = ino; lo->plh_lc_cred = get_rpccred(ctx->cred); + lo->plh_flags |= 1 << NFS_LAYOUT_INVALID_STID; return lo; } @@ -1317,6 +1300,8 @@ static struct pnfs_layout_hdr * pnfs_find_alloc_layout(struct inode *ino, struct nfs_open_context *ctx, gfp_t gfp_flags) + __releases(&ino->i_lock) + __acquires(&ino->i_lock) { struct nfs_inode *nfsi = NFS_I(ino); struct pnfs_layout_hdr *new = NULL; @@ -1341,23 +1326,28 @@ out_existing: /* * iomode matching rules: - * iomode lseg match - * ----- ----- ----- - * ANY READ true - * ANY RW true - * RW READ false - * RW RW true - * READ READ true - * READ RW true + * iomode lseg strict match + * iomode + * 
----- ----- ------ ----- + * ANY READ N/A true + * ANY RW N/A true + * RW READ N/A false + * RW RW N/A true + * READ READ N/A true + * READ RW true false + * READ RW false true */ static bool pnfs_lseg_range_match(const struct pnfs_layout_range *ls_range, - const struct pnfs_layout_range *range) + const struct pnfs_layout_range *range, + bool strict_iomode) { struct pnfs_layout_range range1; if ((range->iomode == IOMODE_RW && ls_range->iomode != IOMODE_RW) || + (range->iomode != ls_range->iomode && + strict_iomode == true) || !pnfs_lseg_range_intersecting(ls_range, range)) return 0; @@ -1372,7 +1362,8 @@ pnfs_lseg_range_match(const struct pnfs_layout_range *ls_range, */ static struct pnfs_layout_segment * pnfs_find_lseg(struct pnfs_layout_hdr *lo, - struct pnfs_layout_range *range) + struct pnfs_layout_range *range, + bool strict_iomode) { struct pnfs_layout_segment *lseg, *ret = NULL; @@ -1381,7 +1372,8 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo, list_for_each_entry(lseg, &lo->plh_segs, pls_list) { if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) && !test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags) && - pnfs_lseg_range_match(&lseg->pls_range, range)) { + pnfs_lseg_range_match(&lseg->pls_range, range, + strict_iomode)) { ret = pnfs_get_lseg(lseg); break; } @@ -1498,6 +1490,7 @@ pnfs_update_layout(struct inode *ino, loff_t pos, u64 count, enum pnfs_iomode iomode, + bool strict_iomode, gfp_t gfp_flags) { struct pnfs_layout_range arg = { @@ -1505,27 +1498,30 @@ pnfs_update_layout(struct inode *ino, .offset = pos, .length = count, }; - unsigned pg_offset; + unsigned pg_offset, seq; struct nfs_server *server = NFS_SERVER(ino); struct nfs_client *clp = server->nfs_client; - struct pnfs_layout_hdr *lo; + struct pnfs_layout_hdr *lo = NULL; struct pnfs_layout_segment *lseg = NULL; + nfs4_stateid stateid; + long timeout = 0; + unsigned long giveup = jiffies + rpc_get_timeout(server->client); bool first; if (!pnfs_enabled_sb(NFS_SERVER(ino))) { - trace_pnfs_update_layout(ino, pos, count, iomode, NULL, + trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, PNFS_UPDATE_LAYOUT_NO_PNFS); goto out; } if (iomode == IOMODE_READ && i_size_read(ino) == 0) { - trace_pnfs_update_layout(ino, pos, count, iomode, NULL, + trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, PNFS_UPDATE_LAYOUT_RD_ZEROLEN); goto out; } if (pnfs_within_mdsthreshold(ctx, ino, iomode)) { - trace_pnfs_update_layout(ino, pos, count, iomode, NULL, + trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, PNFS_UPDATE_LAYOUT_MDSTHRESH); goto out; } @@ -1536,14 +1532,14 @@ lookup_again: lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags); if (lo == NULL) { spin_unlock(&ino->i_lock); - trace_pnfs_update_layout(ino, pos, count, iomode, NULL, + trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, PNFS_UPDATE_LAYOUT_NOMEM); goto out; } /* Do we even need to bother with this? 
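For orientation, the truth table above is the whole of the new matching rule. A stand-alone sketch of the same predicate in plain C (simplified enum values; IOMODE_ANY and the range-intersection test are elided) shows that strict_iomode only changes the READ-request-versus-RW-segment case:

#include <stdbool.h>
#include <stdio.h>

enum iomode { IOMODE_READ = 1, IOMODE_RW = 2 };

static bool iomode_matches(enum iomode lseg, enum iomode req, bool strict)
{
        /* an RW request can never be served by a READ-only segment */
        if (req == IOMODE_RW && lseg != IOMODE_RW)
                return false;
        /* strict matching additionally refuses a READ request against
         * an RW segment */
        if (strict && req != lseg)
                return false;
        return true;
}

int main(void)
{
        printf("%d %d\n",
               iomode_matches(IOMODE_RW, IOMODE_READ, false),   /* 1 */
               iomode_matches(IOMODE_RW, IOMODE_READ, true));   /* 0 */
        return 0;
}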
*/ if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) { - trace_pnfs_update_layout(ino, pos, count, iomode, lo, + trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, PNFS_UPDATE_LAYOUT_BULK_RECALL); dprintk("%s matches recall, use MDS\n", __func__); goto out_unlock; @@ -1551,14 +1547,33 @@ lookup_again: /* if LAYOUTGET already failed once we don't try again */ if (pnfs_layout_io_test_failed(lo, iomode)) { - trace_pnfs_update_layout(ino, pos, count, iomode, lo, + trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, PNFS_UPDATE_LAYOUT_IO_TEST_FAIL); goto out_unlock; } - first = list_empty(&lo->plh_segs); - if (first) { - /* The first layoutget for the file. Need to serialize per + lseg = pnfs_find_lseg(lo, &arg, strict_iomode); + if (lseg) { + trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, + PNFS_UPDATE_LAYOUT_FOUND_CACHED); + goto out_unlock; + } + + if (!nfs4_valid_open_stateid(ctx->state)) { + trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, + PNFS_UPDATE_LAYOUT_INVALID_OPEN); + goto out_unlock; + } + + /* + * Choose a stateid for the LAYOUTGET. If we don't have a layout + * stateid, or it has been invalidated, then we must use the open + * stateid. + */ + if (test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags)) { + + /* + * The first layoutget for the file. Need to serialize per * RFC 5661 Errata 3208. */ if (test_and_set_bit(NFS_LAYOUT_FIRST_LAYOUTGET, @@ -1567,18 +1582,17 @@ lookup_again: wait_on_bit(&lo->plh_flags, NFS_LAYOUT_FIRST_LAYOUTGET, TASK_UNINTERRUPTIBLE); pnfs_put_layout_hdr(lo); + dprintk("%s retrying\n", __func__); goto lookup_again; } + + first = true; + do { + seq = read_seqbegin(&ctx->state->seqlock); + nfs4_stateid_copy(&stateid, &ctx->state->stateid); + } while (read_seqretry(&ctx->state->seqlock, seq)); } else { - /* Check to see if the layout for the given range - * already exists - */ - lseg = pnfs_find_lseg(lo, &arg); - if (lseg) { - trace_pnfs_update_layout(ino, pos, count, iomode, lo, - PNFS_UPDATE_LAYOUT_FOUND_CACHED); - goto out_unlock; - } + nfs4_stateid_copy(&stateid, &lo->plh_stateid); } /* @@ -1593,15 +1607,17 @@ lookup_again: pnfs_clear_first_layoutget(lo); pnfs_put_layout_hdr(lo); dprintk("%s retrying\n", __func__); + trace_pnfs_update_layout(ino, pos, count, iomode, lo, + lseg, PNFS_UPDATE_LAYOUT_RETRY); goto lookup_again; } - trace_pnfs_update_layout(ino, pos, count, iomode, lo, + trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, PNFS_UPDATE_LAYOUT_RETURN); goto out_put_layout_hdr; } if (pnfs_layoutgets_blocked(lo)) { - trace_pnfs_update_layout(ino, pos, count, iomode, lo, + trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, PNFS_UPDATE_LAYOUT_BLOCKED); goto out_unlock; } @@ -1626,10 +1642,36 @@ lookup_again: if (arg.length != NFS4_MAX_UINT64) arg.length = PAGE_ALIGN(arg.length); - lseg = send_layoutget(lo, ctx, &arg, gfp_flags); - atomic_dec(&lo->plh_outstanding); - trace_pnfs_update_layout(ino, pos, count, iomode, lo, + lseg = send_layoutget(lo, ctx, &stateid, &arg, &timeout, gfp_flags); + trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET); + if (IS_ERR(lseg)) { + switch(PTR_ERR(lseg)) { + case -ERECALLCONFLICT: + if (time_after(jiffies, giveup)) + lseg = NULL; + /* Fallthrough */ + case -EAGAIN: + pnfs_put_layout_hdr(lo); + if (first) + pnfs_clear_first_layoutget(lo); + if (lseg) { + trace_pnfs_update_layout(ino, pos, count, + iomode, lo, lseg, PNFS_UPDATE_LAYOUT_RETRY); + goto lookup_again; + } + /* Fallthrough */ + default: + if 
(!nfs_error_is_fatal(PTR_ERR(lseg))) { + pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode)); + lseg = NULL; + } + } + } else { + pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode)); + } + + atomic_dec(&lo->plh_outstanding); out_put_layout_hdr: if (first) pnfs_clear_first_layoutget(lo); @@ -1678,38 +1720,36 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) struct pnfs_layout_segment *lseg; struct inode *ino = lo->plh_inode; LIST_HEAD(free_me); - int status = -EINVAL; if (!pnfs_sanity_check_layout_range(&res->range)) - goto out; + return ERR_PTR(-EINVAL); /* Inject layout blob into I/O device driver */ lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res, lgp->gfp_flags); - if (!lseg || IS_ERR(lseg)) { + if (IS_ERR_OR_NULL(lseg)) { if (!lseg) - status = -ENOMEM; - else - status = PTR_ERR(lseg); - dprintk("%s: Could not allocate layout: error %d\n", - __func__, status); - goto out; + lseg = ERR_PTR(-ENOMEM); + + dprintk("%s: Could not allocate layout: error %ld\n", + __func__, PTR_ERR(lseg)); + return lseg; } init_lseg(lo, lseg); lseg->pls_range = res->range; + lseg->pls_seq = be32_to_cpu(res->stateid.seqid); spin_lock(&ino->i_lock); if (pnfs_layoutgets_blocked(lo)) { dprintk("%s forget reply due to state\n", __func__); - goto out_forget_reply; + goto out_forget; } if (nfs4_stateid_match_other(&lo->plh_stateid, &res->stateid)) { /* existing state ID, make sure the sequence number matches. */ if (pnfs_layout_stateid_blocked(lo, &res->stateid)) { dprintk("%s forget reply due to sequence\n", __func__); - status = -EAGAIN; - goto out_forget_reply; + goto out_forget; } pnfs_set_layout_stateid(lo, &res->stateid, false); } else { @@ -1718,7 +1758,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) * inode invalid, and don't bother validating the stateid * sequence number. 
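The reworked send_layoutget() and pnfs_layout_process() paths above report failure through the kernel's errno-in-pointer convention rather than returning NULL or using an out-parameter. A userspace approximation of that idiom (MAX_ERRNO handling simplified; -12 is ENOMEM on Linux):

#include <stdio.h>

#define MAX_ERRNO       4095
#define ERR_PTR(err)    ((void *)(long)(err))
#define PTR_ERR(ptr)    ((long)(ptr))
#define IS_ERR(ptr)     ((unsigned long)(ptr) >= (unsigned long)-MAX_ERRNO)

int main(void)
{
        void *lseg = ERR_PTR(-12);      /* an allocation failure, say */

        if (IS_ERR(lseg))
                printf("error %ld\n", PTR_ERR(lseg));   /* error -12 */
        return 0;
}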
*/ - pnfs_mark_matching_lsegs_invalid(lo, &free_me, NULL); + pnfs_mark_matching_lsegs_invalid(lo, &free_me, NULL, 0); nfs4_stateid_copy(&lo->plh_stateid, &res->stateid); lo->plh_barrier = be32_to_cpu(res->stateid.seqid); @@ -1735,18 +1775,17 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) spin_unlock(&ino->i_lock); pnfs_free_lseg_list(&free_me); return lseg; -out: - return ERR_PTR(status); -out_forget_reply: +out_forget: spin_unlock(&ino->i_lock); lseg->pls_layout = lo; NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg); - goto out; + return ERR_PTR(-EAGAIN); } static void -pnfs_set_plh_return_iomode(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode) +pnfs_set_plh_return_info(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode, + u32 seq) { if (lo->plh_return_iomode == iomode) return; @@ -1754,6 +1793,8 @@ pnfs_set_plh_return_iomode(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode) iomode = IOMODE_ANY; lo->plh_return_iomode = iomode; set_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags); + if (!lo->plh_return_seq || pnfs_seqid_is_newer(seq, lo->plh_return_seq)) + lo->plh_return_seq = seq; } /** @@ -1769,7 +1810,8 @@ pnfs_set_plh_return_iomode(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode) int pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo, struct list_head *tmp_list, - const struct pnfs_layout_range *return_range) + const struct pnfs_layout_range *return_range, + u32 seq) { struct pnfs_layout_segment *lseg, *next; int remaining = 0; @@ -1792,8 +1834,11 @@ pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo, continue; remaining++; set_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags); - pnfs_set_plh_return_iomode(lo, return_range->iomode); } + + if (remaining) + pnfs_set_plh_return_info(lo, return_range->iomode, seq); + return remaining; } @@ -1810,13 +1855,14 @@ void pnfs_error_mark_layout_for_return(struct inode *inode, bool return_now = false; spin_lock(&inode->i_lock); - pnfs_set_plh_return_iomode(lo, range.iomode); + pnfs_set_plh_return_info(lo, range.iomode, lseg->pls_seq); /* * mark all matching lsegs so that we are sure to have no live * segments at hand when sending layoutreturn. See pnfs_put_lseg() * for how it works. */ - if (!pnfs_mark_matching_lsegs_return(lo, &free_me, &range)) { + if (!pnfs_mark_matching_lsegs_return(lo, &free_me, + &range, lseg->pls_seq)) { nfs4_stateid stateid; enum pnfs_iomode iomode = lo->plh_return_iomode; @@ -1849,6 +1895,7 @@ pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *r req_offset(req), rd_size, IOMODE_READ, + false, GFP_KERNEL); if (IS_ERR(pgio->pg_lseg)) { pgio->pg_error = PTR_ERR(pgio->pg_lseg); @@ -1873,6 +1920,7 @@ pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, req_offset(req), wb_size, IOMODE_RW, + false, GFP_NOFS); if (IS_ERR(pgio->pg_lseg)) { pgio->pg_error = PTR_ERR(pgio->pg_lseg); @@ -2143,12 +2191,15 @@ pnfs_try_to_read_data(struct nfs_pgio_header *hdr, } /* Resend all requests through pnfs. 
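The next hunk makes pnfs_read_resend_pnfs() idempotent by guarding the resend with test_and_set_bit(NFS_IOHDR_REDO, ...). A minimal userspace analogue of that one-shot pattern, using a C11 atomic flag in place of the kernel's bitops:

#include <stdatomic.h>
#include <stdio.h>

static atomic_flag redo_done = ATOMIC_FLAG_INIT;

static void maybe_resend(void)
{
        /* only the caller that flips the flag from clear to set resends */
        if (!atomic_flag_test_and_set(&redo_done))
                printf("resending through pnfs\n");
}

int main(void)
{
        maybe_resend();         /* performs the resend */
        maybe_resend();         /* silently skips it */
        return 0;
}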
*/ -int pnfs_read_resend_pnfs(struct nfs_pgio_header *hdr) +void pnfs_read_resend_pnfs(struct nfs_pgio_header *hdr) { struct nfs_pageio_descriptor pgio; - nfs_pageio_init_read(&pgio, hdr->inode, false, hdr->completion_ops); - return nfs_pageio_resend(&pgio, hdr); + if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { + nfs_pageio_init_read(&pgio, hdr->inode, false, + hdr->completion_ops); + hdr->task.tk_status = nfs_pageio_resend(&pgio, hdr); + } } EXPORT_SYMBOL_GPL(pnfs_read_resend_pnfs); @@ -2158,12 +2209,11 @@ pnfs_do_read(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr) const struct rpc_call_ops *call_ops = desc->pg_rpc_callops; struct pnfs_layout_segment *lseg = desc->pg_lseg; enum pnfs_try_status trypnfs; - int err = 0; trypnfs = pnfs_try_to_read_data(hdr, call_ops, lseg); if (trypnfs == PNFS_TRY_AGAIN) - err = pnfs_read_resend_pnfs(hdr); - if (trypnfs == PNFS_NOT_ATTEMPTED || err) + pnfs_read_resend_pnfs(hdr); + if (trypnfs == PNFS_NOT_ATTEMPTED || hdr->task.tk_status) pnfs_read_through_mds(desc, hdr); } @@ -2405,7 +2455,7 @@ pnfs_report_layoutstat(struct inode *inode, gfp_t gfp_flags) spin_lock(&inode->i_lock); if (!NFS_I(inode)->layout) { spin_unlock(&inode->i_lock); - goto out; + goto out_clear_layoutstats; } hdr = NFS_I(inode)->layout; pnfs_get_layout_hdr(hdr); @@ -2434,6 +2484,7 @@ out_free: kfree(data); out_put: pnfs_put_layout_hdr(hdr); +out_clear_layoutstats: smp_mb__before_atomic(); clear_bit(NFS_INO_LAYOUTSTATS, &nfsi->flags); smp_mb__after_atomic(); diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 1ac1db5f6dad..b21bd0bee784 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -64,6 +64,7 @@ struct pnfs_layout_segment { struct list_head pls_lc_list; struct pnfs_layout_range pls_range; atomic_t pls_refcount; + u32 pls_seq; unsigned long pls_flags; struct pnfs_layout_hdr *pls_layout; struct work_struct pls_work; @@ -194,6 +195,7 @@ struct pnfs_layout_hdr { unsigned long plh_flags; nfs4_stateid plh_stateid; u32 plh_barrier; /* ignore lower seqids */ + u32 plh_return_seq; enum pnfs_iomode plh_return_iomode; loff_t plh_lwb; /* last write byte for layoutcommit */ struct rpc_cred *plh_lc_cred; /* layoutcommit cred */ @@ -226,7 +228,7 @@ extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *); extern int nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *dev, struct rpc_cred *cred); -extern struct pnfs_layout_segment* nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags); +extern struct pnfs_layout_segment* nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout, gfp_t gfp_flags); extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync); /* pnfs.c */ @@ -258,16 +260,14 @@ void pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo); void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new, bool update_barrier); -int pnfs_choose_layoutget_stateid(nfs4_stateid *dst, - struct pnfs_layout_hdr *lo, - const struct pnfs_layout_range *range, - struct nfs4_state *open_state); int pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, struct list_head *tmp_list, - const struct pnfs_layout_range *recall_range); + const struct pnfs_layout_range *recall_range, + u32 seq); int pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo, struct list_head *tmp_list, - const struct pnfs_layout_range *recall_range); + const struct pnfs_layout_range *recall_range, + u32 seq); bool pnfs_roc(struct inode *ino); void pnfs_roc_release(struct inode *ino); void pnfs_roc_set_barrier(struct inode 
*ino, u32 barrier); @@ -282,12 +282,13 @@ int _pnfs_return_layout(struct inode *); int pnfs_commit_and_return_layout(struct inode *); void pnfs_ld_write_done(struct nfs_pgio_header *); void pnfs_ld_read_done(struct nfs_pgio_header *); -int pnfs_read_resend_pnfs(struct nfs_pgio_header *); +void pnfs_read_resend_pnfs(struct nfs_pgio_header *); struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, loff_t pos, u64 count, enum pnfs_iomode iomode, + bool strict_iomode, gfp_t gfp_flags); void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo); diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index 4aaed890048f..b38e3c0dc790 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -61,7 +61,7 @@ EXPORT_SYMBOL_GPL(pnfs_generic_commit_release); /* The generic layer is about to remove the req from the commit list. * If this will make the bucket empty, it will need to put the lseg reference. - * Note this must be called holding the inode (/cinfo) lock + * Note this must be called holding i_lock */ void pnfs_generic_clear_request_commit(struct nfs_page *req, @@ -98,7 +98,7 @@ pnfs_generic_transfer_commit_list(struct list_head *src, struct list_head *dst, if (!nfs_lock_request(req)) continue; kref_get(&req->wb_kref); - if (cond_resched_lock(cinfo->lock)) + if (cond_resched_lock(&cinfo->inode->i_lock)) list_safe_reset_next(req, tmp, wb_list); nfs_request_remove_commit_list(req, cinfo); clear_bit(PG_COMMIT_TO_DS, &req->wb_flags); @@ -119,7 +119,7 @@ pnfs_generic_scan_ds_commit_list(struct pnfs_commit_bucket *bucket, struct list_head *dst = &bucket->committing; int ret; - lockdep_assert_held(cinfo->lock); + lockdep_assert_held(&cinfo->inode->i_lock); ret = pnfs_generic_transfer_commit_list(src, dst, cinfo, max); if (ret) { cinfo->ds->nwritten -= ret; @@ -142,7 +142,7 @@ int pnfs_generic_scan_commit_lists(struct nfs_commit_info *cinfo, { int i, rv = 0, cnt; - lockdep_assert_held(cinfo->lock); + lockdep_assert_held(&cinfo->inode->i_lock); for (i = 0; i < cinfo->ds->nbuckets && max != 0; i++) { cnt = pnfs_generic_scan_ds_commit_list(&cinfo->ds->buckets[i], cinfo, max); @@ -161,16 +161,16 @@ void pnfs_generic_recover_commit_reqs(struct list_head *dst, struct pnfs_layout_segment *freeme; int i; - lockdep_assert_held(cinfo->lock); + lockdep_assert_held(&cinfo->inode->i_lock); restart: for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) { if (pnfs_generic_transfer_commit_list(&b->written, dst, cinfo, 0)) { freeme = b->wlseg; b->wlseg = NULL; - spin_unlock(cinfo->lock); + spin_unlock(&cinfo->inode->i_lock); pnfs_put_lseg(freeme); - spin_lock(cinfo->lock); + spin_lock(&cinfo->inode->i_lock); goto restart; } } @@ -186,7 +186,7 @@ static void pnfs_generic_retry_commit(struct nfs_commit_info *cinfo, int idx) LIST_HEAD(pages); int i; - spin_lock(cinfo->lock); + spin_lock(&cinfo->inode->i_lock); for (i = idx; i < fl_cinfo->nbuckets; i++) { bucket = &fl_cinfo->buckets[i]; if (list_empty(&bucket->committing)) @@ -194,12 +194,12 @@ static void pnfs_generic_retry_commit(struct nfs_commit_info *cinfo, int idx) freeme = bucket->clseg; bucket->clseg = NULL; list_splice_init(&bucket->committing, &pages); - spin_unlock(cinfo->lock); + spin_unlock(&cinfo->inode->i_lock); nfs_retry_commit(&pages, freeme, cinfo, i); pnfs_put_lseg(freeme); - spin_lock(cinfo->lock); + spin_lock(&cinfo->inode->i_lock); } - spin_unlock(cinfo->lock); + spin_unlock(&cinfo->inode->i_lock); } static unsigned int @@ -238,14 +238,39 @@ void pnfs_fetch_commit_bucket_list(struct list_head 
*pages, struct pnfs_commit_bucket *bucket; bucket = &cinfo->ds->buckets[data->ds_commit_index]; - spin_lock(cinfo->lock); + spin_lock(&cinfo->inode->i_lock); list_splice_init(&bucket->committing, pages); data->lseg = bucket->clseg; bucket->clseg = NULL; - spin_unlock(cinfo->lock); + spin_unlock(&cinfo->inode->i_lock); } +/* Helper function for pnfs_generic_commit_pagelist to catch an empty + * page list. This can happen when two commits race. + * + * This must be called instead of nfs_init_commit - call one or the other, but + * not both! + */ +static bool +pnfs_generic_commit_cancel_empty_pagelist(struct list_head *pages, + struct nfs_commit_data *data, + struct nfs_commit_info *cinfo) +{ + if (list_empty(pages)) { + if (atomic_dec_and_test(&cinfo->mds->rpcs_out)) + wake_up_atomic_t(&cinfo->mds->rpcs_out); + /* don't call nfs_commitdata_release - it tries to put + * the open_context which is not acquired until nfs_init_commit + * which has not been called on @data */ + WARN_ON_ONCE(data->context); + nfs_commit_free(data); + return true; + } + + return false; +} + /* This follows nfs_commit_list pretty closely */ int pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages, @@ -280,6 +305,11 @@ pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages, list_for_each_entry_safe(data, tmp, &list, pages) { list_del_init(&data->pages); if (data->ds_commit_index < 0) { + /* another commit raced with us */ + if (pnfs_generic_commit_cancel_empty_pagelist(mds_pages, + data, cinfo)) + continue; + nfs_init_commit(data, mds_pages, NULL, cinfo); nfs_initiate_commit(NFS_CLIENT(inode), data, NFS_PROTO(data->inode), @@ -288,6 +318,12 @@ pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages, LIST_HEAD(pages); pnfs_fetch_commit_bucket_list(&pages, data, cinfo); + + /* another commit raced with us */ + if (pnfs_generic_commit_cancel_empty_pagelist(&pages, + data, cinfo)) + continue; + nfs_init_commit(data, &pages, data->lseg, cinfo); initiate_commit(data, how); } @@ -874,12 +910,12 @@ pnfs_layout_mark_request_commit(struct nfs_page *req, struct list_head *list; struct pnfs_commit_bucket *buckets; - spin_lock(cinfo->lock); + spin_lock(&cinfo->inode->i_lock); buckets = cinfo->ds->buckets; list = &buckets[ds_commit_idx].written; if (list_empty(list)) { if (!pnfs_is_valid_lseg(lseg)) { - spin_unlock(cinfo->lock); + spin_unlock(&cinfo->inode->i_lock); cinfo->completion_ops->resched_write(cinfo, req); return; } @@ -896,7 +932,7 @@ pnfs_layout_mark_request_commit(struct nfs_page *req, cinfo->ds->nwritten++; nfs_request_add_commit_list_locked(req, list, cinfo); - spin_unlock(cinfo->lock); + spin_unlock(&cinfo->inode->i_lock); nfs_mark_page_unstable(req->wb_page, cinfo); } EXPORT_SYMBOL_GPL(pnfs_layout_mark_request_commit); diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 6776d7a7839e..572e5b3b06f1 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -367,13 +367,13 @@ readpage_async_filler(void *data, struct page *page) nfs_list_remove_request(new); nfs_readpage_release(new); error = desc->pgio->pg_error; - goto out_unlock; + goto out; } return 0; out_error: error = PTR_ERR(new); -out_unlock: unlock_page(page); +out: return error; } diff --git a/fs/nfs/super.c b/fs/nfs/super.c index f1268280244e..2137e0202f25 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -191,6 +191,7 @@ static const match_table_t nfs_mount_option_tokens = { enum { Opt_xprt_udp, Opt_xprt_udp6, Opt_xprt_tcp, Opt_xprt_tcp6, Opt_xprt_rdma, + Opt_xprt_rdma6, Opt_xprt_err }; @@ -201,6 
+202,7 @@ static const match_table_t nfs_xprt_protocol_tokens = { { Opt_xprt_tcp, "tcp" }, { Opt_xprt_tcp6, "tcp6" }, { Opt_xprt_rdma, "rdma" }, + { Opt_xprt_rdma6, "rdma6" }, { Opt_xprt_err, NULL } }; @@ -1456,6 +1458,8 @@ static int nfs_parse_mount_options(char *raw, mnt->flags |= NFS_MOUNT_TCP; mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP; break; + case Opt_xprt_rdma6: + protofamily = AF_INET6; case Opt_xprt_rdma: /* vector side protocols to TCP */ mnt->flags |= NFS_MOUNT_TCP; @@ -2408,6 +2412,11 @@ static int nfs_compare_super_address(struct nfs_server *server1, struct nfs_server *server2) { struct sockaddr *sap1, *sap2; + struct rpc_xprt *xprt1 = server1->client->cl_xprt; + struct rpc_xprt *xprt2 = server2->client->cl_xprt; + + if (!net_eq(xprt1->xprt_net, xprt2->xprt_net)) + return 0; sap1 = (struct sockaddr *)&server1->nfs_client->cl_addr; sap2 = (struct sockaddr *)&server2->nfs_client->cl_addr; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 5f4fd53e5764..e1c74d3db64d 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -245,8 +245,7 @@ static void nfs_mark_uptodate(struct nfs_page *req) static int wb_priority(struct writeback_control *wbc) { int ret = 0; - if (wbc->for_reclaim) - return FLUSH_HIGHPRI | FLUSH_COND_STABLE; + if (wbc->sync_mode == WB_SYNC_ALL) ret = FLUSH_COND_STABLE; return ret; @@ -737,7 +736,7 @@ static void nfs_inode_remove_request(struct nfs_page *req) head = req->wb_head; spin_lock(&inode->i_lock); - if (likely(!PageSwapCache(head->wb_page))) { + if (likely(head->wb_page && !PageSwapCache(head->wb_page))) { set_page_private(head->wb_page, 0); ClearPagePrivate(head->wb_page); smp_mb__after_atomic(); @@ -759,7 +758,8 @@ static void nfs_inode_remove_request(struct nfs_page *req) static void nfs_mark_request_dirty(struct nfs_page *req) { - __set_page_dirty_nobuffers(req->wb_page); + if (req->wb_page) + __set_page_dirty_nobuffers(req->wb_page); } /* @@ -804,7 +804,7 @@ nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi, * number of outstanding requests requiring a commit as well as * the MM page stats. * - * The caller must hold the cinfo->lock, and the nfs_page lock. + * The caller must hold cinfo->inode->i_lock, and the nfs_page lock. 
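That updated locking comment states the contract these hunks rely on: helpers with a _locked suffix assume the caller already holds inode->i_lock. A small pthread sketch of the pairing, with hypothetical names and a mutex standing in for the spinlock:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t i_lock = PTHREAD_MUTEX_INITIALIZER;
static int ncommit;

/* caller must hold i_lock -- the _locked suffix marks that contract */
static void add_commit_locked(void)
{
        ncommit++;
}

static void add_commit(void)
{
        pthread_mutex_lock(&i_lock);
        add_commit_locked();
        pthread_mutex_unlock(&i_lock);
}

int main(void)
{
        add_commit();
        printf("ncommit = %d\n", ncommit);      /* ncommit = 1 */
        return 0;
}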
*/ void nfs_request_add_commit_list_locked(struct nfs_page *req, struct list_head *dst, @@ -832,10 +832,11 @@ EXPORT_SYMBOL_GPL(nfs_request_add_commit_list_locked); void nfs_request_add_commit_list(struct nfs_page *req, struct nfs_commit_info *cinfo) { - spin_lock(cinfo->lock); + spin_lock(&cinfo->inode->i_lock); nfs_request_add_commit_list_locked(req, &cinfo->mds->list, cinfo); - spin_unlock(cinfo->lock); - nfs_mark_page_unstable(req->wb_page, cinfo); + spin_unlock(&cinfo->inode->i_lock); + if (req->wb_page) + nfs_mark_page_unstable(req->wb_page, cinfo); } EXPORT_SYMBOL_GPL(nfs_request_add_commit_list); @@ -864,7 +865,7 @@ EXPORT_SYMBOL_GPL(nfs_request_remove_commit_list); static void nfs_init_cinfo_from_inode(struct nfs_commit_info *cinfo, struct inode *inode) { - cinfo->lock = &inode->i_lock; + cinfo->inode = inode; cinfo->mds = &NFS_I(inode)->commit_info; cinfo->ds = pnfs_get_ds_info(inode); cinfo->dreq = NULL; @@ -967,7 +968,7 @@ nfs_reqs_to_commit(struct nfs_commit_info *cinfo) return cinfo->mds->ncommit; } -/* cinfo->lock held by caller */ +/* cinfo->inode->i_lock held by caller */ int nfs_scan_commit_list(struct list_head *src, struct list_head *dst, struct nfs_commit_info *cinfo, int max) @@ -979,7 +980,7 @@ nfs_scan_commit_list(struct list_head *src, struct list_head *dst, if (!nfs_lock_request(req)) continue; kref_get(&req->wb_kref); - if (cond_resched_lock(cinfo->lock)) + if (cond_resched_lock(&cinfo->inode->i_lock)) list_safe_reset_next(req, tmp, wb_list); nfs_request_remove_commit_list(req, cinfo); nfs_list_add_request(req, dst); @@ -1005,7 +1006,7 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst, { int ret = 0; - spin_lock(cinfo->lock); + spin_lock(&cinfo->inode->i_lock); if (cinfo->mds->ncommit > 0) { const int max = INT_MAX; @@ -1013,7 +1014,7 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst, cinfo, max); ret += pnfs_scan_commit_lists(inode, cinfo, max - ret); } - spin_unlock(cinfo->lock); + spin_unlock(&cinfo->inode->i_lock); return ret; } @@ -1709,6 +1710,10 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how, { struct nfs_commit_data *data; + /* another commit raced with us */ + if (list_empty(head)) + return 0; + data = nfs_commitdata_alloc(); if (!data) @@ -1724,6 +1729,36 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how, return -ENOMEM; } +int nfs_commit_file(struct file *file, struct nfs_write_verifier *verf) +{ + struct inode *inode = file_inode(file); + struct nfs_open_context *open; + struct nfs_commit_info cinfo; + struct nfs_page *req; + int ret; + + open = get_nfs_open_context(nfs_file_open_context(file)); + req = nfs_create_request(open, NULL, NULL, 0, i_size_read(inode)); + if (IS_ERR(req)) { + ret = PTR_ERR(req); + goto out_put; + } + + nfs_init_cinfo_from_inode(&cinfo, inode); + + memcpy(&req->wb_verf, verf, sizeof(struct nfs_write_verifier)); + nfs_request_add_commit_list(req, &cinfo); + ret = nfs_commit_inode(inode, FLUSH_SYNC); + if (ret > 0) + ret = 0; + + nfs_free_request(req); +out_put: + put_nfs_open_context(open); + return ret; +} +EXPORT_SYMBOL_GPL(nfs_commit_file); + /* * COMMIT call returned */ @@ -1748,7 +1783,8 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data) while (!list_empty(&data->pages)) { req = nfs_list_entry(data->pages.next); nfs_list_remove_request(req); - nfs_clear_page_commit(req->wb_page); + if (req->wb_page) + nfs_clear_page_commit(req->wb_page); dprintk("NFS: commit (%s/%llu %d@%lld)", req->wb_context->dentry->d_sb->s_id, diff --git 
a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c index e55b5242614d..ad2c05e80a83 100644 --- a/fs/nfsd/blocklayout.c +++ b/fs/nfsd/blocklayout.c @@ -2,6 +2,7 @@ * Copyright (c) 2014-2016 Christoph Hellwig. */ #include <linux/exportfs.h> +#include <linux/iomap.h> #include <linux/genhd.h> #include <linux/slab.h> #include <linux/pr.h> @@ -290,7 +291,7 @@ out_free_buf: return error; } -#define NFSD_MDS_PR_KEY 0x0100000000000000 +#define NFSD_MDS_PR_KEY 0x0100000000000000ULL /* * We use the client ID as a unique key for the reservations. diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c index 6c3b316f932e..4ebaaf4b8d8a 100644 --- a/fs/nfsd/blocklayoutxdr.c +++ b/fs/nfsd/blocklayoutxdr.c @@ -3,6 +3,7 @@ */ #include <linux/sunrpc/svc.h> #include <linux/exportfs.h> +#include <linux/iomap.h> #include <linux/nfs4.h> #include "nfsd.h" diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c index 1580ea6fd64d..d08cd88155c7 100644 --- a/fs/nfsd/nfs2acl.c +++ b/fs/nfsd/nfs2acl.c @@ -104,22 +104,21 @@ static __be32 nfsacld_proc_setacl(struct svc_rqst * rqstp, goto out; inode = d_inode(fh->fh_dentry); - if (!IS_POSIXACL(inode) || !inode->i_op->set_acl) { - error = -EOPNOTSUPP; - goto out_errno; - } error = fh_want_write(fh); if (error) goto out_errno; - error = inode->i_op->set_acl(inode, argp->acl_access, ACL_TYPE_ACCESS); + fh_lock(fh); + + error = set_posix_acl(inode, ACL_TYPE_ACCESS, argp->acl_access); if (error) - goto out_drop_write; - error = inode->i_op->set_acl(inode, argp->acl_default, - ACL_TYPE_DEFAULT); + goto out_drop_lock; + error = set_posix_acl(inode, ACL_TYPE_DEFAULT, argp->acl_default); if (error) - goto out_drop_write; + goto out_drop_lock; + + fh_unlock(fh); fh_drop_write(fh); @@ -131,7 +130,8 @@ out: posix_acl_release(argp->acl_access); posix_acl_release(argp->acl_default); return nfserr; -out_drop_write: +out_drop_lock: + fh_unlock(fh); fh_drop_write(fh); out_errno: nfserr = nfserrno(error); diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c index 01df4cd7c753..0c890347cde3 100644 --- a/fs/nfsd/nfs3acl.c +++ b/fs/nfsd/nfs3acl.c @@ -95,22 +95,20 @@ static __be32 nfsd3_proc_setacl(struct svc_rqst * rqstp, goto out; inode = d_inode(fh->fh_dentry); - if (!IS_POSIXACL(inode) || !inode->i_op->set_acl) { - error = -EOPNOTSUPP; - goto out_errno; - } error = fh_want_write(fh); if (error) goto out_errno; - error = inode->i_op->set_acl(inode, argp->acl_access, ACL_TYPE_ACCESS); + fh_lock(fh); + + error = set_posix_acl(inode, ACL_TYPE_ACCESS, argp->acl_access); if (error) - goto out_drop_write; - error = inode->i_op->set_acl(inode, argp->acl_default, - ACL_TYPE_DEFAULT); + goto out_drop_lock; + error = set_posix_acl(inode, ACL_TYPE_DEFAULT, argp->acl_default); -out_drop_write: +out_drop_lock: + fh_unlock(fh); fh_drop_write(fh); out_errno: nfserr = nfserrno(error); diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 93d5853f8c99..dba2ff8eaa68 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -379,7 +379,7 @@ nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p, */ hdr = (void*)p - rqstp->rq_arg.head[0].iov_base; dlen = rqstp->rq_arg.head[0].iov_len + rqstp->rq_arg.page_len - - hdr; + + rqstp->rq_arg.tail[0].iov_len - hdr; /* * Round the length of the data which was specified up to * the next multiple of XDR units and then compare that diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c index 6adabd6049b7..71292a0d6f09 100644 --- a/fs/nfsd/nfs4acl.c +++ b/fs/nfsd/nfs4acl.c @@ -770,9 +770,6 @@ nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp, dentry = 
fhp->fh_dentry; inode = d_inode(dentry); - if (!inode->i_op->set_acl || !IS_POSIXACL(inode)) - return nfserr_attrnotsupp; - if (S_ISDIR(inode->i_mode)) flags = NFS4_ACL_DIR; @@ -782,16 +779,19 @@ nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp, if (host_error < 0) goto out_nfserr; - host_error = inode->i_op->set_acl(inode, pacl, ACL_TYPE_ACCESS); + fh_lock(fhp); + + host_error = set_posix_acl(inode, ACL_TYPE_ACCESS, pacl); if (host_error < 0) - goto out_release; + goto out_drop_lock; if (S_ISDIR(inode->i_mode)) { - host_error = inode->i_op->set_acl(inode, dpacl, - ACL_TYPE_DEFAULT); + host_error = set_posix_acl(inode, ACL_TYPE_DEFAULT, dpacl); } -out_release: +out_drop_lock: + fh_unlock(fhp); + posix_acl_release(pacl); posix_acl_release(dpacl); out_nfserr: diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 7389cb1d7409..04c68d900324 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -710,22 +710,6 @@ static struct rpc_cred *get_backchannel_cred(struct nfs4_client *clp, struct rpc } } -static struct rpc_clnt *create_backchannel_client(struct rpc_create_args *args) -{ - struct rpc_xprt *xprt; - - if (args->protocol != XPRT_TRANSPORT_BC_TCP) - return rpc_create(args); - - xprt = args->bc_xprt->xpt_bc_xprt; - if (xprt) { - xprt_get(xprt); - return rpc_create_xprt(args, xprt); - } - - return rpc_create(args); -} - static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn, struct nfsd4_session *ses) { int maxtime = max_cb_time(clp->net); @@ -768,7 +752,7 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c args.authflavor = ses->se_cb_sec.flavor; } /* Create RPC client */ - client = create_backchannel_client(&args); + client = rpc_create(&args); if (IS_ERR(client)) { dprintk("NFSD: couldn't create callback client: %ld\n", PTR_ERR(client)); diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c index 825c7bc8d789..953c0755cb37 100644 --- a/fs/nfsd/nfs4layouts.c +++ b/fs/nfsd/nfs4layouts.c @@ -289,7 +289,7 @@ nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp, status = nfserr_bad_stateid; mutex_lock(&ls->ls_mutex); - if (stateid->si_generation > stid->sc_stateid.si_generation) + if (nfsd4_stateid_generation_after(stateid, &stid->sc_stateid)) goto out_unlock_stid; if (layout_type != ls->ls_layout_type) goto out_unlock_stid; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 0462eeddfff9..70d0b9b33031 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -3480,12 +3480,17 @@ alloc_init_open_stateowner(unsigned int strhashval, struct nfsd4_open *open, } static struct nfs4_ol_stateid * -init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp, - struct nfsd4_open *open) +init_open_stateid(struct nfs4_file *fp, struct nfsd4_open *open) { struct nfs4_openowner *oo = open->op_openowner; struct nfs4_ol_stateid *retstp = NULL; + struct nfs4_ol_stateid *stp; + + stp = open->op_stp; + /* We are moving these outside of the spinlocks to avoid the warnings */ + mutex_init(&stp->st_mutex); + mutex_lock(&stp->st_mutex); spin_lock(&oo->oo_owner.so_client->cl_lock); spin_lock(&fp->fi_lock); @@ -3493,6 +3498,8 @@ init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp, retstp = nfsd4_find_existing_open(fp, open); if (retstp) goto out_unlock; + + open->op_stp = NULL; atomic_inc(&stp->st_stid.sc_count); stp->st_stid.sc_type = NFS4_OPEN_STID; INIT_LIST_HEAD(&stp->st_locks); @@ -3502,14 +3509,19 @@ init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp, 
stp->st_access_bmap = 0; stp->st_deny_bmap = 0; stp->st_openstp = NULL; - init_rwsem(&stp->st_rwsem); list_add(&stp->st_perstateowner, &oo->oo_owner.so_stateids); list_add(&stp->st_perfile, &fp->fi_stateids); out_unlock: spin_unlock(&fp->fi_lock); spin_unlock(&oo->oo_owner.so_client->cl_lock); - return retstp; + if (retstp) { + mutex_lock(&retstp->st_mutex); + /* To keep mutex tracking happy */ + mutex_unlock(&stp->st_mutex); + stp = retstp; + } + return stp; } /* @@ -4305,7 +4317,6 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf struct nfs4_client *cl = open->op_openowner->oo_owner.so_client; struct nfs4_file *fp = NULL; struct nfs4_ol_stateid *stp = NULL; - struct nfs4_ol_stateid *swapstp = NULL; struct nfs4_delegation *dp = NULL; __be32 status; @@ -4335,32 +4346,28 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf */ if (stp) { /* Stateid was found, this is an OPEN upgrade */ - down_read(&stp->st_rwsem); + mutex_lock(&stp->st_mutex); status = nfs4_upgrade_open(rqstp, fp, current_fh, stp, open); if (status) { - up_read(&stp->st_rwsem); + mutex_unlock(&stp->st_mutex); goto out; } } else { - stp = open->op_stp; - open->op_stp = NULL; - swapstp = init_open_stateid(stp, fp, open); - if (swapstp) { - nfs4_put_stid(&stp->st_stid); - stp = swapstp; - down_read(&stp->st_rwsem); + /* stp is returned locked. */ + stp = init_open_stateid(fp, open); + /* See if we lost the race to some other thread */ + if (stp->st_access_bmap != 0) { status = nfs4_upgrade_open(rqstp, fp, current_fh, stp, open); if (status) { - up_read(&stp->st_rwsem); + mutex_unlock(&stp->st_mutex); goto out; } goto upgrade_out; } - down_read(&stp->st_rwsem); status = nfs4_get_vfs_file(rqstp, fp, current_fh, stp, open); if (status) { - up_read(&stp->st_rwsem); + mutex_unlock(&stp->st_mutex); release_open_stateid(stp); goto out; } @@ -4372,7 +4379,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf } upgrade_out: nfs4_inc_and_copy_stateid(&open->op_stateid, &stp->st_stid); - up_read(&stp->st_rwsem); + mutex_unlock(&stp->st_mutex); if (nfsd4_has_session(&resp->cstate)) { if (open->op_deleg_want & NFS4_SHARE_WANT_NO_DELEG) { @@ -4651,12 +4658,6 @@ grace_disallows_io(struct net *net, struct inode *inode) return opens_in_grace(net) && mandatory_lock(inode); } -/* Returns true iff a is later than b: */ -static bool stateid_generation_after(stateid_t *a, stateid_t *b) -{ - return (s32)(a->si_generation - b->si_generation) > 0; -} - static __be32 check_stateid_generation(stateid_t *in, stateid_t *ref, bool has_session) { /* @@ -4670,7 +4671,7 @@ static __be32 check_stateid_generation(stateid_t *in, stateid_t *ref, bool has_s return nfs_ok; /* If the client sends us a stateid from the future, it's buggy: */ - if (stateid_generation_after(in, ref)) + if (nfsd4_stateid_generation_after(in, ref)) return nfserr_bad_stateid; /* * However, we could see a stateid from the past, even from a @@ -4983,12 +4984,12 @@ static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_ * revoked delegations are kept only for free_stateid. 
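The open-coded generation test deleted here reappears in state.h as nfsd4_stateid_generation_after() (see the state.h hunk below). The comparison is wraparound-safe: the unsigned subtraction is reinterpreted as signed, so a small seqid still reads as later than one just below the 32-bit wrap point. A stand-alone demonstration:

#include <stdint.h>
#include <stdio.h>

static int later_than(uint32_t a, uint32_t b)
{
        return (int32_t)(a - b) > 0;
}

int main(void)
{
        printf("%d\n", later_than(2, 1));               /* 1: plainly later */
        printf("%d\n", later_than(5, UINT32_MAX));      /* 1: later across the wrap */
        printf("%d\n", later_than(1, 2));               /* 0 */
        return 0;
}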
*/ return nfserr_bad_stateid; - down_write(&stp->st_rwsem); + mutex_lock(&stp->st_mutex); status = check_stateid_generation(stateid, &stp->st_stid.sc_stateid, nfsd4_has_session(cstate)); if (status == nfs_ok) status = nfs4_check_fh(current_fh, &stp->st_stid); if (status != nfs_ok) - up_write(&stp->st_rwsem); + mutex_unlock(&stp->st_mutex); return status; } @@ -5036,7 +5037,7 @@ static __be32 nfs4_preprocess_confirmed_seqid_op(struct nfsd4_compound_state *cs return status; oo = openowner(stp->st_stateowner); if (!(oo->oo_flags & NFS4_OO_CONFIRMED)) { - up_write(&stp->st_rwsem); + mutex_unlock(&stp->st_mutex); nfs4_put_stid(&stp->st_stid); return nfserr_bad_stateid; } @@ -5068,12 +5069,12 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, oo = openowner(stp->st_stateowner); status = nfserr_bad_stateid; if (oo->oo_flags & NFS4_OO_CONFIRMED) { - up_write(&stp->st_rwsem); + mutex_unlock(&stp->st_mutex); goto put_stateid; } oo->oo_flags |= NFS4_OO_CONFIRMED; nfs4_inc_and_copy_stateid(&oc->oc_resp_stateid, &stp->st_stid); - up_write(&stp->st_rwsem); + mutex_unlock(&stp->st_mutex); dprintk("NFSD: %s: success, seqid=%d stateid=" STATEID_FMT "\n", __func__, oc->oc_seqid, STATEID_VAL(&stp->st_stid.sc_stateid)); @@ -5149,7 +5150,7 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp, nfs4_inc_and_copy_stateid(&od->od_stateid, &stp->st_stid); status = nfs_ok; put_stateid: - up_write(&stp->st_rwsem); + mutex_unlock(&stp->st_mutex); nfs4_put_stid(&stp->st_stid); out: nfsd4_bump_seqid(cstate, status); @@ -5202,7 +5203,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (status) goto out; nfs4_inc_and_copy_stateid(&close->cl_stateid, &stp->st_stid); - up_write(&stp->st_rwsem); + mutex_unlock(&stp->st_mutex); nfsd4_close_open_stateid(stp); @@ -5428,7 +5429,7 @@ init_lock_stateid(struct nfs4_ol_stateid *stp, struct nfs4_lockowner *lo, stp->st_access_bmap = 0; stp->st_deny_bmap = open_stp->st_deny_bmap; stp->st_openstp = open_stp; - init_rwsem(&stp->st_rwsem); + mutex_init(&stp->st_mutex); list_add(&stp->st_locks, &open_stp->st_locks); list_add(&stp->st_perstateowner, &lo->lo_owner.so_stateids); spin_lock(&fp->fi_lock); @@ -5597,7 +5598,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, &open_stp, nn); if (status) goto out; - up_write(&open_stp->st_rwsem); + mutex_unlock(&open_stp->st_mutex); open_sop = openowner(open_stp->st_stateowner); status = nfserr_bad_stateid; if (!same_clid(&open_sop->oo_owner.so_client->cl_clientid, @@ -5606,7 +5607,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, status = lookup_or_create_lock_state(cstate, open_stp, lock, &lock_stp, &new); if (status == nfs_ok) - down_write(&lock_stp->st_rwsem); + mutex_lock(&lock_stp->st_mutex); } else { status = nfs4_preprocess_seqid_op(cstate, lock->lk_old_lock_seqid, @@ -5710,7 +5711,7 @@ out: seqid_mutating_err(ntohl(status))) lock_sop->lo_owner.so_seqid++; - up_write(&lock_stp->st_rwsem); + mutex_unlock(&lock_stp->st_mutex); /* * If this is a new, never-before-used stateid, and we are @@ -5880,7 +5881,7 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, fput: fput(filp); put_stateid: - up_write(&stp->st_rwsem); + mutex_unlock(&stp->st_mutex); nfs4_put_stid(&stp->st_stid); out: nfsd4_bump_seqid(cstate, status); diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 9690cb4dd588..e7787777620e 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -158,7 +158,6 @@ static const struct file_operations 
exports_proc_operations = { .read = seq_read, .llseek = seq_lseek, .release = seq_release, - .owner = THIS_MODULE, }; static int exports_nfsd_open(struct inode *inode, struct file *file) @@ -171,7 +170,6 @@ static const struct file_operations exports_nfsd_operations = { .read = seq_read, .llseek = seq_lseek, .release = seq_release, - .owner = THIS_MODULE, }; static int export_features_show(struct seq_file *m, void *v) @@ -217,7 +215,6 @@ static const struct file_operations pool_stats_operations = { .read = seq_read, .llseek = seq_lseek, .release = nfsd_pool_stats_release, - .owner = THIS_MODULE, }; static struct file_operations reply_cache_stats_operations = { diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index c050c53036a6..64053eadeb81 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -535,7 +535,7 @@ struct nfs4_ol_stateid { unsigned char st_access_bmap; unsigned char st_deny_bmap; struct nfs4_ol_stateid *st_openstp; - struct rw_semaphore st_rwsem; + struct mutex st_mutex; }; static inline struct nfs4_ol_stateid *openlockstateid(struct nfs4_stid *s) @@ -573,6 +573,11 @@ enum nfsd4_cb_op { NFSPROC4_CLNT_CB_SEQUENCE, }; +/* Returns true iff a is later than b: */ +static inline bool nfsd4_stateid_generation_after(stateid_t *a, stateid_t *b) +{ + return (s32)(a->si_generation - b->si_generation) > 0; +} struct nfsd4_compound_state; struct nfsd_net; diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c index cd90878a76aa..d97338bb6a39 100644 --- a/fs/nfsd/stats.c +++ b/fs/nfsd/stats.c @@ -84,7 +84,6 @@ static int nfsd_proc_open(struct inode *inode, struct file *file) } static const struct file_operations nfsd_proc_fops = { - .owner = THIS_MODULE, .open = nfsd_proc_open, .read = seq_read, .llseek = seq_lseek, diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c index 2ccbf5531554..1a85d94f5b25 100644 --- a/fs/nilfs2/alloc.c +++ b/fs/nilfs2/alloc.c @@ -13,13 +13,8 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * - * Original code was written by Koji Sato <koji@osrg.net>. - * Two allocators were unified by Ryusuke Konishi <ryusuke@osrg.net>, - * Amagai Yoshiji <amagai@osrg.net>. + * Originally written by Koji Sato. + * Two allocators were unified by Ryusuke Konishi and Amagai Yoshiji. 
*/ #include <linux/types.h> @@ -58,7 +53,7 @@ nilfs_palloc_groups_count(const struct inode *inode) * @inode: inode of metadata file using this allocator * @entry_size: size of the persistent object */ -int nilfs_palloc_init_blockgroup(struct inode *inode, unsigned entry_size) +int nilfs_palloc_init_blockgroup(struct inode *inode, unsigned int entry_size) { struct nilfs_mdt_info *mi = NILFS_MDT(inode); @@ -73,13 +68,17 @@ int nilfs_palloc_init_blockgroup(struct inode *inode, unsigned entry_size) mi->mi_blocks_per_group = DIV_ROUND_UP(nilfs_palloc_entries_per_group(inode), mi->mi_entries_per_block) + 1; - /* Number of blocks in a group including entry blocks and - a bitmap block */ + /* + * Number of blocks in a group including entry blocks + * and a bitmap block + */ mi->mi_blocks_per_desc_block = nilfs_palloc_groups_per_desc_block(inode) * mi->mi_blocks_per_group + 1; - /* Number of blocks per descriptor including the - descriptor block */ + /* + * Number of blocks per descriptor including the + * descriptor block + */ return 0; } @@ -389,7 +388,7 @@ void *nilfs_palloc_block_get_entry(const struct inode *inode, __u64 nr, */ static int nilfs_palloc_find_available_slot(unsigned char *bitmap, unsigned long target, - unsigned bsize, + unsigned int bsize, spinlock_t *lock) { int pos, end = bsize; @@ -624,7 +623,7 @@ void nilfs_palloc_commit_free_entry(struct inode *inode, if (!nilfs_clear_bit_atomic(lock, group_offset, bitmap)) nilfs_warning(inode->i_sb, __func__, - "entry number %llu already freed: ino=%lu\n", + "entry number %llu already freed: ino=%lu", (unsigned long long)req->pr_entry_nr, (unsigned long)inode->i_ino); else @@ -665,7 +664,7 @@ void nilfs_palloc_abort_alloc_entry(struct inode *inode, if (!nilfs_clear_bit_atomic(lock, group_offset, bitmap)) nilfs_warning(inode->i_sb, __func__, - "entry number %llu already freed: ino=%lu\n", + "entry number %llu already freed: ino=%lu", (unsigned long long)req->pr_entry_nr, (unsigned long)inode->i_ino); else @@ -740,8 +739,8 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems) unsigned long group, group_offset; __u64 group_min_nr, last_nrs[8]; const unsigned long epg = nilfs_palloc_entries_per_group(inode); - const unsigned epb = NILFS_MDT(inode)->mi_entries_per_block; - unsigned entry_start, end, pos; + const unsigned int epb = NILFS_MDT(inode)->mi_entries_per_block; + unsigned int entry_start, end, pos; spinlock_t *lock; int i, j, k, ret; u32 nfree; @@ -774,7 +773,7 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems) if (!nilfs_clear_bit_atomic(lock, group_offset, bitmap)) { nilfs_warning(inode->i_sb, __func__, - "entry number %llu already freed: ino=%lu\n", + "entry number %llu already freed: ino=%lu", (unsigned long long)entry_nrs[j], (unsigned long)inode->i_ino); } else { @@ -819,7 +818,7 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems) last_nrs[k]); if (ret && ret != -ENOENT) { nilfs_warning(inode->i_sb, __func__, - "failed to delete block of entry %llu: ino=%lu, err=%d\n", + "failed to delete block of entry %llu: ino=%lu, err=%d", (unsigned long long)last_nrs[k], (unsigned long)inode->i_ino, ret); } @@ -838,7 +837,7 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems) ret = nilfs_palloc_delete_bitmap_block(inode, group); if (ret && ret != -ENOENT) { nilfs_warning(inode->i_sb, __func__, - "failed to delete bitmap block of group %lu: ino=%lu, err=%d\n", + "failed to delete bitmap block of group %lu: ino=%lu, err=%d", group, 
(unsigned long)inode->i_ino, ret); } diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h index 6e6f49aa53df..05149e606a78 100644 --- a/fs/nilfs2/alloc.h +++ b/fs/nilfs2/alloc.h @@ -13,13 +13,8 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * - * Original code was written by Koji Sato <koji@osrg.net>. - * Two allocators were unified by Ryusuke Konishi <ryusuke@osrg.net>, - * Amagai Yoshiji <amagai@osrg.net>. + * Originally written by Koji Sato. + * Two allocators were unified by Ryusuke Konishi and Amagai Yoshiji. */ #ifndef _NILFS_ALLOC_H @@ -42,7 +37,7 @@ nilfs_palloc_entries_per_group(const struct inode *inode) return 1UL << (inode->i_blkbits + 3 /* log2(8 = CHAR_BITS) */); } -int nilfs_palloc_init_blockgroup(struct inode *, unsigned); +int nilfs_palloc_init_blockgroup(struct inode *, unsigned int); int nilfs_palloc_get_entry_block(struct inode *, __u64, int, struct buffer_head **); void *nilfs_palloc_block_get_entry(const struct inode *, __u64, diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c index a9fb3636c142..f2a7877e0c8c 100644 --- a/fs/nilfs2/bmap.c +++ b/fs/nilfs2/bmap.c @@ -13,11 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * - * Written by Koji Sato <koji@osrg.net>. + * Written by Koji Sato. */ #include <linux/fs.h> @@ -46,7 +42,7 @@ static int nilfs_bmap_convert_error(struct nilfs_bmap *bmap, if (err == -EINVAL) { nilfs_error(inode->i_sb, fname, - "broken bmap (inode number=%lu)\n", inode->i_ino); + "broken bmap (inode number=%lu)", inode->i_ino); err = -EIO; } return err; @@ -97,7 +93,7 @@ int nilfs_bmap_lookup_at_level(struct nilfs_bmap *bmap, __u64 key, int level, } int nilfs_bmap_lookup_contig(struct nilfs_bmap *bmap, __u64 key, __u64 *ptrp, - unsigned maxblocks) + unsigned int maxblocks) { int ret; diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h index bfa817ce40b3..b6a4c8f93ac8 100644 --- a/fs/nilfs2/bmap.h +++ b/fs/nilfs2/bmap.h @@ -13,11 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * - * Written by Koji Sato <koji@osrg.net>. + * Written by Koji Sato. */ #ifndef _NILFS_BMAP_H @@ -61,7 +57,7 @@ struct nilfs_bmap_stats { struct nilfs_bmap_operations { int (*bop_lookup)(const struct nilfs_bmap *, __u64, int, __u64 *); int (*bop_lookup_contig)(const struct nilfs_bmap *, __u64, __u64 *, - unsigned); + unsigned int); int (*bop_insert)(struct nilfs_bmap *, __u64, __u64); int (*bop_delete)(struct nilfs_bmap *, __u64); void (*bop_clear)(struct nilfs_bmap *); @@ -126,10 +122,14 @@ struct nilfs_bmap { /* pointer type */ #define NILFS_BMAP_PTR_P 0 /* physical block number (i.e. 
LBN) */ -#define NILFS_BMAP_PTR_VS 1 /* virtual block number (single - version) */ -#define NILFS_BMAP_PTR_VM 2 /* virtual block number (has multiple - versions) */ +#define NILFS_BMAP_PTR_VS 1 /* + * virtual block number (single + * version) + */ +#define NILFS_BMAP_PTR_VM 2 /* + * virtual block number (has multiple + * versions) + */ #define NILFS_BMAP_PTR_U (-1) /* never perform pointer operations */ #define NILFS_BMAP_USE_VBN(bmap) ((bmap)->b_ptr_type > 0) @@ -154,7 +154,7 @@ struct nilfs_bmap_store { int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *); int nilfs_bmap_read(struct nilfs_bmap *, struct nilfs_inode *); void nilfs_bmap_write(struct nilfs_bmap *, struct nilfs_inode *); -int nilfs_bmap_lookup_contig(struct nilfs_bmap *, __u64, __u64 *, unsigned); +int nilfs_bmap_lookup_contig(struct nilfs_bmap *, __u64, __u64 *, unsigned int); int nilfs_bmap_insert(struct nilfs_bmap *bmap, __u64 key, unsigned long rec); int nilfs_bmap_delete(struct nilfs_bmap *bmap, __u64 key); int nilfs_bmap_seek_key(struct nilfs_bmap *bmap, __u64 start, __u64 *keyp); diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c index e0c9daf9aa22..4cca998ec7a0 100644 --- a/fs/nilfs2/btnode.c +++ b/fs/nilfs2/btnode.c @@ -13,13 +13,8 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * - * This file was originally written by Seiji Kihara <kihara@osrg.net> - * and fully revised by Ryusuke Konishi <ryusuke@osrg.net> for - * stabilization and simplification. + * Originally written by Seiji Kihara. + * Fully revised by Ryusuke Konishi for stabilization and simplification. * */ @@ -67,7 +62,7 @@ nilfs_btnode_create_block(struct address_space *btnc, __u64 blocknr) } int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr, - sector_t pblocknr, int mode, + sector_t pblocknr, int mode, int mode_flags, struct buffer_head **pbh, sector_t *submit_ptr) { struct buffer_head *bh; @@ -100,7 +95,7 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr, } } - if (mode == READA) { + if (mode_flags & REQ_RAHEAD) { if (pblocknr != *submit_ptr + 1 || !trylock_buffer(bh)) { err = -EBUSY; /* internal code */ brelse(bh); @@ -119,7 +114,7 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr, bh->b_blocknr = pblocknr; /* set block address for read */ bh->b_end_io = end_buffer_read_sync; get_bh(bh); - submit_bh(mode, bh); + submit_bh(mode, mode_flags, bh); bh->b_blocknr = blocknr; /* set back to the given block address */ *submit_ptr = pblocknr; err = 0; diff --git a/fs/nilfs2/btnode.h b/fs/nilfs2/btnode.h index d876b565ce64..4e8aaa1aeb65 100644 --- a/fs/nilfs2/btnode.h +++ b/fs/nilfs2/btnode.h @@ -13,12 +13,8 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * - * Written by Seiji Kihara <kihara@osrg.net> - * Revised by Ryusuke Konishi <ryusuke@osrg.net> + * Written by Seiji Kihara. + * Revised by Ryusuke Konishi. 
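The btnode changes above split the old overloaded mode argument into a request op (REQ_OP_READ) plus modifier flags, with REQ_RAHEAD replacing the READA special case. A toy sketch of how the split reads at a call site; the constant values here are made up, not the kernel's:

#include <stdio.h>

enum req_op { REQ_OP_READ, REQ_OP_WRITE };      /* illustrative values */
#define REQ_RAHEAD (1u << 0)                    /* illustrative flag bit */

static void submit_block(enum req_op op, unsigned int op_flags)
{
        if (op_flags & REQ_RAHEAD)
                printf("readahead: fine to give up if the buffer is busy\n");
        else
                printf("mandatory %s\n", op == REQ_OP_READ ? "read" : "write");
}

int main(void)
{
        submit_block(REQ_OP_READ, 0);
        submit_block(REQ_OP_READ, REQ_RAHEAD);
        return 0;
}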
*/ #ifndef _NILFS_BTNODE_H @@ -47,7 +43,7 @@ void nilfs_btnode_cache_clear(struct address_space *); struct buffer_head *nilfs_btnode_create_block(struct address_space *btnc, __u64 blocknr); int nilfs_btnode_submit_block(struct address_space *, __u64, sector_t, int, - struct buffer_head **, sector_t *); + int, struct buffer_head **, sector_t *); void nilfs_btnode_delete(struct buffer_head *); int nilfs_btnode_prepare_change_key(struct address_space *, struct nilfs_btnode_chkey_ctxt *); diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c index 3a3821b00486..982d1e3df3a5 100644 --- a/fs/nilfs2/btree.c +++ b/fs/nilfs2/btree.c @@ -13,11 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * - * Written by Koji Sato <koji@osrg.net>. + * Written by Koji Sato. */ #include <linux/slab.h> @@ -480,7 +476,8 @@ static int __nilfs_btree_get_block(const struct nilfs_bmap *btree, __u64 ptr, sector_t submit_ptr = 0; int ret; - ret = nilfs_btnode_submit_block(btnc, ptr, 0, READ, &bh, &submit_ptr); + ret = nilfs_btnode_submit_block(btnc, ptr, 0, REQ_OP_READ, 0, &bh, + &submit_ptr); if (ret) { if (ret != -EEXIST) return ret; @@ -496,7 +493,8 @@ static int __nilfs_btree_get_block(const struct nilfs_bmap *btree, __u64 ptr, n > 0 && i < ra->ncmax; n--, i++) { ptr2 = nilfs_btree_node_get_ptr(ra->node, i, ra->ncmax); - ret = nilfs_btnode_submit_block(btnc, ptr2, 0, READA, + ret = nilfs_btnode_submit_block(btnc, ptr2, 0, + REQ_OP_READ, REQ_RAHEAD, &ra_bh, &submit_ptr); if (likely(!ret || ret == -EEXIST)) brelse(ra_bh); @@ -689,7 +687,8 @@ static int nilfs_btree_lookup(const struct nilfs_bmap *btree, } static int nilfs_btree_lookup_contig(const struct nilfs_bmap *btree, - __u64 key, __u64 *ptrp, unsigned maxblocks) + __u64 key, __u64 *ptrp, + unsigned int maxblocks) { struct nilfs_btree_path *path; struct nilfs_btree_node *node; @@ -1032,12 +1031,12 @@ static __u64 nilfs_btree_find_target_v(const struct nilfs_bmap *btree, if (ptr != NILFS_BMAP_INVALID_PTR) /* sequential access */ return ptr; - else { - ptr = nilfs_btree_find_near(btree, path); - if (ptr != NILFS_BMAP_INVALID_PTR) - /* near */ - return ptr; - } + + ptr = nilfs_btree_find_near(btree, path); + if (ptr != NILFS_BMAP_INVALID_PTR) + /* near */ + return ptr; + /* block group */ return nilfs_bmap_find_target_in_group(btree); } diff --git a/fs/nilfs2/btree.h b/fs/nilfs2/btree.h index 22c02e35b6ef..df1a25faa83b 100644 --- a/fs/nilfs2/btree.h +++ b/fs/nilfs2/btree.h @@ -13,11 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * - * Written by Koji Sato <koji@osrg.net>. + * Written by Koji Sato. */ #ifndef _NILFS_BTREE_H diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c index b6596cab9e99..8a3d3b65af3f 100644 --- a/fs/nilfs2/cpfile.c +++ b/fs/nilfs2/cpfile.c @@ -13,11 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
* - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * - * Written by Koji Sato <koji@osrg.net>. + * Written by Koji Sato. */ #include <linux/kernel.h> @@ -41,6 +37,7 @@ static unsigned long nilfs_cpfile_get_blkoff(const struct inode *cpfile, __u64 cno) { __u64 tcno = cno + NILFS_MDT(cpfile)->mi_first_entry_offset - 1; + do_div(tcno, nilfs_cpfile_checkpoints_per_block(cpfile)); return (unsigned long)tcno; } @@ -50,6 +47,7 @@ static unsigned long nilfs_cpfile_get_offset(const struct inode *cpfile, __u64 cno) { __u64 tcno = cno + NILFS_MDT(cpfile)->mi_first_entry_offset - 1; + return do_div(tcno, nilfs_cpfile_checkpoints_per_block(cpfile)); } @@ -433,7 +431,8 @@ static void nilfs_cpfile_checkpoint_to_cpinfo(struct inode *cpfile, } static ssize_t nilfs_cpfile_do_get_cpinfo(struct inode *cpfile, __u64 *cnop, - void *buf, unsigned cisz, size_t nci) + void *buf, unsigned int cisz, + size_t nci) { struct nilfs_checkpoint *cp; struct nilfs_cpinfo *ci = buf; @@ -484,7 +483,8 @@ static ssize_t nilfs_cpfile_do_get_cpinfo(struct inode *cpfile, __u64 *cnop, } static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop, - void *buf, unsigned cisz, size_t nci) + void *buf, unsigned int cisz, + size_t nci) { struct buffer_head *bh; struct nilfs_cpfile_header *header; @@ -570,7 +570,7 @@ static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop, */ ssize_t nilfs_cpfile_get_cpinfo(struct inode *cpfile, __u64 *cnop, int mode, - void *buf, unsigned cisz, size_t nci) + void *buf, unsigned int cisz, size_t nci) { switch (mode) { case NILFS_CHECKPOINT: @@ -870,8 +870,10 @@ int nilfs_cpfile_is_snapshot(struct inode *cpfile, __u64 cno) void *kaddr; int ret; - /* CP number is invalid if it's zero or larger than the - largest exist one.*/ + /* + * CP number is invalid if it's zero or larger than the + * largest existing one. + */ if (cno == 0 || cno >= nilfs_mdt_cno(cpfile)) return -ENOENT; down_read(&NILFS_MDT(cpfile)->mi_sem); diff --git a/fs/nilfs2/cpfile.h b/fs/nilfs2/cpfile.h index a242b9a314f9..0249744ae234 100644 --- a/fs/nilfs2/cpfile.h +++ b/fs/nilfs2/cpfile.h @@ -13,11 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * - * Written by Koji Sato <koji@osrg.net>. + * Written by Koji Sato. */ #ifndef _NILFS_CPFILE_H @@ -37,8 +33,8 @@ int nilfs_cpfile_delete_checkpoint(struct inode *, __u64); int nilfs_cpfile_change_cpmode(struct inode *, __u64, int); int nilfs_cpfile_is_snapshot(struct inode *, __u64); int nilfs_cpfile_get_stat(struct inode *, struct nilfs_cpstat *); -ssize_t nilfs_cpfile_get_cpinfo(struct inode *, __u64 *, int, void *, unsigned, - size_t); +ssize_t nilfs_cpfile_get_cpinfo(struct inode *, __u64 *, int, void *, + unsigned int, size_t); int nilfs_cpfile_read(struct super_block *sb, size_t cpsize, struct nilfs_inode *raw_inode, struct inode **inodep); diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c index 7dc23f100e57..7367610ea807 100644 --- a/fs/nilfs2/dat.c +++ b/fs/nilfs2/dat.c @@ -13,11 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
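The cpfile hunks that follow mostly add checkpatch-mandated blank lines after declarations, but the do_div() idiom they touch is worth spelling out: do_div(n, base) divides the 64-bit n in place and evaluates to the 32-bit remainder, which is why nilfs_cpfile_get_blkoff() returns the quotient left behind in tcno while nilfs_cpfile_get_offset() returns the macro's result directly. A minimal sketch (helper names are illustrative, not from the patch):

#include <asm/div64.h>

/* quotient: which block the checkpoint entry lives in */
static unsigned long sketch_blkoff(__u64 tcno, unsigned long per_block)
{
	do_div(tcno, per_block);	/* tcno now holds cno / per_block */
	return (unsigned long)tcno;
}

/* remainder: the entry's offset within that block */
static unsigned long sketch_offset(__u64 tcno, unsigned long per_block)
{
	return do_div(tcno, per_block);	/* evaluates to cno % per_block */
}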
* - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * - * Written by Koji Sato <koji@osrg.net>. + * Written by Koji Sato. */ #include <linux/types.h> @@ -428,7 +424,7 @@ int nilfs_dat_translate(struct inode *dat, __u64 vblocknr, sector_t *blocknrp) return ret; } -ssize_t nilfs_dat_get_vinfo(struct inode *dat, void *buf, unsigned visz, +ssize_t nilfs_dat_get_vinfo(struct inode *dat, void *buf, unsigned int visz, size_t nvi) { struct buffer_head *entry_bh; diff --git a/fs/nilfs2/dat.h b/fs/nilfs2/dat.h index cbd8e9732503..abbfdabcabea 100644 --- a/fs/nilfs2/dat.h +++ b/fs/nilfs2/dat.h @@ -13,11 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * - * Written by Koji Sato <koji@osrg.net>. + * Written by Koji Sato. */ #ifndef _NILFS_DAT_H @@ -51,7 +47,7 @@ void nilfs_dat_abort_update(struct inode *, struct nilfs_palloc_req *, int nilfs_dat_mark_dirty(struct inode *, __u64); int nilfs_dat_freev(struct inode *, __u64 *, size_t); int nilfs_dat_move(struct inode *, __u64, sector_t); -ssize_t nilfs_dat_get_vinfo(struct inode *, void *, unsigned, size_t); +ssize_t nilfs_dat_get_vinfo(struct inode *, void *, unsigned int, size_t); int nilfs_dat_read(struct super_block *sb, size_t entry_size, struct nilfs_inode *raw_inode, struct inode **inodep); diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index 6723d45a631a..e506f4f7120a 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -13,11 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * - * Modified for NILFS by Amagai Yoshiji <amagai@osrg.net> + * Modified for NILFS by Amagai Yoshiji. */ /* * linux/fs/ext2/dir.c @@ -50,7 +46,7 @@ * nilfs uses block-sized chunks. Arguably, sector-sized ones would be * more robust, but we have what we have */ -static inline unsigned nilfs_chunk_size(struct inode *inode) +static inline unsigned int nilfs_chunk_size(struct inode *inode) { return inode->i_sb->s_blocksize; } @@ -65,9 +61,9 @@ static inline void nilfs_put_page(struct page *page) * Return the offset into page `page_nr' of the last valid * byte in that page, plus one. 
*/ -static unsigned nilfs_last_byte(struct inode *inode, unsigned long page_nr) +static unsigned int nilfs_last_byte(struct inode *inode, unsigned long page_nr) { - unsigned last_byte = inode->i_size; + unsigned int last_byte = inode->i_size; last_byte -= page_nr << PAGE_SHIFT; if (last_byte > PAGE_SIZE) @@ -75,20 +71,22 @@ static unsigned nilfs_last_byte(struct inode *inode, unsigned long page_nr) return last_byte; } -static int nilfs_prepare_chunk(struct page *page, unsigned from, unsigned to) +static int nilfs_prepare_chunk(struct page *page, unsigned int from, + unsigned int to) { loff_t pos = page_offset(page) + from; + return __block_write_begin(page, pos, to - from, nilfs_get_block); } static void nilfs_commit_chunk(struct page *page, struct address_space *mapping, - unsigned from, unsigned to) + unsigned int from, unsigned int to) { struct inode *dir = mapping->host; loff_t pos = page_offset(page) + from; - unsigned len = to - from; - unsigned nr_dirty, copied; + unsigned int len = to - from; + unsigned int nr_dirty, copied; int err; nr_dirty = nilfs_page_count_clean_buffers(page, from, to); @@ -106,10 +104,10 @@ static bool nilfs_check_page(struct page *page) { struct inode *dir = page->mapping->host; struct super_block *sb = dir->i_sb; - unsigned chunk_size = nilfs_chunk_size(dir); + unsigned int chunk_size = nilfs_chunk_size(dir); char *kaddr = page_address(page); - unsigned offs, rec_len; - unsigned limit = PAGE_SIZE; + unsigned int offs, rec_len; + unsigned int limit = PAGE_SIZE; struct nilfs_dir_entry *p; char *error; @@ -259,7 +257,6 @@ static int nilfs_readdir(struct file *file, struct dir_context *ctx) unsigned int offset = pos & ~PAGE_MASK; unsigned long n = pos >> PAGE_SHIFT; unsigned long npages = dir_pages(inode); -/* unsigned chunk_mask = ~(nilfs_chunk_size(inode)-1); */ if (pos > inode->i_size - NILFS_DIR_REC_LEN(1)) return 0; @@ -321,7 +318,7 @@ nilfs_find_entry(struct inode *dir, const struct qstr *qstr, { const unsigned char *name = qstr->name; int namelen = qstr->len; - unsigned reclen = NILFS_DIR_REC_LEN(namelen); + unsigned int reclen = NILFS_DIR_REC_LEN(namelen); unsigned long start, n; unsigned long npages = dir_pages(dir); struct page *page = NULL; @@ -340,6 +337,7 @@ nilfs_find_entry(struct inode *dir, const struct qstr *qstr, n = start; do { char *kaddr; + page = nilfs_get_page(dir, n); if (!IS_ERR(page)) { kaddr = page_address(page); @@ -410,8 +408,8 @@ ino_t nilfs_inode_by_name(struct inode *dir, const struct qstr *qstr) void nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de, struct page *page, struct inode *inode) { - unsigned from = (char *) de - (char *) page_address(page); - unsigned to = from + nilfs_rec_len_from_disk(de->rec_len); + unsigned int from = (char *)de - (char *)page_address(page); + unsigned int to = from + nilfs_rec_len_from_disk(de->rec_len); struct address_space *mapping = page->mapping; int err; @@ -433,15 +431,15 @@ int nilfs_add_link(struct dentry *dentry, struct inode *inode) struct inode *dir = d_inode(dentry->d_parent); const unsigned char *name = dentry->d_name.name; int namelen = dentry->d_name.len; - unsigned chunk_size = nilfs_chunk_size(dir); - unsigned reclen = NILFS_DIR_REC_LEN(namelen); + unsigned int chunk_size = nilfs_chunk_size(dir); + unsigned int reclen = NILFS_DIR_REC_LEN(namelen); unsigned short rec_len, name_len; struct page *page = NULL; struct nilfs_dir_entry *de; unsigned long npages = dir_pages(dir); unsigned long n; char *kaddr; - unsigned from, to; + unsigned int from, to; int err; /* @@ -533,13 
+531,14 @@ int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct page *page) struct address_space *mapping = page->mapping; struct inode *inode = mapping->host; char *kaddr = page_address(page); - unsigned from = ((char *)dir - kaddr) & ~(nilfs_chunk_size(inode) - 1); - unsigned to = ((char *)dir - kaddr) + - nilfs_rec_len_from_disk(dir->rec_len); - struct nilfs_dir_entry *pde = NULL; - struct nilfs_dir_entry *de = (struct nilfs_dir_entry *)(kaddr + from); + unsigned int from, to; + struct nilfs_dir_entry *de, *pde = NULL; int err; + from = ((char *)dir - kaddr) & ~(nilfs_chunk_size(inode) - 1); + to = ((char *)dir - kaddr) + nilfs_rec_len_from_disk(dir->rec_len); + de = (struct nilfs_dir_entry *)(kaddr + from); + while ((char *)de < (char *)dir) { if (de->rec_len == 0) { nilfs_error(inode->i_sb, __func__, @@ -572,7 +571,7 @@ int nilfs_make_empty(struct inode *inode, struct inode *parent) { struct address_space *mapping = inode->i_mapping; struct page *page = grab_cache_page(mapping, 0); - unsigned chunk_size = nilfs_chunk_size(inode); + unsigned int chunk_size = nilfs_chunk_size(inode); struct nilfs_dir_entry *de; int err; void *kaddr; @@ -630,8 +629,8 @@ int nilfs_empty_dir(struct inode *inode) while ((char *)de <= kaddr) { if (de->rec_len == 0) { nilfs_error(inode->i_sb, __func__, - "zero-length directory entry " - "(kaddr=%p, de=%p)\n", kaddr, de); + "zero-length directory entry (kaddr=%p, de=%p)", + kaddr, de); goto not_empty; } if (de->inode != 0) { diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c index ebf89fd8ac1a..251a44928405 100644 --- a/fs/nilfs2/direct.c +++ b/fs/nilfs2/direct.c @@ -13,11 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * - * Written by Koji Sato <koji@osrg.net>. + * Written by Koji Sato. */ #include <linux/errno.h> @@ -62,7 +58,7 @@ static int nilfs_direct_lookup(const struct nilfs_bmap *direct, static int nilfs_direct_lookup_contig(const struct nilfs_bmap *direct, __u64 key, __u64 *ptrp, - unsigned maxblocks) + unsigned int maxblocks) { struct inode *dat = NULL; __u64 ptr, ptr2; @@ -83,7 +79,8 @@ static int nilfs_direct_lookup_contig(const struct nilfs_bmap *direct, ptr = blocknr; } - maxblocks = min_t(unsigned, maxblocks, NILFS_DIRECT_KEY_MAX - key + 1); + maxblocks = min_t(unsigned int, maxblocks, + NILFS_DIRECT_KEY_MAX - key + 1); for (cnt = 1; cnt < maxblocks && (ptr2 = nilfs_direct_get_ptr(direct, key + cnt)) != NILFS_BMAP_INVALID_PTR; @@ -110,9 +107,9 @@ nilfs_direct_find_target_v(const struct nilfs_bmap *direct, __u64 key) if (ptr != NILFS_BMAP_INVALID_PTR) /* sequential access */ return ptr; - else - /* block group */ - return nilfs_bmap_find_target_in_group(direct); + + /* block group */ + return nilfs_bmap_find_target_in_group(direct); } static int nilfs_direct_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr) diff --git a/fs/nilfs2/direct.h b/fs/nilfs2/direct.h index dc643de20a25..3015a6e78724 100644 --- a/fs/nilfs2/direct.h +++ b/fs/nilfs2/direct.h @@ -13,11 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
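The dir.c conversions above from bare unsigned to unsigned int are mechanical, but the nilfs_last_byte() arithmetic they touch merits a worked case: with i_size = 10000 bytes and 4 KiB pages, pages 0 and 1 are full (the function returns PAGE_SIZE) and page 2 holds 10000 - 2 * 4096 = 1808 valid bytes. The same computation as a standalone sketch with the page geometry hard-coded (assumed values, not from the patch):

/* last valid byte in page page_nr, plus one; assumes PAGE_SHIFT == 12 */
static unsigned int sketch_last_byte(loff_t i_size, unsigned long page_nr)
{
	unsigned int last_byte = i_size - (page_nr << 12);

	return last_byte > 4096 ? 4096 : last_byte;
}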
* - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * - * Written by Koji Sato <koji@osrg.net>. + * Written by Koji Sato. */ #ifndef _NILFS_DIRECT_H diff --git a/fs/nilfs2/export.h b/fs/nilfs2/export.h index 19ccbf9522ab..00107fdb9343 100644 --- a/fs/nilfs2/export.h +++ b/fs/nilfs2/export.h @@ -20,6 +20,6 @@ struct nilfs_fid { u32 parent_gen; u64 parent_ino; -} __attribute__ ((packed)); +} __packed; #endif diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c index 088ba001c6ef..547381f3ce13 100644 --- a/fs/nilfs2/file.c +++ b/fs/nilfs2/file.c @@ -13,12 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * - * Written by Amagai Yoshiji <amagai@osrg.net>, - * Ryusuke Konishi <ryusuke@osrg.net> + * Written by Amagai Yoshiji and Ryusuke Konishi. */ #include <linux/fs.h> diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c index 0224b7826ace..e9148f94d696 100644 --- a/fs/nilfs2/gcinode.c +++ b/fs/nilfs2/gcinode.c @@ -13,13 +13,8 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * - * Written by Seiji Kihara <kihara@osrg.net>, Amagai Yoshiji <amagai@osrg.net>, - * and Ryusuke Konishi <ryusuke@osrg.net>. - * Revised by Ryusuke Konishi <ryusuke@osrg.net>. + * Written by Seiji Kihara, Amagai Yoshiji, and Ryusuke Konishi. + * Revised by Ryusuke Konishi. * */ /* @@ -106,7 +101,7 @@ int nilfs_gccache_submit_read_data(struct inode *inode, sector_t blkoff, bh->b_blocknr = pbn; bh->b_end_io = end_buffer_read_sync; get_bh(bh); - submit_bh(READ, bh); + submit_bh(REQ_OP_READ, 0, bh); if (vbn) bh->b_blocknr = vbn; out: @@ -143,7 +138,8 @@ int nilfs_gccache_submit_read_node(struct inode *inode, sector_t pbn, int ret; ret = nilfs_btnode_submit_block(&NILFS_I(inode)->i_btnode_cache, - vbn ? : pbn, pbn, READ, out_bh, &pbn); + vbn ? : pbn, pbn, REQ_OP_READ, 0, + out_bh, &pbn); if (ret == -EEXIST) /* internal code (cache hit) */ ret = 0; return ret; diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c index 6548c7851b48..1d2b1805327a 100644 --- a/fs/nilfs2/ifile.c +++ b/fs/nilfs2/ifile.c @@ -13,12 +13,8 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * - * Written by Amagai Yoshiji <amagai@osrg.net>. - * Revised by Ryusuke Konishi <ryusuke@osrg.net>. + * Written by Amagai Yoshiji. + * Revised by Ryusuke Konishi. * */ @@ -68,8 +64,10 @@ int nilfs_ifile_create_inode(struct inode *ifile, ino_t *out_ino, struct nilfs_palloc_req req; int ret; - req.pr_entry_nr = 0; /* 0 says find free inode from beginning of - a group. dull code!! */ + req.pr_entry_nr = 0; /* + * 0 says find free inode from beginning + * of a group. dull code!! 
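The export.h hunk above swaps the long-hand __attribute__((packed)) for the kernel's __packed shorthand; the semantics are identical. Packing matters here because an NFS file handle has an externally visible byte layout, so the compiler must not insert alignment padding between members. A hypothetical example of the idiom:

#include <linux/compiler.h>
#include <linux/types.h>

/* on-disk/wire layout: no compiler-inserted padding allowed */
struct sketch_fid {
	u64 ino;
	u32 gen;
	u64 parent_ino;
} __packed;		/* sizeof == 20, not the 24 an aligned layout gives */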
+ */ req.pr_entry_bh = NULL; ret = nilfs_palloc_prepare_alloc_entry(ifile, &req); diff --git a/fs/nilfs2/ifile.h b/fs/nilfs2/ifile.h index 679674d13372..23ad2f091e76 100644 --- a/fs/nilfs2/ifile.h +++ b/fs/nilfs2/ifile.h @@ -13,12 +13,8 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * - * Written by Amagai Yoshiji <amagai@osrg.net> - * Revised by Ryusuke Konishi <ryusuke@osrg.net> + * Written by Amagai Yoshiji. + * Revised by Ryusuke Konishi. * */ @@ -36,6 +32,7 @@ static inline struct nilfs_inode * nilfs_ifile_map_inode(struct inode *ifile, ino_t ino, struct buffer_head *ibh) { void *kaddr = kmap(ibh->b_page); + return nilfs_palloc_block_get_entry(ifile, ino, ibh, kaddr); } diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index cfebcd2fc7f3..a0ebdb17e912 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -13,11 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * - * Written by Ryusuke Konishi <ryusuke@osrg.net> + * Written by Ryusuke Konishi. * */ @@ -87,7 +83,7 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff, struct the_nilfs *nilfs = inode->i_sb->s_fs_info; __u64 blknum = 0; int err = 0, ret; - unsigned maxblocks = bh_result->b_size >> inode->i_blkbits; + unsigned int maxblocks = bh_result->b_size >> inode->i_blkbits; down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); ret = nilfs_bmap_lookup_contig(ii->i_bmap, blkoff, &blknum, maxblocks); @@ -133,11 +129,14 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff, /* Error handling should be detailed */ set_buffer_new(bh_result); set_buffer_delay(bh_result); - map_bh(bh_result, inode->i_sb, 0); /* dbn must be changed - to proper value */ + map_bh(bh_result, inode->i_sb, 0); + /* Disk block number must be changed to proper value */ + } else if (ret == -ENOENT) { - /* not found is not error (e.g. hole); must return without - the mapped state flag. */ + /* + * not found is not error (e.g. hole); must return without + * the mapped state flag. 
+ */ ; } else { err = ret; @@ -167,7 +166,7 @@ static int nilfs_readpage(struct file *file, struct page *page) * @nr_pages - number of pages to be read */ static int nilfs_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) + struct list_head *pages, unsigned int nr_pages) { return mpage_readpages(mapping, pages, nr_pages, nilfs_get_block); } @@ -226,7 +225,7 @@ static int nilfs_set_page_dirty(struct page *page) int ret = __set_page_dirty_nobuffers(page); if (page_has_buffers(page)) { - unsigned nr_dirty = 0; + unsigned int nr_dirty = 0; struct buffer_head *bh, *head; /* @@ -249,7 +248,7 @@ static int nilfs_set_page_dirty(struct page *page) if (nr_dirty) nilfs_set_file_dirty(inode, nr_dirty); } else if (ret) { - unsigned nr_dirty = 1 << (PAGE_SHIFT - inode->i_blkbits); + unsigned int nr_dirty = 1 << (PAGE_SHIFT - inode->i_blkbits); nilfs_set_file_dirty(inode, nr_dirty); } @@ -291,8 +290,8 @@ static int nilfs_write_end(struct file *file, struct address_space *mapping, struct page *page, void *fsdata) { struct inode *inode = mapping->host; - unsigned start = pos & (PAGE_SIZE - 1); - unsigned nr_dirty; + unsigned int start = pos & (PAGE_SIZE - 1); + unsigned int nr_dirty; int err; nr_dirty = nilfs_page_count_clean_buffers(page, start, @@ -399,23 +398,26 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode) err = nilfs_init_acl(inode, dir); if (unlikely(err)) - goto failed_after_creation; /* never occur. When supporting - nilfs_init_acl(), proper cancellation of - above jobs should be considered */ + /* + * Never occur. When supporting nilfs_init_acl(), + * proper cancellation of above jobs should be considered. + */ + goto failed_after_creation; return inode; failed_after_creation: clear_nlink(inode); unlock_new_inode(inode); - iput(inode); /* raw_inode will be deleted through - nilfs_evict_inode() */ + iput(inode); /* + * raw_inode will be deleted through + * nilfs_evict_inode(). + */ goto failed; failed_ifile_create_inode: make_bad_inode(inode); - iput(inode); /* if i_nlink == 1, generic_forget_inode() will be - called */ + iput(inode); failed: return ERR_PTR(err); } @@ -666,8 +668,10 @@ void nilfs_write_inode_common(struct inode *inode, else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) raw_inode->i_device_code = cpu_to_le64(huge_encode_dev(inode->i_rdev)); - /* When extending inode, nilfs->ns_inode_size should be checked - for substitutions of appended fields */ + /* + * When extending inode, nilfs->ns_inode_size should be checked + * for substitutions of appended fields. + */ } void nilfs_update_inode(struct inode *inode, struct buffer_head *ibh, int flags) @@ -685,9 +689,12 @@ void nilfs_update_inode(struct inode *inode, struct buffer_head *ibh, int flags) set_bit(NILFS_I_INODE_SYNC, &ii->i_state); nilfs_write_inode_common(inode, raw_inode, 0); - /* XXX: call with has_bmap = 0 is a workaround to avoid - deadlock of bmap. This delays update of i_bmap to just - before writing */ + /* + * XXX: call with has_bmap = 0 is a workaround to avoid + * deadlock of bmap. This delays update of i_bmap to just + * before writing. + */ + nilfs_ifile_unmap_inode(ifile, ino, ibh); } @@ -752,14 +759,15 @@ void nilfs_truncate(struct inode *inode) nilfs_mark_inode_dirty(inode); nilfs_set_file_dirty(inode, 0); nilfs_transaction_commit(sb); - /* May construct a logical segment and may fail in sync mode. - But truncate has no return value. */ + /* + * May construct a logical segment and may fail in sync mode. 
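In the nilfs_set_page_dirty() hunk above, a dirty page without attached buffers is accounted as a full page of blocks: 1 << (PAGE_SHIFT - inode->i_blkbits) is simply blocks-per-page. For example, 4 KiB pages over 1 KiB filesystem blocks give 1 << (12 - 10) = 4 dirty blocks. As a standalone sketch:

/* blocks per page: e.g. 1U << (12 - 10) == 4 for 4K pages, 1K blocks */
static unsigned int sketch_blocks_per_page(unsigned int page_shift,
					   unsigned int blkbits)
{
	return 1U << (page_shift - blkbits);
}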
+ * But truncate has no return value. + */ } static void nilfs_clear_inode(struct inode *inode) { struct nilfs_inode_info *ii = NILFS_I(inode); - struct nilfs_mdt_info *mdi = NILFS_MDT(inode); /* * Free resources allocated in nilfs_read_inode(), here. @@ -768,8 +776,8 @@ static void nilfs_clear_inode(struct inode *inode) brelse(ii->i_bh); ii->i_bh = NULL; - if (mdi && mdi->mi_palloc_cache) - nilfs_palloc_destroy_cache(inode); + if (nilfs_is_metadata_file_inode(inode)) + nilfs_mdt_clear(inode); if (test_bit(NILFS_I_BMAP, &ii->i_state)) nilfs_bmap_clear(ii->i_bmap); @@ -811,8 +819,10 @@ void nilfs_evict_inode(struct inode *inode) if (IS_SYNC(inode)) nilfs_set_transaction_flag(NILFS_TI_SYNC); nilfs_transaction_commit(sb); - /* May construct a logical segment and may fail in sync mode. - But delete_inode has no return value. */ + /* + * May construct a logical segment and may fail in sync mode. + * But delete_inode has no return value. + */ } int nilfs_setattr(struct dentry *dentry, struct iattr *iattr) @@ -856,6 +866,7 @@ out_err: int nilfs_permission(struct inode *inode, int mask) { struct nilfs_root *root = NILFS_I(inode)->i_root; + if ((mask & MAY_WRITE) && root && root->cno != NILFS_CPTREE_CURRENT_CNO) return -EROFS; /* snapshot is not writable */ @@ -906,7 +917,7 @@ int nilfs_inode_dirty(struct inode *inode) return ret; } -int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty) +int nilfs_set_file_dirty(struct inode *inode, unsigned int nr_dirty) { struct nilfs_inode_info *ii = NILFS_I(inode); struct the_nilfs *nilfs = inode->i_sb->s_fs_info; @@ -919,17 +930,23 @@ int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty) spin_lock(&nilfs->ns_inode_lock); if (!test_bit(NILFS_I_QUEUED, &ii->i_state) && !test_bit(NILFS_I_BUSY, &ii->i_state)) { - /* Because this routine may race with nilfs_dispose_list(), - we have to check NILFS_I_QUEUED here, too. */ + /* + * Because this routine may race with nilfs_dispose_list(), + * we have to check NILFS_I_QUEUED here, too. + */ if (list_empty(&ii->i_dirty) && igrab(inode) == NULL) { - /* This will happen when somebody is freeing - this inode. */ + /* + * This will happen when somebody is freeing + * this inode. + */ nilfs_warning(inode->i_sb, __func__, - "cannot get inode (ino=%lu)\n", + "cannot get inode (ino=%lu)", inode->i_ino); spin_unlock(&nilfs->ns_inode_lock); - return -EINVAL; /* NILFS_I_DIRTY may remain for - freeing inode */ + return -EINVAL; /* + * NILFS_I_DIRTY may remain for + * freeing inode. + */ } list_move_tail(&ii->i_dirty, &nilfs->ns_dirty_files); set_bit(NILFS_I_QUEUED, &ii->i_state); @@ -946,7 +963,7 @@ int __nilfs_mark_inode_dirty(struct inode *inode, int flags) err = nilfs_load_inode_block(inode, &ibh); if (unlikely(err)) { nilfs_warning(inode->i_sb, __func__, - "failed to reget inode block.\n"); + "failed to reget inode block."); return err; } nilfs_update_inode(inode, ibh, flags); @@ -973,7 +990,7 @@ void nilfs_dirty_inode(struct inode *inode, int flags) if (is_bad_inode(inode)) { nilfs_warning(inode->i_sb, __func__, - "tried to mark bad_inode dirty. ignored.\n"); + "tried to mark bad_inode dirty. ignored."); dump_stack(); return; } diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index e8fe24882b5b..358b57e2cdf9 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -13,11 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
* - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * - * Written by Koji Sato <koji@osrg.net>. + * Written by Koji Sato. */ #include <linux/fs.h> @@ -783,6 +779,7 @@ static int nilfs_ioctl_mark_blocks_dirty(struct the_nilfs *nilfs, size_t nmembs = argv->v_nmembs; struct nilfs_bmap *bmap = NILFS_I(nilfs->ns_dat)->i_bmap; struct nilfs_bdesc *bdescs = buf; + struct buffer_head *bh; int ret, i; for (i = 0; i < nmembs; i++) { @@ -800,12 +797,16 @@ static int nilfs_ioctl_mark_blocks_dirty(struct the_nilfs *nilfs, /* skip dead block */ continue; if (bdescs[i].bd_level == 0) { - ret = nilfs_mdt_mark_block_dirty(nilfs->ns_dat, - bdescs[i].bd_offset); - if (ret < 0) { + ret = nilfs_mdt_get_block(nilfs->ns_dat, + bdescs[i].bd_offset, + false, NULL, &bh); + if (unlikely(ret)) { WARN_ON(ret == -ENOENT); return ret; } + mark_buffer_dirty(bh); + nilfs_mdt_mark_dirty(nilfs->ns_dat); + put_bh(bh); } else { ret = nilfs_bmap_mark(bmap, bdescs[i].bd_offset, bdescs[i].bd_level); diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index f6982b9153d5..0d7b71fbeff8 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -13,11 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * - * Written by Ryusuke Konishi <ryusuke@osrg.net> + * Written by Ryusuke Konishi. */ #include <linux/buffer_head.h> @@ -32,6 +28,7 @@ #include "segment.h" #include "page.h" #include "mdt.h" +#include "alloc.h" /* nilfs_palloc_destroy_cache() */ #include <trace/events/nilfs2.h> @@ -124,7 +121,7 @@ static int nilfs_mdt_create_block(struct inode *inode, unsigned long block, static int nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff, - int mode, struct buffer_head **out_bh) + int mode, int mode_flags, struct buffer_head **out_bh) { struct buffer_head *bh; __u64 blknum = 0; @@ -138,7 +135,7 @@ nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff, if (buffer_uptodate(bh)) goto out; - if (mode == READA) { + if (mode_flags & REQ_RAHEAD) { if (!trylock_buffer(bh)) { ret = -EBUSY; goto failed_bh; @@ -160,7 +157,7 @@ nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff, bh->b_end_io = end_buffer_read_sync; get_bh(bh); - submit_bh(mode, bh); + submit_bh(mode, mode_flags, bh); ret = 0; trace_nilfs2_mdt_submit_block(inode, inode->i_ino, blkoff, mode); @@ -184,7 +181,7 @@ static int nilfs_mdt_read_block(struct inode *inode, unsigned long block, int i, nr_ra_blocks = NILFS_MDT_MAX_RA_BLOCKS; int err; - err = nilfs_mdt_submit_block(inode, block, READ, &first_bh); + err = nilfs_mdt_submit_block(inode, block, REQ_OP_READ, 0, &first_bh); if (err == -EEXIST) /* internal code */ goto out; @@ -194,7 +191,8 @@ static int nilfs_mdt_read_block(struct inode *inode, unsigned long block, if (readahead) { blkoff = block + 1; for (i = 0; i < nr_ra_blocks; i++, blkoff++) { - err = nilfs_mdt_submit_block(inode, blkoff, READA, &bh); + err = nilfs_mdt_submit_block(inode, blkoff, REQ_OP_READ, + REQ_RAHEAD, &bh); if (likely(!err || err == -EEXIST)) brelse(bh); else if (err != -EBUSY) @@ -393,34 +391,6 @@ int nilfs_mdt_forget_block(struct inode *inode, unsigned long block) return ret; } -/** - * 
nilfs_mdt_mark_block_dirty - mark a block on the meta data file dirty. - * @inode: inode of the meta data file - * @block: block offset - * - * Return Value: On success, it returns 0. On error, the following negative - * error code is returned. - * - * %-ENOMEM - Insufficient memory available. - * - * %-EIO - I/O error - * - * %-ENOENT - the specified block does not exist (hole block) - */ -int nilfs_mdt_mark_block_dirty(struct inode *inode, unsigned long block) -{ - struct buffer_head *bh; - int err; - - err = nilfs_mdt_read_block(inode, block, 0, &bh); - if (unlikely(err)) - return err; - mark_buffer_dirty(bh); - nilfs_mdt_mark_dirty(inode); - brelse(bh); - return 0; -} - int nilfs_mdt_fetch_dirty(struct inode *inode) { struct nilfs_inode_info *ii = NILFS_I(inode); @@ -497,8 +467,32 @@ int nilfs_mdt_init(struct inode *inode, gfp_t gfp_mask, size_t objsz) return 0; } -void nilfs_mdt_set_entry_size(struct inode *inode, unsigned entry_size, - unsigned header_size) +/** + * nilfs_mdt_clear - do cleanup for the metadata file + * @inode: inode of the metadata file + */ +void nilfs_mdt_clear(struct inode *inode) +{ + struct nilfs_mdt_info *mdi = NILFS_MDT(inode); + + if (mdi->mi_palloc_cache) + nilfs_palloc_destroy_cache(inode); +} + +/** + * nilfs_mdt_destroy - release resources used by the metadata file + * @inode: inode of the metadata file + */ +void nilfs_mdt_destroy(struct inode *inode) +{ + struct nilfs_mdt_info *mdi = NILFS_MDT(inode); + + kfree(mdi->mi_bgl); /* kfree(NULL) is safe */ + kfree(mdi); +} + +void nilfs_mdt_set_entry_size(struct inode *inode, unsigned int entry_size, + unsigned int header_size) { struct nilfs_mdt_info *mi = NILFS_MDT(inode); diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h index 03246cac3338..3f67f3932097 100644 --- a/fs/nilfs2/mdt.h +++ b/fs/nilfs2/mdt.h @@ -13,11 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * - * Written by Ryusuke Konishi <ryusuke@osrg.net> + * Written by Ryusuke Konishi. 
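With nilfs_mdt_mark_block_dirty() deleted above, its one remaining caller (the ioctl.c hunk earlier) open-codes the sequence: fetch the block, dirty the buffer, dirty the metadata inode, release. A condensed sketch of the replacement pattern, error handling trimmed:

static int sketch_mark_dat_block_dirty(struct the_nilfs *nilfs,
				       unsigned long blkoff)
{
	struct buffer_head *bh;
	int ret;

	ret = nilfs_mdt_get_block(nilfs->ns_dat, blkoff, false, NULL, &bh);
	if (unlikely(ret))
		return ret;		/* e.g. -ENOENT for a hole block */

	mark_buffer_dirty(bh);
	nilfs_mdt_mark_dirty(nilfs->ns_dat);
	put_bh(bh);
	return 0;
}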
*/ #ifndef _NILFS_MDT_H @@ -57,8 +53,8 @@ struct nilfs_shadow_map { struct nilfs_mdt_info { struct rw_semaphore mi_sem; struct blockgroup_lock *mi_bgl; - unsigned mi_entry_size; - unsigned mi_first_entry_offset; + unsigned int mi_entry_size; + unsigned int mi_first_entry_offset; unsigned long mi_entries_per_block; struct nilfs_palloc_cache *mi_palloc_cache; struct nilfs_shadow_map *mi_shadow; @@ -71,6 +67,11 @@ static inline struct nilfs_mdt_info *NILFS_MDT(const struct inode *inode) return inode->i_private; } +static inline int nilfs_is_metadata_file_inode(const struct inode *inode) +{ + return inode->i_private != NULL; +} + /* Default GFP flags using highmem */ #define NILFS_MDT_GFP (__GFP_RECLAIM | __GFP_IO | __GFP_HIGHMEM) @@ -83,11 +84,13 @@ int nilfs_mdt_find_block(struct inode *inode, unsigned long start, struct buffer_head **out_bh); int nilfs_mdt_delete_block(struct inode *, unsigned long); int nilfs_mdt_forget_block(struct inode *, unsigned long); -int nilfs_mdt_mark_block_dirty(struct inode *, unsigned long); int nilfs_mdt_fetch_dirty(struct inode *); int nilfs_mdt_init(struct inode *inode, gfp_t gfp_mask, size_t objsz); -void nilfs_mdt_set_entry_size(struct inode *, unsigned, unsigned); +void nilfs_mdt_clear(struct inode *inode); +void nilfs_mdt_destroy(struct inode *inode); + +void nilfs_mdt_set_entry_size(struct inode *, unsigned int, unsigned int); int nilfs_mdt_setup_shadow_map(struct inode *inode, struct nilfs_shadow_map *shadow); diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index 3b2af05f9fb4..1ec8ae5995a5 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -13,12 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * - * Modified for NILFS by Amagai Yoshiji <amagai@osrg.net>, - * Ryusuke Konishi <ryusuke@osrg.net> + * Modified for NILFS by Amagai Yoshiji and Ryusuke Konishi. */ /* * linux/fs/ext2/namei.c @@ -49,6 +44,7 @@ static inline int nilfs_add_nondir(struct dentry *dentry, struct inode *inode) { int err = nilfs_add_link(dentry, inode); + if (!err) { d_instantiate(dentry, inode); unlock_new_inode(inode); @@ -143,7 +139,7 @@ static int nilfs_symlink(struct inode *dir, struct dentry *dentry, { struct nilfs_transaction_info ti; struct super_block *sb = dir->i_sb; - unsigned l = strlen(symname)+1; + unsigned int l = strlen(symname) + 1; struct inode *inode; int err; @@ -288,7 +284,7 @@ static int nilfs_do_unlink(struct inode *dir, struct dentry *dentry) if (!inode->i_nlink) { nilfs_warning(inode->i_sb, __func__, - "deleting nonexistent file (%lu), %d\n", + "deleting nonexistent file (%lu), %d", inode->i_ino, inode->i_nlink); set_nlink(inode, 1); } diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index 385704027575..b1d48bc0532d 100644 --- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h @@ -13,12 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * - * Written by Koji Sato <koji@osrg.net> - * Ryusuke Konishi <ryusuke@osrg.net> + * Written by Koji Sato and Ryusuke Konishi. 
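The mdt.h hunk above adds nilfs_is_metadata_file_inode() and splits metadata-file teardown into two stages wired up elsewhere in this series: nilfs_mdt_clear() drops the palloc cache when the inode is evicted, and nilfs_mdt_destroy() frees the mdt_info from the RCU inode-free callback (see the super.c hunk further down). A sketch of the two call sites as this series arranges them:

/* at eviction time (cf. nilfs_clear_inode) */
static void sketch_clear(struct inode *inode)
{
	if (nilfs_is_metadata_file_inode(inode))
		nilfs_mdt_clear(inode);		/* releases the palloc cache */
}

/* from the RCU free callback (cf. nilfs_i_callback) */
static void sketch_free(struct inode *inode)
{
	if (nilfs_is_metadata_file_inode(inode))
		nilfs_mdt_destroy(inode);	/* frees mi_bgl and mdt_info */
}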
*/ #ifndef _NILFS_H @@ -69,8 +64,10 @@ struct nilfs_inode_info { */ struct rw_semaphore xattr_sem; #endif - struct buffer_head *i_bh; /* i_bh contains a new or dirty - disk inode */ + struct buffer_head *i_bh; /* + * i_bh contains a new or dirty + * disk inode. + */ struct nilfs_root *i_root; struct inode vfs_inode; }; @@ -100,8 +97,10 @@ enum { NILFS_I_NEW = 0, /* Inode is newly created */ NILFS_I_DIRTY, /* The file is dirty */ NILFS_I_QUEUED, /* inode is in dirty_files list */ - NILFS_I_BUSY, /* inode is grabbed by a segment - constructor */ + NILFS_I_BUSY, /* + * Inode is grabbed by a segment + * constructor + */ NILFS_I_COLLECTED, /* All dirty blocks are collected */ NILFS_I_UPDATED, /* The file has been written back */ NILFS_I_INODE_SYNC, /* dsync is not allowed for inode */ @@ -145,8 +144,10 @@ enum { struct nilfs_transaction_info { u32 ti_magic; void *ti_save; - /* This should never used. If this happens, - one of other filesystems has a bug. */ + /* + * This should never be used. If it happens, + * one of other filesystems has a bug. + */ unsigned short ti_flags; unsigned short ti_count; }; @@ -156,8 +157,10 @@ struct nilfs_transaction_info { /* ti_flags */ #define NILFS_TI_DYNAMIC_ALLOC 0x0001 /* Allocated from slab */ -#define NILFS_TI_SYNC 0x0002 /* Force to construct segment at the - end of transaction. */ +#define NILFS_TI_SYNC 0x0002 /* + * Force to construct segment at the + * end of transaction. + */ #define NILFS_TI_GC 0x0004 /* GC context */ #define NILFS_TI_COMMIT 0x0008 /* Change happened or not */ #define NILFS_TI_WRITER 0x0010 /* Constructor context */ @@ -279,7 +282,7 @@ extern void nilfs_write_failed(struct address_space *mapping, loff_t to); int nilfs_permission(struct inode *inode, int mask); int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh); extern int nilfs_inode_dirty(struct inode *); -int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty); +int nilfs_set_file_dirty(struct inode *inode, unsigned int nr_dirty); extern int __nilfs_mark_inode_dirty(struct inode *, int); extern void nilfs_dirty_inode(struct inode *, int flags); int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 489391561cda..d97ba5f11b77 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -13,12 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * - * Written by Ryusuke Konishi <ryusuke@osrg.net>, - * Seiji Kihara <kihara@osrg.net>. + * Written by Ryusuke Konishi and Seiji Kihara. 
*/ #include <linux/pagemap.h> @@ -440,12 +435,12 @@ void nilfs_clear_dirty_page(struct page *page, bool silent) __nilfs_clear_page_dirty(page); } -unsigned nilfs_page_count_clean_buffers(struct page *page, - unsigned from, unsigned to) +unsigned int nilfs_page_count_clean_buffers(struct page *page, + unsigned int from, unsigned int to) { - unsigned block_start, block_end; + unsigned int block_start, block_end; struct buffer_head *bh, *head; - unsigned nc = 0; + unsigned int nc = 0; for (bh = head = page_buffers(page), block_start = 0; bh != head || !block_start; diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h index a43b8287d012..f3687c958fa8 100644 --- a/fs/nilfs2/page.h +++ b/fs/nilfs2/page.h @@ -13,12 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * - * Written by Ryusuke Konishi <ryusuke@osrg.net>, - * Seiji Kihara <kihara@osrg.net>. + * Written by Ryusuke Konishi and Seiji Kihara. */ #ifndef _NILFS_PAGE_H @@ -58,7 +53,8 @@ void nilfs_copy_back_pages(struct address_space *, struct address_space *); void nilfs_clear_dirty_page(struct page *, bool); void nilfs_clear_dirty_pages(struct address_space *, bool); void nilfs_mapping_init(struct address_space *mapping, struct inode *inode); -unsigned nilfs_page_count_clean_buffers(struct page *, unsigned, unsigned); +unsigned int nilfs_page_count_clean_buffers(struct page *, unsigned int, + unsigned int); unsigned long nilfs_find_uncommitted_extent(struct inode *inode, sector_t start_blk, sector_t *blkoff); diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c index 5afa77fadc11..d893dc912b62 100644 --- a/fs/nilfs2/recovery.c +++ b/fs/nilfs2/recovery.c @@ -13,11 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * - * Written by Ryusuke Konishi <ryusuke@osrg.net> + * Written by Ryusuke Konishi. 
*/ #include <linux/buffer_head.h> @@ -47,8 +43,10 @@ enum { /* work structure for recovery */ struct nilfs_recovery_block { - ino_t ino; /* Inode number of the file that this block - belongs to */ + ino_t ino; /* + * Inode number of the file that this block + * belongs to + */ sector_t blocknr; /* block number */ __u64 vblocknr; /* virtual block number */ unsigned long blkoff; /* File offset of the data block (per block) */ @@ -156,7 +154,7 @@ int nilfs_read_super_root_block(struct the_nilfs *nilfs, sector_t sr_block, sr = (struct nilfs_super_root *)bh_sr->b_data; if (check) { - unsigned bytes = le16_to_cpu(sr->sr_bytes); + unsigned int bytes = le16_to_cpu(sr->sr_bytes); if (bytes == 0 || bytes > nilfs->ns_blocksize) { ret = NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT; @@ -508,7 +506,7 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs, { struct inode *inode; struct nilfs_recovery_block *rb, *n; - unsigned blocksize = nilfs->ns_blocksize; + unsigned int blocksize = nilfs->ns_blocksize; struct page *page; loff_t pos; int err = 0, err2 = 0; @@ -526,6 +524,7 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs, 0, &page, nilfs_get_block); if (unlikely(err)) { loff_t isize = inode->i_size; + if (pos + blocksize > isize) nilfs_write_failed(inode->i_mapping, pos + blocksize); @@ -872,9 +871,11 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, flags = le16_to_cpu(sum->ss_flags); if (!(flags & NILFS_SS_SR) && !scan_newer) { - /* This will never happen because a superblock - (last_segment) always points to a pseg - having a super root. */ + /* + * This will never happen because a superblock + * (last_segment) always points to a pseg with + * a super root. + */ ret = NILFS_SEG_FAIL_CONSISTENCY; goto failed; } diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index f63620ce3892..a962d7d83447 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -13,11 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * - * Written by Ryusuke Konishi <ryusuke@osrg.net> + * Written by Ryusuke Konishi. 
* */ @@ -133,7 +129,7 @@ int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *segbuf, return 0; } -int nilfs_segbuf_reset(struct nilfs_segment_buffer *segbuf, unsigned flags, +int nilfs_segbuf_reset(struct nilfs_segment_buffer *segbuf, unsigned int flags, time_t ctime, __u64 cno) { int err; @@ -240,7 +236,7 @@ nilfs_segbuf_fill_in_super_root_crc(struct nilfs_segment_buffer *segbuf, { struct nilfs_super_root *raw_sr; struct the_nilfs *nilfs = segbuf->sb_super->s_fs_info; - unsigned srsize; + unsigned int srsize; u32 crc; raw_sr = (struct nilfs_super_root *)segbuf->sb_super_root->b_data; @@ -350,7 +346,8 @@ static void nilfs_end_bio_write(struct bio *bio) } static int nilfs_segbuf_submit_bio(struct nilfs_segment_buffer *segbuf, - struct nilfs_write_info *wi, int mode) + struct nilfs_write_info *wi, int mode, + int mode_flags) { struct bio *bio = wi->bio; int err; @@ -368,7 +365,8 @@ static int nilfs_segbuf_submit_bio(struct nilfs_segment_buffer *segbuf, bio->bi_end_io = nilfs_end_bio_write; bio->bi_private = segbuf; - submit_bio(mode, bio); + bio_set_op_attrs(bio, mode, mode_flags); + submit_bio(bio); segbuf->sb_nbio++; wi->bio = NULL; @@ -441,7 +439,7 @@ static int nilfs_segbuf_submit_bh(struct nilfs_segment_buffer *segbuf, return 0; } /* bio is FULL */ - err = nilfs_segbuf_submit_bio(segbuf, wi, mode); + err = nilfs_segbuf_submit_bio(segbuf, wi, mode, 0); /* never submit current bh */ if (likely(!err)) goto repeat; @@ -465,19 +463,19 @@ static int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf, { struct nilfs_write_info wi; struct buffer_head *bh; - int res = 0, rw = WRITE; + int res = 0; wi.nilfs = nilfs; nilfs_segbuf_prepare_write(segbuf, &wi); list_for_each_entry(bh, &segbuf->sb_segsum_buffers, b_assoc_buffers) { - res = nilfs_segbuf_submit_bh(segbuf, &wi, bh, rw); + res = nilfs_segbuf_submit_bh(segbuf, &wi, bh, REQ_OP_WRITE); if (unlikely(res)) goto failed_bio; } list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) { - res = nilfs_segbuf_submit_bh(segbuf, &wi, bh, rw); + res = nilfs_segbuf_submit_bh(segbuf, &wi, bh, REQ_OP_WRITE); if (unlikely(res)) goto failed_bio; } @@ -487,8 +485,8 @@ static int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf, * Last BIO is always sent through the following * submission. */ - rw |= REQ_SYNC; - res = nilfs_segbuf_submit_bio(segbuf, &wi, rw); + res = nilfs_segbuf_submit_bio(segbuf, &wi, REQ_OP_WRITE, + REQ_SYNC); } failed_bio: diff --git a/fs/nilfs2/segbuf.h b/fs/nilfs2/segbuf.h index b04f08cc2397..7bbccc099709 100644 --- a/fs/nilfs2/segbuf.h +++ b/fs/nilfs2/segbuf.h @@ -13,11 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * - * Written by Ryusuke Konishi <ryusuke@osrg.net> + * Written by Ryusuke Konishi. 
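The segbuf.c hunks that follow track the block layer's bio interface change: instead of passing an rw mode to submit_bio(), the operation and flags are stamped onto the bio with bio_set_op_attrs(), and submit_bio() takes only the bio. The final log write carries REQ_SYNC; earlier writes pass no flags. Condensed from nilfs_segbuf_submit_bio(), setup code elided:

static void sketch_submit(struct bio *bio, bool last_bio)
{
	bio_set_op_attrs(bio, REQ_OP_WRITE, last_bio ? REQ_SYNC : 0);
	submit_bio(bio);
}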
* */ #ifndef _NILFS_SEGBUF_H @@ -82,7 +78,7 @@ struct nilfs_segment_buffer { __u64 sb_nextnum; sector_t sb_fseg_start, sb_fseg_end; sector_t sb_pseg_start; - unsigned sb_rest_blocks; + unsigned int sb_rest_blocks; /* Buffers */ struct list_head sb_segsum_buffers; @@ -124,7 +120,8 @@ void nilfs_segbuf_map_cont(struct nilfs_segment_buffer *segbuf, struct nilfs_segment_buffer *prev); void nilfs_segbuf_set_next_segnum(struct nilfs_segment_buffer *, __u64, struct the_nilfs *); -int nilfs_segbuf_reset(struct nilfs_segment_buffer *, unsigned, time_t, __u64); +int nilfs_segbuf_reset(struct nilfs_segment_buffer *, unsigned int, time_t, + __u64); int nilfs_segbuf_extend_segsum(struct nilfs_segment_buffer *); int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *, struct buffer_head **); diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 4317f72568e6..e78b68a81aec 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -13,11 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * - * Written by Ryusuke Konishi <ryusuke@osrg.net> + * Written by Ryusuke Konishi. * */ @@ -49,18 +45,26 @@ */ #define SC_N_INODEVEC 16 /* Size of locally allocated inode vector */ -#define SC_MAX_SEGDELTA 64 /* Upper limit of the number of segments - appended in collection retry loop */ +#define SC_MAX_SEGDELTA 64 /* + * Upper limit of the number of segments + * appended in collection retry loop + */ /* Construction mode */ enum { SC_LSEG_SR = 1, /* Make a logical segment having a super root */ - SC_LSEG_DSYNC, /* Flush data blocks of a given file and make - a logical segment without a super root */ - SC_FLUSH_FILE, /* Flush data files, leads to segment writes without - creating a checkpoint */ - SC_FLUSH_DAT, /* Flush DAT file. This also creates segments without - a checkpoint */ + SC_LSEG_DSYNC, /* + * Flush data blocks of a given file and make + * a logical segment without a super root. + */ + SC_FLUSH_FILE, /* + * Flush data files, leads to segment writes without + * creating a checkpoint. + */ + SC_FLUSH_DAT, /* + * Flush DAT file. This also creates segments + * without a checkpoint. + */ }; /* Stage numbers of dirty block collection */ @@ -154,17 +158,15 @@ static int nilfs_prepare_segment_lock(struct nilfs_transaction_info *ti) if (cur_ti) { if (cur_ti->ti_magic == NILFS_TI_MAGIC) return ++cur_ti->ti_count; - else { - /* - * If journal_info field is occupied by other FS, - * it is saved and will be restored on - * nilfs_transaction_commit(). - */ - printk(KERN_WARNING - "NILFS warning: journal info from a different " - "FS\n"); - save = current->journal_info; - } + + /* + * If journal_info field is occupied by other FS, + * it is saved and will be restored on + * nilfs_transaction_commit(). 
+ */ + printk(KERN_WARNING + "NILFS warning: journal info from a different FS\n"); + save = current->journal_info; } if (!ti) { ti = kmem_cache_alloc(nilfs_transaction_cachep, GFP_NOFS); @@ -397,10 +399,10 @@ static void nilfs_transaction_unlock(struct super_block *sb) static void *nilfs_segctor_map_segsum_entry(struct nilfs_sc_info *sci, struct nilfs_segsum_pointer *ssp, - unsigned bytes) + unsigned int bytes) { struct nilfs_segment_buffer *segbuf = sci->sc_curseg; - unsigned blocksize = sci->sc_super->s_blocksize; + unsigned int blocksize = sci->sc_super->s_blocksize; void *p; if (unlikely(ssp->offset + bytes > blocksize)) { @@ -422,8 +424,8 @@ static int nilfs_segctor_reset_segment_buffer(struct nilfs_sc_info *sci) { struct nilfs_segment_buffer *segbuf = sci->sc_curseg; struct buffer_head *sumbh; - unsigned sumbytes; - unsigned flags = 0; + unsigned int sumbytes; + unsigned int flags = 0; int err; if (nilfs_doing_gc()) @@ -444,8 +446,10 @@ static int nilfs_segctor_feed_segment(struct nilfs_sc_info *sci) { sci->sc_nblk_this_inc += sci->sc_curseg->sb_sum.nblocks; if (NILFS_SEGBUF_IS_LAST(sci->sc_curseg, &sci->sc_segbufs)) - return -E2BIG; /* The current segment is filled up - (internal code) */ + return -E2BIG; /* + * The current segment is filled up + * (internal code) + */ sci->sc_curseg = NILFS_NEXT_SEGBUF(sci->sc_curseg); return nilfs_segctor_reset_segment_buffer(sci); } @@ -472,9 +476,9 @@ static int nilfs_segctor_add_super_root(struct nilfs_sc_info *sci) */ static int nilfs_segctor_segsum_block_required( struct nilfs_sc_info *sci, const struct nilfs_segsum_pointer *ssp, - unsigned binfo_size) + unsigned int binfo_size) { - unsigned blocksize = sci->sc_super->s_blocksize; + unsigned int blocksize = sci->sc_super->s_blocksize; /* Size of finfo and binfo is enough small against blocksize */ return ssp->offset + binfo_size + @@ -533,7 +537,7 @@ static void nilfs_segctor_end_finfo(struct nilfs_sc_info *sci, static int nilfs_segctor_add_file_block(struct nilfs_sc_info *sci, struct buffer_head *bh, struct inode *inode, - unsigned binfo_size) + unsigned int binfo_size) { struct nilfs_segment_buffer *segbuf; int required, err = 0; @@ -617,7 +621,7 @@ static void nilfs_write_file_node_binfo(struct nilfs_sc_info *sci, *vblocknr = binfo->bi_v.bi_vblocknr; } -static struct nilfs_sc_operations nilfs_sc_file_ops = { +static const struct nilfs_sc_operations nilfs_sc_file_ops = { .collect_data = nilfs_collect_file_data, .collect_node = nilfs_collect_file_node, .collect_bmap = nilfs_collect_file_bmap, @@ -666,7 +670,7 @@ static void nilfs_write_dat_node_binfo(struct nilfs_sc_info *sci, *binfo_dat = binfo->bi_dat; } -static struct nilfs_sc_operations nilfs_sc_dat_ops = { +static const struct nilfs_sc_operations nilfs_sc_dat_ops = { .collect_data = nilfs_collect_dat_data, .collect_node = nilfs_collect_file_node, .collect_bmap = nilfs_collect_dat_bmap, @@ -674,7 +678,7 @@ static struct nilfs_sc_operations nilfs_sc_dat_ops = { .write_node_binfo = nilfs_write_dat_node_binfo, }; -static struct nilfs_sc_operations nilfs_sc_dsync_ops = { +static const struct nilfs_sc_operations nilfs_sc_dsync_ops = { .collect_data = nilfs_collect_file_data, .collect_node = NULL, .collect_bmap = NULL, @@ -777,7 +781,7 @@ static void nilfs_dispose_list(struct the_nilfs *nilfs, { struct nilfs_inode_info *ii, *n; struct nilfs_inode_info *ivec[SC_N_INODEVEC], **pii; - unsigned nv = 0; + unsigned int nv = 0; while (!list_empty(head)) { spin_lock(&nilfs->ns_inode_lock); @@ -875,9 +879,11 @@ static int 
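The nilfs_sc_operations tables below become const, and nilfs_segctor_scan_file() plus the sc_op pointer follow suit: a function-pointer vector that is never written belongs in .rodata, which also keeps it out of reach of stray writes. The idiom in miniature (struct and names illustrative):

struct sketch_ops {
	int (*collect_data)(struct inode *inode);
};

static int sketch_collect(struct inode *inode) { return 0; }

/* placed in .rodata; users take a pointer-to-const */
static const struct sketch_ops sketch_file_ops = {
	.collect_data = sketch_collect,
};

static int sketch_scan(struct inode *inode, const struct sketch_ops *ops)
{
	return ops->collect_data(inode);
}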
nilfs_segctor_create_checkpoint(struct nilfs_sc_info *sci) err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, 1, &raw_cp, &bh_cp); if (likely(!err)) { - /* The following code is duplicated with cpfile. But, it is - needed to collect the checkpoint even if it was not newly - created */ + /* + * The following code is duplicated with cpfile. But, it is + * needed to collect the checkpoint even if it was not newly + * created. + */ mark_buffer_dirty(bh_cp); nilfs_mdt_mark_dirty(nilfs->ns_cpfile); nilfs_cpfile_put_checkpoint( @@ -958,7 +964,7 @@ static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci, { struct buffer_head *bh_sr; struct nilfs_super_root *raw_sr; - unsigned isz, srsz; + unsigned int isz, srsz; bh_sr = NILFS_LAST_SEGBUF(&sci->sc_segbufs)->sb_super_root; raw_sr = (struct nilfs_super_root *)bh_sr->b_data; @@ -1043,7 +1049,7 @@ static size_t nilfs_segctor_buffer_rest(struct nilfs_sc_info *sci) static int nilfs_segctor_scan_file(struct nilfs_sc_info *sci, struct inode *inode, - struct nilfs_sc_operations *sc_ops) + const struct nilfs_sc_operations *sc_ops) { LIST_HEAD(data_buffers); LIST_HEAD(node_buffers); @@ -1406,8 +1412,10 @@ static void nilfs_free_incomplete_logs(struct list_head *logs, if (atomic_read(&segbuf->sb_err)) { /* Case 1: The first segment failed */ if (segbuf->sb_pseg_start != segbuf->sb_fseg_start) - /* Case 1a: Partial segment appended into an existing - segment */ + /* + * Case 1a: Partial segment appended into an existing + * segment + */ nilfs_terminate_segment(nilfs, segbuf->sb_fseg_start, segbuf->sb_fseg_end); else /* Case 1b: New full segment */ @@ -1550,7 +1558,7 @@ nilfs_segctor_update_payload_blocknr(struct nilfs_sc_info *sci, sector_t blocknr; unsigned long nfinfo = segbuf->sb_sum.nfinfo; unsigned long nblocks = 0, ndatablk = 0; - struct nilfs_sc_operations *sc_op = NULL; + const struct nilfs_sc_operations *sc_op = NULL; struct nilfs_segsum_pointer ssp; struct nilfs_finfo *finfo = NULL; union nilfs_binfo binfo; @@ -1631,8 +1639,10 @@ static int nilfs_segctor_assign(struct nilfs_sc_info *sci, int mode) static void nilfs_begin_page_io(struct page *page) { if (!page || PageWriteback(page)) - /* For split b-tree node pages, this function may be called - twice. We ignore the 2nd or later calls by this check. */ + /* + * For split b-tree node pages, this function may be called + * twice. We ignore the 2nd or later calls by this check. + */ return; lock_page(page); @@ -1942,7 +1952,7 @@ static int nilfs_segctor_collect_dirty_files(struct nilfs_sc_info *sci, ifile, ii->vfs_inode.i_ino, &ibh); if (unlikely(err)) { nilfs_warning(sci->sc_super, __func__, - "failed to get inode block.\n"); + "failed to get inode block."); return err; } mark_buffer_dirty(ibh); @@ -2395,6 +2405,7 @@ static int nilfs_segctor_construct(struct nilfs_sc_info *sci, int mode) static void nilfs_construction_timeout(unsigned long data) { struct task_struct *p = (struct task_struct *)data; + wake_up_process(p); } @@ -2555,10 +2566,10 @@ static int nilfs_segctor_thread(void *arg) if (timeout || sci->sc_seq_request != sci->sc_seq_done) mode = SC_LSEG_SR; - else if (!sci->sc_flush_request) - break; - else + else if (sci->sc_flush_request) mode = nilfs_segctor_flush_mode(sci); + else + break; spin_unlock(&sci->sc_state_lock); nilfs_segctor_thread_construct(sci, mode); @@ -2684,8 +2695,10 @@ static void nilfs_segctor_write_out(struct nilfs_sc_info *sci) { int ret, retrycount = NILFS_SC_CLEANUP_RETRY; - /* The segctord thread was stopped and its timer was removed. 
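The nilfs_segctor_thread() hunk above inverts the flush test so the mode decisions read top-down and the loop exit becomes the final branch; behavior is unchanged. Schematically, the reordered selection is:

/* mode selection in the segctord loop after the reordering (fragment) */
if (timeout || sci->sc_seq_request != sci->sc_seq_done)
	mode = SC_LSEG_SR;			/* full construction */
else if (sci->sc_flush_request)
	mode = nilfs_segctor_flush_mode(sci);	/* partial flush */
else
	break;					/* nothing to do: exit loop */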
- But some tasks remain. */ + /* + * The segctord thread was stopped and its timer was removed. + * But some tasks remain. + */ do { struct nilfs_transaction_info ti; @@ -2727,13 +2740,13 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci) if (!list_empty(&sci->sc_dirty_files)) { nilfs_warning(sci->sc_super, __func__, - "dirty file(s) after the final construction\n"); + "dirty file(s) after the final construction"); nilfs_dispose_list(nilfs, &sci->sc_dirty_files, 1); } if (!list_empty(&sci->sc_iput_queue)) { nilfs_warning(sci->sc_super, __func__, - "iput queue is not empty\n"); + "iput queue is not empty"); nilfs_dispose_list(nilfs, &sci->sc_iput_queue, 1); } @@ -2810,7 +2823,7 @@ void nilfs_detach_log_writer(struct super_block *sb) if (!list_empty(&nilfs->ns_dirty_files)) { list_splice_init(&nilfs->ns_dirty_files, &garbage_list); nilfs_warning(sb, __func__, - "Hit dirty file after stopped log writer\n"); + "Hit dirty file after stopped log writer"); } spin_unlock(&nilfs->ns_inode_lock); up_write(&nilfs->ns_segctor_sem); diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h index 0408b9b2814b..6565c10b7b76 100644 --- a/fs/nilfs2/segment.h +++ b/fs/nilfs2/segment.h @@ -13,11 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * - * Written by Ryusuke Konishi <ryusuke@osrg.net> + * Written by Ryusuke Konishi. * */ #ifndef _NILFS_SEGMENT_H @@ -75,7 +71,7 @@ struct nilfs_recovery_info { */ struct nilfs_cstage { int scnt; - unsigned flags; + unsigned int flags; struct nilfs_inode_info *dirty_file_ptr; struct nilfs_inode_info *gc_inode_ptr; }; @@ -84,7 +80,7 @@ struct nilfs_segment_buffer; struct nilfs_segsum_pointer { struct buffer_head *bh; - unsigned offset; /* offset in bytes */ + unsigned int offset; /* offset in bytes */ }; /** @@ -193,11 +189,15 @@ enum { NILFS_SC_DIRTY, /* One or more dirty meta-data blocks exist */ NILFS_SC_UNCLOSED, /* Logical segment is not closed */ NILFS_SC_SUPER_ROOT, /* The latest segment has a super root */ - NILFS_SC_PRIOR_FLUSH, /* Requesting immediate flush without making a - checkpoint */ - NILFS_SC_HAVE_DELTA, /* Next checkpoint will have update of files - other than DAT, cpfile, sufile, or files - moved by GC */ + NILFS_SC_PRIOR_FLUSH, /* + * Requesting immediate flush without making a + * checkpoint + */ + NILFS_SC_HAVE_DELTA, /* + * Next checkpoint will have update of files + * other than DAT, cpfile, sufile, or files + * moved by GC. + */ }; /* sc_state */ @@ -207,17 +207,23 @@ enum { /* * Constant parameters */ -#define NILFS_SC_CLEANUP_RETRY 3 /* Retry count of construction when - destroying segctord */ +#define NILFS_SC_CLEANUP_RETRY 3 /* + * Retry count of construction when + * destroying segctord + */ /* * Default values of timeout, in seconds. */ -#define NILFS_SC_DEFAULT_TIMEOUT 5 /* Timeout value of dirty blocks. - It triggers construction of a - logical segment with a super root */ -#define NILFS_SC_DEFAULT_SR_FREQ 30 /* Maximum frequency of super root - creation */ +#define NILFS_SC_DEFAULT_TIMEOUT 5 /* + * Timeout value of dirty blocks. + * It triggers construction of a + * logical segment with a super root. 
+ */ +#define NILFS_SC_DEFAULT_SR_FREQ 30 /* + * Maximum frequency of super root + * creation + */ /* * The default threshold amount of data, in block counts. diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c index 52821ffc11f4..1963595a1580 100644 --- a/fs/nilfs2/sufile.c +++ b/fs/nilfs2/sufile.c @@ -13,12 +13,8 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * - * Written by Koji Sato <koji@osrg.net>. - * Revised by Ryusuke Konishi <ryusuke@osrg.net>. + * Written by Koji Sato. + * Revised by Ryusuke Konishi. */ #include <linux/kernel.h> @@ -61,6 +57,7 @@ static unsigned long nilfs_sufile_get_blkoff(const struct inode *sufile, __u64 segnum) { __u64 t = segnum + NILFS_MDT(sufile)->mi_first_entry_offset; + do_div(t, nilfs_sufile_segment_usages_per_block(sufile)); return (unsigned long)t; } @@ -69,6 +66,7 @@ static unsigned long nilfs_sufile_get_offset(const struct inode *sufile, __u64 segnum) { __u64 t = segnum + NILFS_MDT(sufile)->mi_first_entry_offset; + return do_div(t, nilfs_sufile_segment_usages_per_block(sufile)); } @@ -819,7 +817,7 @@ out: * %-ENOMEM - Insufficient amount of memory available. */ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf, - unsigned sisz, size_t nsi) + unsigned int sisz, size_t nsi) { struct buffer_head *su_bh; struct nilfs_segment_usage *su; @@ -897,7 +895,7 @@ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf, * %-EINVAL - Invalid values in input (segment number, flags or nblocks) */ ssize_t nilfs_sufile_set_suinfo(struct inode *sufile, void *buf, - unsigned supsz, size_t nsup) + unsigned int supsz, size_t nsup) { struct the_nilfs *nilfs = sufile->i_sb->s_fs_info; struct buffer_head *header_bh, *bh; diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h index b8afd72f2379..46e89872294c 100644 --- a/fs/nilfs2/sufile.h +++ b/fs/nilfs2/sufile.h @@ -13,11 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * - * Written by Koji Sato <koji@osrg.net>. + * Written by Koji Sato. */ #ifndef _NILFS_SUFILE_H @@ -42,9 +38,9 @@ int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum); int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum, unsigned long nblocks, time_t modtime); int nilfs_sufile_get_stat(struct inode *, struct nilfs_sustat *); -ssize_t nilfs_sufile_get_suinfo(struct inode *, __u64, void *, unsigned, +ssize_t nilfs_sufile_get_suinfo(struct inode *, __u64, void *, unsigned int, size_t); -ssize_t nilfs_sufile_set_suinfo(struct inode *, void *, unsigned , size_t); +ssize_t nilfs_sufile_set_suinfo(struct inode *, void *, unsigned int, size_t); int nilfs_sufile_updatev(struct inode *, __u64 *, size_t, int, size_t *, void (*dofunc)(struct inode *, __u64, diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 7f5d3d9f1c37..666107a18a22 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -13,11 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
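The sufile.c helpers above lean on do_div(), whose semantics are easy to misread: it divides a 64-bit lvalue in place and evaluates to the 32-bit remainder. A self-contained sketch, with per_block standing in for nilfs_sufile_segment_usages_per_block():

#include <linux/types.h>
#include <asm/div64.h>

static unsigned long segnum_to_blkoff(__u64 segnum, unsigned long per_block)
{
        __u64 t = segnum;

        do_div(t, per_block);           /* t is now the quotient */
        return (unsigned long)t;
}

static unsigned long segnum_to_offset(__u64 segnum, unsigned long per_block)
{
        __u64 t = segnum;

        return do_div(t, per_block);    /* remainder; t is clobbered */
}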
* - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * - * Written by Ryusuke Konishi <ryusuke@osrg.net> + * Written by Ryusuke Konishi. */ /* * linux/fs/ext2/super.c @@ -173,12 +169,10 @@ struct inode *nilfs_alloc_inode(struct super_block *sb) static void nilfs_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - struct nilfs_mdt_info *mdi = NILFS_MDT(inode); - if (mdi) { - kfree(mdi->mi_bgl); /* kfree(NULL) is safe */ - kfree(mdi); - } + if (nilfs_is_metadata_file_inode(inode)) + nilfs_mdt_destroy(inode); + kmem_cache_free(nilfs_inode_cachep, NILFS_I(inode)); } @@ -279,7 +273,7 @@ struct nilfs_super_block **nilfs_prepare_super(struct super_block *sb, } } else if (sbp[1] && sbp[1]->s_magic != cpu_to_le16(NILFS_SUPER_MAGIC)) { - memcpy(sbp[1], sbp[0], nilfs->ns_sbsize); + memcpy(sbp[1], sbp[0], nilfs->ns_sbsize); } if (flip && sbp[1]) @@ -749,6 +743,7 @@ static int parse_options(char *options, struct super_block *sb, int is_remount) while ((p = strsep(&options, ",")) != NULL) { int token; + if (!*p) continue; @@ -891,7 +886,7 @@ int nilfs_store_magic_and_option(struct super_block *sb, nilfs->ns_interval = le32_to_cpu(sbp->s_c_interval); nilfs->ns_watermark = le32_to_cpu(sbp->s_c_block_max); - return !parse_options(data, sb, 0) ? -EINVAL : 0 ; + return !parse_options(data, sb, 0) ? -EINVAL : 0; } int nilfs_check_feature_compatibility(struct super_block *sb, @@ -1316,7 +1311,7 @@ nilfs_mount(struct file_system_type *fs_type, int flags, } if (!s->s_root) { - s_new = true; + s_new = true; /* New superblock instance created */ s->s_mode = mode; diff --git a/fs/nilfs2/sysfs.c b/fs/nilfs2/sysfs.c index bbb0dcc35905..8ffa42b704d8 100644 --- a/fs/nilfs2/sysfs.c +++ b/fs/nilfs2/sysfs.c @@ -68,7 +68,7 @@ static ssize_t nilfs_##name##_attr_store(struct kobject *kobj, \ static const struct sysfs_ops nilfs_##name##_attr_ops = { \ .show = nilfs_##name##_attr_show, \ .store = nilfs_##name##_attr_store, \ -}; +} #define NILFS_DEV_INT_GROUP_TYPE(name, parent_name) \ static void nilfs_##name##_attr_release(struct kobject *kobj) \ @@ -84,7 +84,7 @@ static struct kobj_type nilfs_##name##_ktype = { \ .default_attrs = nilfs_##name##_attrs, \ .sysfs_ops = &nilfs_##name##_attr_ops, \ .release = nilfs_##name##_attr_release, \ -}; +} #define NILFS_DEV_INT_GROUP_FNS(name, parent_name) \ static int nilfs_sysfs_create_##name##_group(struct the_nilfs *nilfs) \ @@ -756,7 +756,7 @@ nilfs_superblock_sb_write_count_show(struct nilfs_superblock_attr *attr, struct the_nilfs *nilfs, char *buf) { - unsigned sbwcount; + unsigned int sbwcount; down_read(&nilfs->ns_sem); sbwcount = nilfs->ns_sbwcount; @@ -770,7 +770,7 @@ nilfs_superblock_sb_update_frequency_show(struct nilfs_superblock_attr *attr, struct the_nilfs *nilfs, char *buf) { - unsigned sb_update_freq; + unsigned int sb_update_freq; down_read(&nilfs->ns_sem); sb_update_freq = nilfs->ns_sb_update_freq; @@ -784,7 +784,7 @@ nilfs_superblock_sb_update_frequency_store(struct nilfs_superblock_attr *attr, struct the_nilfs *nilfs, const char *buf, size_t count) { - unsigned val; + unsigned int val; int err; err = kstrtouint(skip_spaces(buf), 0, &val); diff --git a/fs/nilfs2/sysfs.h b/fs/nilfs2/sysfs.h index 677e3a1a8370..648cedf9c06e 100644 --- a/fs/nilfs2/sysfs.h +++ b/fs/nilfs2/sysfs.h @@ -66,7 +66,7 @@ struct nilfs_##name##_attr { \ char *); \ ssize_t (*store)(struct 
kobject *, struct attribute *, \ const char *, size_t); \ -}; +} NILFS_COMMON_ATTR_STRUCT(feature); @@ -77,7 +77,7 @@ struct nilfs_##name##_attr { \ char *); \ ssize_t (*store)(struct nilfs_##name##_attr *, struct the_nilfs *, \ const char *, size_t); \ -}; +} NILFS_DEV_ATTR_STRUCT(dev); NILFS_DEV_ATTR_STRUCT(segments); @@ -93,7 +93,7 @@ struct nilfs_##name##_attr { \ char *); \ ssize_t (*store)(struct nilfs_##name##_attr *, struct nilfs_root *, \ const char *, size_t); \ -}; +} NILFS_CP_ATTR_STRUCT(snapshot); diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index 69bd801afb53..e9fd241b9a0a 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -13,11 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * - * Written by Ryusuke Konishi <ryusuke@osrg.net> + * Written by Ryusuke Konishi. * */ @@ -112,8 +108,8 @@ static int nilfs_load_super_root(struct the_nilfs *nilfs, struct nilfs_super_root *raw_sr; struct nilfs_super_block **sbp = nilfs->ns_sbp; struct nilfs_inode *rawi; - unsigned dat_entry_size, segment_usage_size, checkpoint_size; - unsigned inode_size; + unsigned int dat_entry_size, segment_usage_size, checkpoint_size; + unsigned int inode_size; int err; err = nilfs_read_super_root_block(nilfs, sr_block, &bh_sr, 1); @@ -443,7 +439,7 @@ static int nilfs_valid_sb(struct nilfs_super_block *sbp) if (!sbp || le16_to_cpu(sbp->s_magic) != NILFS_SUPER_MAGIC) return 0; bytes = le16_to_cpu(sbp->s_bytes); - if (bytes > BLOCK_SIZE) + if (bytes < sumoff + 4 || bytes > BLOCK_SIZE) return 0; crc = crc32_le(le32_to_cpu(sbp->s_crc_seed), (unsigned char *)sbp, sumoff); @@ -621,8 +617,10 @@ int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data) err = nilfs_load_super_block(nilfs, sb, blocksize, &sbp); if (err) goto out; - /* not failed_sbh; sbh is released automatically - when reloading fails. */ + /* + * Not to failed_sbh; sbh is released automatically + * when reloading fails. + */ } nilfs->ns_blocksize_bits = sb->s_blocksize_bits; nilfs->ns_blocksize = blocksize; diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h index 23778d385836..79369fd6b13b 100644 --- a/fs/nilfs2/the_nilfs.h +++ b/fs/nilfs2/the_nilfs.h @@ -13,11 +13,7 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * - * Written by Ryusuke Konishi <ryusuke@osrg.net> + * Written by Ryusuke Konishi. * */ @@ -118,10 +114,10 @@ struct the_nilfs { struct buffer_head *ns_sbh[2]; struct nilfs_super_block *ns_sbp[2]; time_t ns_sbwtime; - unsigned ns_sbwcount; - unsigned ns_sbsize; - unsigned ns_mount_state; - unsigned ns_sb_update_freq; + unsigned int ns_sbwcount; + unsigned int ns_sbsize; + unsigned int ns_mount_state; + unsigned int ns_sb_update_freq; /* * Following fields are dedicated to a writable FS-instance. 
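The one behavioral change in the_nilfs.c above is the nilfs_valid_sb() bound: s_bytes is now rejected when it is too small to even contain the 4-byte checksum, not only when it exceeds BLOCK_SIZE. Restated as a self-contained check, assuming the on-disk checksum field is s_sum as in the nilfs2 layout:

static int example_valid_sb(struct nilfs_super_block *sbp)
{
        static const unsigned int sumoff =
                offsetof(struct nilfs_super_block, s_sum);
        unsigned int bytes;
        u32 crc;

        if (!sbp || le16_to_cpu(sbp->s_magic) != NILFS_SUPER_MAGIC)
                return 0;
        bytes = le16_to_cpu(sbp->s_bytes);
        /* must at least cover the checksum, must not overrun the block */
        if (bytes < sumoff + 4 || bytes > BLOCK_SIZE)
                return 0;
        crc = crc32_le(le32_to_cpu(sbp->s_crc_seed), (unsigned char *)sbp,
                       sumoff);
        return crc == le32_to_cpu(sbp->s_sum);
}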
@@ -226,15 +222,14 @@ THE_NILFS_FNS(SB_DIRTY, sb_dirty) * Mount option operations */ #define nilfs_clear_opt(nilfs, opt) \ - do { (nilfs)->ns_mount_opt &= ~NILFS_MOUNT_##opt; } while (0) + ((nilfs)->ns_mount_opt &= ~NILFS_MOUNT_##opt) #define nilfs_set_opt(nilfs, opt) \ - do { (nilfs)->ns_mount_opt |= NILFS_MOUNT_##opt; } while (0) + ((nilfs)->ns_mount_opt |= NILFS_MOUNT_##opt) #define nilfs_test_opt(nilfs, opt) ((nilfs)->ns_mount_opt & NILFS_MOUNT_##opt) #define nilfs_write_opt(nilfs, mask, opt) \ - do { (nilfs)->ns_mount_opt = \ + ((nilfs)->ns_mount_opt = \ (((nilfs)->ns_mount_opt & ~NILFS_MOUNT_##mask) | \ - NILFS_MOUNT_##opt); \ - } while (0) + NILFS_MOUNT_##opt)) \ /** * struct nilfs_root - nilfs root object @@ -273,6 +268,7 @@ struct nilfs_root { static inline int nilfs_sb_need_update(struct the_nilfs *nilfs) { u64 t = get_seconds(); + return t < nilfs->ns_sbwtime || t > nilfs->ns_sbwtime + nilfs->ns_sb_update_freq; } @@ -280,6 +276,7 @@ static inline int nilfs_sb_need_update(struct the_nilfs *nilfs) static inline int nilfs_sb_will_flip(struct the_nilfs *nilfs) { int flip_bits = nilfs->ns_sbwcount & 0x0FL; + return (flip_bits != 0x08 && flip_bits != 0x0F); } @@ -308,7 +305,7 @@ static inline void nilfs_get_root(struct nilfs_root *root) static inline int nilfs_valid_fs(struct the_nilfs *nilfs) { - unsigned valid_fs; + unsigned int valid_fs; down_read(&nilfs->ns_sem); valid_fs = (nilfs->ns_mount_state & NILFS_VALID_FS); diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h index b44c68a857e7..0a3bc2cf192c 100644 --- a/fs/notify/fsnotify.h +++ b/fs/notify/fsnotify.h @@ -56,6 +56,13 @@ static inline void fsnotify_clear_marks_by_mount(struct vfsmount *mnt) fsnotify_destroy_marks(&real_mount(mnt)->mnt_fsnotify_marks, &mnt->mnt_root->d_lock); } +/* prepare for freeing all marks associated with given group */ +extern void fsnotify_detach_group_marks(struct fsnotify_group *group); +/* + * wait for fsnotify_mark_srcu period to end and free all marks in destroy_list + */ +extern void fsnotify_mark_destroy_list(void); + /* * update the dentry->d_flags of all of inode's children to indicate if inode cares * about events that happen to its children. diff --git a/fs/notify/group.c b/fs/notify/group.c index d16b62cb2854..3e2dd85be5dd 100644 --- a/fs/notify/group.c +++ b/fs/notify/group.c @@ -47,12 +47,21 @@ static void fsnotify_final_destroy_group(struct fsnotify_group *group) */ void fsnotify_destroy_group(struct fsnotify_group *group) { - /* clear all inode marks for this group */ - fsnotify_clear_marks_by_group(group); + /* clear all inode marks for this group, attach them to destroy_list */ + fsnotify_detach_group_marks(group); - synchronize_srcu(&fsnotify_mark_srcu); + /* + * Wait for fsnotify_mark_srcu period to end and free all marks in + * destroy_list + */ + fsnotify_mark_destroy_list(); - /* clear the notification queue of all events */ + /* + * Since we have waited for fsnotify_mark_srcu in + * fsnotify_mark_destroy_list() there can be no outstanding event + * notification against this group. So clearing the notification queue + * of all events is reliable now. 
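The mount-option hunk above drops do { } while (0) from macros whose bodies are single expressions; the wrapper is only needed when a macro expands to more than one statement. A side-by-side sketch with hypothetical names:

/* a single expression needs no wrapper and stays usable as an expression */
#define example_set_opt(s, opt)         ((s)->opt_bits |= EXAMPLE_OPT_##opt)

/*
 * multiple statements are what do { } while (0) is for: it keeps
 * "if (cond) example_reset(s); else ..." parsing as intended
 */
#define example_reset(s)                \
        do {                            \
                (s)->opt_bits = 0;      \
                (s)->dirty = 0;         \
        } while (0)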
+ */ fsnotify_flush_notify(group); /* diff --git a/fs/notify/mark.c b/fs/notify/mark.c index 7115c5d7d373..d3fea0bd89e2 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -97,8 +97,8 @@ struct srcu_struct fsnotify_mark_srcu; static DEFINE_SPINLOCK(destroy_lock); static LIST_HEAD(destroy_list); -static void fsnotify_mark_destroy(struct work_struct *work); -static DECLARE_DELAYED_WORK(reaper_work, fsnotify_mark_destroy); +static void fsnotify_mark_destroy_workfn(struct work_struct *work); +static DECLARE_DELAYED_WORK(reaper_work, fsnotify_mark_destroy_workfn); void fsnotify_get_mark(struct fsnotify_mark *mark) { @@ -173,11 +173,15 @@ void fsnotify_detach_mark(struct fsnotify_mark *mark) } /* - * Free fsnotify mark. The freeing is actually happening from a kthread which - * first waits for srcu period end. Caller must have a reference to the mark - * or be protected by fsnotify_mark_srcu. + * Prepare mark for freeing and add it to the list of marks prepared for + * freeing. The actual freeing must happen after SRCU period ends and the + * caller is responsible for this. + * + * The function returns true if the mark was added to the list of marks for + * freeing. The function returns false if someone else has already called + * __fsnotify_free_mark() for the mark. */ -void fsnotify_free_mark(struct fsnotify_mark *mark) +static bool __fsnotify_free_mark(struct fsnotify_mark *mark) { struct fsnotify_group *group = mark->group; @@ -185,17 +189,11 @@ void fsnotify_free_mark(struct fsnotify_mark *mark) /* something else already called this function on this mark */ if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) { spin_unlock(&mark->lock); - return; + return false; } mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE; spin_unlock(&mark->lock); - spin_lock(&destroy_lock); - list_add(&mark->g_list, &destroy_list); - spin_unlock(&destroy_lock); - queue_delayed_work(system_unbound_wq, &reaper_work, - FSNOTIFY_REAPER_DELAY); - /* * Some groups like to know that marks are being freed. This is a * callback to the group function to let it know that this mark @@ -203,6 +201,25 @@ void fsnotify_free_mark(struct fsnotify_mark *mark) */ if (group->ops->freeing_mark) group->ops->freeing_mark(mark, group); + + spin_lock(&destroy_lock); + list_add(&mark->g_list, &destroy_list); + spin_unlock(&destroy_lock); + + return true; +} + +/* + * Free fsnotify mark. The freeing is actually happening from a workqueue which + * first waits for srcu period end. Caller must have a reference to the mark + * or be protected by fsnotify_mark_srcu. + */ +void fsnotify_free_mark(struct fsnotify_mark *mark) +{ + if (__fsnotify_free_mark(mark)) { + queue_delayed_work(system_unbound_wq, &reaper_work, + FSNOTIFY_REAPER_DELAY); + } } void fsnotify_destroy_mark(struct fsnotify_mark *mark, @@ -468,11 +485,29 @@ void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group, } /* - * Given a group, destroy all of the marks associated with that group. + * Given a group, prepare for freeing all the marks associated with that group. + * The marks are attached to the list of marks prepared for destruction, the + * caller is responsible for freeing marks in that list after SRCU period has + * ended. 
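The mark.c rework above splits freeing into two phases: __fsnotify_free_mark() only detaches the mark and parks it on destroy_list, and a reaper later waits out the SRCU grace period before memory is released. The reaper's shape, reduced to its essentials with hypothetical example_* names:

#include <linux/list.h>
#include <linux/slab.h>
#include <linux/srcu.h>
#include <linux/workqueue.h>

struct example_mark {
        struct list_head g_list;
};

static DEFINE_SPINLOCK(example_destroy_lock);
static LIST_HEAD(example_destroy_list);
static struct srcu_struct example_mark_srcu;    /* init_srcu_struct() elsewhere */

static void example_reaper(struct work_struct *work)
{
        struct example_mark *mark, *next;
        LIST_HEAD(private);

        spin_lock(&example_destroy_lock);
        list_splice_init(&example_destroy_list, &private);
        spin_unlock(&example_destroy_lock);

        /* no SRCU read-side critical section can still see these marks */
        synchronize_srcu(&example_mark_srcu);

        list_for_each_entry_safe(mark, next, &private, g_list) {
                list_del_init(&mark->g_list);
                kfree(mark);            /* stands in for the final put */
        }
}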
*/ -void fsnotify_clear_marks_by_group(struct fsnotify_group *group) +void fsnotify_detach_group_marks(struct fsnotify_group *group) { - fsnotify_clear_marks_by_group_flags(group, (unsigned int)-1); + struct fsnotify_mark *mark; + + while (1) { + mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING); + if (list_empty(&group->marks_list)) { + mutex_unlock(&group->mark_mutex); + break; + } + mark = list_first_entry(&group->marks_list, + struct fsnotify_mark, g_list); + fsnotify_get_mark(mark); + fsnotify_detach_mark(mark); + mutex_unlock(&group->mark_mutex); + __fsnotify_free_mark(mark); + fsnotify_put_mark(mark); + } } void fsnotify_duplicate_mark(struct fsnotify_mark *new, struct fsnotify_mark *old) @@ -499,7 +534,11 @@ void fsnotify_init_mark(struct fsnotify_mark *mark, mark->free_mark = free_mark; } -static void fsnotify_mark_destroy(struct work_struct *work) +/* + * Destroy all marks in destroy_list, waits for SRCU period to finish before + * actually freeing marks. + */ +void fsnotify_mark_destroy_list(void) { struct fsnotify_mark *mark, *next; struct list_head private_destroy_list; @@ -516,3 +555,8 @@ static void fsnotify_mark_destroy(struct work_struct *work) fsnotify_put_mark(mark); } } + +static void fsnotify_mark_destroy_workfn(struct work_struct *work) +{ + fsnotify_mark_destroy_list(); +} diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c index 97768a1379f2..fe251f187ff8 100644 --- a/fs/ntfs/aops.c +++ b/fs/ntfs/aops.c @@ -362,7 +362,7 @@ handle_zblock: for (i = 0; i < nr; i++) { tbh = arr[i]; if (likely(!buffer_uptodate(tbh))) - submit_bh(READ, tbh); + submit_bh(REQ_OP_READ, 0, tbh); else ntfs_end_buffer_async_read(tbh, 1); } @@ -877,7 +877,7 @@ lock_retry_remap: do { struct buffer_head *next = bh->b_this_page; if (buffer_async_write(bh)) { - submit_bh(WRITE, bh); + submit_bh(REQ_OP_WRITE, 0, bh); need_end_writeback = false; } bh = next; @@ -1202,7 +1202,7 @@ lock_retry_remap: BUG_ON(!buffer_mapped(tbh)); get_bh(tbh); tbh->b_end_io = end_buffer_write_sync; - submit_bh(WRITE, tbh); + submit_bh(REQ_OP_WRITE, 0, tbh); } /* Synchronize the mft mirror now if not @sync. */ if (is_mft && !sync) diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c index f2b5e746f49b..f8eb04387ca4 100644 --- a/fs/ntfs/compress.c +++ b/fs/ntfs/compress.c @@ -670,7 +670,7 @@ lock_retry_remap: } get_bh(tbh); tbh->b_end_io = end_buffer_read_sync; - submit_bh(READ, tbh); + submit_bh(REQ_OP_READ, 0, tbh); } /* Wait for io completion on all buffer heads. */ diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 5622ed5a201e..f548629dfaac 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -553,7 +553,7 @@ static inline int ntfs_submit_bh_for_read(struct buffer_head *bh) lock_buffer(bh); get_bh(bh); bh->b_end_io = end_buffer_read_sync; - return submit_bh(READ, bh); + return submit_bh(REQ_OP_READ, 0, bh); } /** diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index f40972d6df90..e01287c964a8 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -1854,7 +1854,7 @@ int ntfs_read_inode_mount(struct inode *vi) /* Need this to sanity check attribute list references to $MFT. */ vi->i_generation = ni->seq_no = le16_to_cpu(m->sequence_number); - /* Provides readpage() and sync_page() for map_mft_record(). */ + /* Provides readpage() for map_mft_record(). 
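The ntfs hunks above track the block-layer API change where submit_bh(READ, bh) becomes submit_bh(REQ_OP_READ, 0, bh): the operation and its flags are now separate arguments instead of one overloaded rw word. The synchronous-read idiom under the new signature, essentially what ntfs_submit_bh_for_read() does:

static int example_read_bh(struct buffer_head *bh)
{
        lock_buffer(bh);
        get_bh(bh);                     /* held until the completion runs */
        bh->b_end_io = end_buffer_read_sync;
        return submit_bh(REQ_OP_READ, 0, bh);
}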
*/ vi->i_mapping->a_ops = &ntfs_mst_aops; ctx = ntfs_attr_get_search_ctx(ni, m); diff --git a/fs/ntfs/logfile.c b/fs/ntfs/logfile.c index 9d71213ca81e..761f12f7f3ef 100644 --- a/fs/ntfs/logfile.c +++ b/fs/ntfs/logfile.c @@ -821,7 +821,7 @@ map_vcn: * completed ignore errors afterwards as we can assume * that if one buffer worked all of them will work. */ - submit_bh(WRITE, bh); + submit_bh(REQ_OP_WRITE, 0, bh); if (should_wait) { should_wait = false; wait_on_buffer(bh); diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c index 37b2501caaa4..d15d492ce47b 100644 --- a/fs/ntfs/mft.c +++ b/fs/ntfs/mft.c @@ -592,7 +592,7 @@ int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no, clear_buffer_dirty(tbh); get_bh(tbh); tbh->b_end_io = end_buffer_write_sync; - submit_bh(WRITE, tbh); + submit_bh(REQ_OP_WRITE, 0, tbh); } /* Wait on i/o completion of buffers. */ for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) { @@ -785,7 +785,7 @@ int write_mft_record_nolock(ntfs_inode *ni, MFT_RECORD *m, int sync) clear_buffer_dirty(tbh); get_bh(tbh); tbh->b_end_io = end_buffer_write_sync; - submit_bh(WRITE, tbh); + submit_bh(REQ_OP_WRITE, 0, tbh); } /* Synchronize the mft mirror now if not @sync. */ if (!sync && ni->mft_no < vol->mftmirr_size) diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c index 443abecf01b7..358258364616 100644 --- a/fs/ntfs/namei.c +++ b/fs/ntfs/namei.c @@ -253,7 +253,7 @@ handle_name: err = (signed)nls_name.len; goto err_out; } - nls_name.hash = full_name_hash(nls_name.name, nls_name.len); + nls_name.hash = full_name_hash(dent, nls_name.name, nls_name.len); dent = d_add_ci(dent, dent_inode, &nls_name); kfree(nls_name.name); diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile index e27e6527912b..4342c7ee7d20 100644 --- a/fs/ocfs2/Makefile +++ b/fs/ocfs2/Makefile @@ -1,7 +1,5 @@ ccflags-y := -Ifs/ocfs2 -ccflags-y += -DCATCH_BH_JBD_RACES - obj-$(CONFIG_OCFS2_FS) += \ ocfs2.o \ ocfs2_stackglue.o diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index e361d1a0ca09..460c0cedab3a 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -5351,7 +5351,7 @@ static int ocfs2_truncate_rec(handle_t *handle, { int ret; u32 left_cpos, rec_range, trunc_range; - int wants_rotate = 0, is_rightmost_tree_rec = 0; + int is_rightmost_tree_rec = 0; struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci); struct ocfs2_path *left_path = NULL; struct ocfs2_extent_list *el = path_leaf_el(path); @@ -5457,7 +5457,6 @@ static int ocfs2_truncate_rec(handle_t *handle, memset(rec, 0, sizeof(*rec)); ocfs2_cleanup_merge(el, index); - wants_rotate = 1; next_free = le16_to_cpu(el->l_next_free_rec); if (is_rightmost_tree_rec && next_free > 1) { diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index c034edf3ef38..af2adfcb0f6f 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -640,7 +640,7 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, !buffer_new(bh) && ocfs2_should_read_blk(inode, page, block_start) && (block_start < from || block_end > to)) { - ll_rw_block(READ, 1, &bh); + ll_rw_block(REQ_OP_READ, 0, 1, &bh); *wait_bh++=bh; } @@ -2426,7 +2426,7 @@ static int ocfs2_dio_end_io(struct kiocb *iocb, static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; - struct inode *inode = file_inode(file)->i_mapping->host; + struct inode *inode = file->f_mapping->host; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); get_block_t *get_block; diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c index fe50ded1b4ce..8f040f88ade4 100644 --- 
a/fs/ocfs2/buffer_head_io.c +++ b/fs/ocfs2/buffer_head_io.c @@ -79,7 +79,7 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh, get_bh(bh); /* for end_buffer_write_sync() */ bh->b_end_io = end_buffer_write_sync; - submit_bh(WRITE, bh); + submit_bh(REQ_OP_WRITE, 0, bh); wait_on_buffer(bh); @@ -139,17 +139,22 @@ int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block, lock_buffer(bh); if (buffer_jbd(bh)) { +#ifdef CATCH_BH_JBD_RACES mlog(ML_ERROR, "block %llu had the JBD bit set " "while I was in lock_buffer!", (unsigned long long)bh->b_blocknr); BUG(); +#else + unlock_buffer(bh); + continue; +#endif } clear_buffer_uptodate(bh); get_bh(bh); /* for end_buffer_read_sync() */ bh->b_end_io = end_buffer_read_sync; - submit_bh(READ, bh); + submit_bh(REQ_OP_READ, 0, bh); } for (i = nr; i > 0; i--) { @@ -305,7 +310,7 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr, if (validate) set_buffer_needs_validate(bh); bh->b_end_io = end_buffer_read_sync; - submit_bh(READ, bh); + submit_bh(REQ_OP_READ, 0, bh); continue; } } @@ -419,7 +424,7 @@ int ocfs2_write_super_or_backup(struct ocfs2_super *osb, get_bh(bh); /* for end_buffer_write_sync() */ bh->b_end_io = end_buffer_write_sync; ocfs2_compute_meta_ecc(osb->sb, bh->b_data, &di->i_check); - submit_bh(WRITE, bh); + submit_bh(REQ_OP_WRITE, 0, bh); wait_on_buffer(bh); diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 1934abb6b680..636abcbd4650 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -272,10 +272,21 @@ struct o2hb_region { struct delayed_work hr_write_timeout_work; unsigned long hr_last_timeout_start; + /* negotiate timer, used to negotiate extending hb timeout. */ + struct delayed_work hr_nego_timeout_work; + unsigned long hr_nego_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; + /* Used during o2hb_check_slot to hold a copy of the block * being checked because we temporarily have to zero out the * crc field. */ struct o2hb_disk_heartbeat_block *hr_tmp_block; + + /* Message key for negotiate timeout message. */ + unsigned int hr_key; + struct list_head hr_handler_list; + + /* last hb status, 0 for success, other value for error. */ + int hr_last_hb_status; }; struct o2hb_bio_wait_ctxt { @@ -284,6 +295,17 @@ struct o2hb_bio_wait_ctxt { int wc_error; }; +#define O2HB_NEGO_TIMEOUT_MS (O2HB_MAX_WRITE_TIMEOUT_MS/2) + +enum { + O2HB_NEGO_TIMEOUT_MSG = 1, + O2HB_NEGO_APPROVE_MSG = 2, +}; + +struct o2hb_nego_msg { + u8 node_num; +}; + static void o2hb_write_timeout(struct work_struct *work) { int failed, quorum; @@ -319,7 +341,7 @@ static void o2hb_write_timeout(struct work_struct *work) o2quo_disk_timeout(); } -static void o2hb_arm_write_timeout(struct o2hb_region *reg) +static void o2hb_arm_timeout(struct o2hb_region *reg) { /* Arm writeout only after thread reaches steady state */ if (atomic_read(®->hr_steady_iterations) != 0) @@ -334,14 +356,132 @@ static void o2hb_arm_write_timeout(struct o2hb_region *reg) spin_unlock(&o2hb_live_lock); } cancel_delayed_work(®->hr_write_timeout_work); - reg->hr_last_timeout_start = jiffies; schedule_delayed_work(®->hr_write_timeout_work, msecs_to_jiffies(O2HB_MAX_WRITE_TIMEOUT_MS)); + + cancel_delayed_work(®->hr_nego_timeout_work); + /* negotiate timeout must be less than write timeout. 
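full_name_hash() gains a salt argument, visible in the ntfs namei.c hunk above and the dlmcommon.h hunk further down: hashes are now mixed with the parent dentry pointer, or NULL where no parent is meaningful. A sketch of the lookup-side use, with parent assumed to be the directory dentry:

static struct dentry *example_lookup_one(struct dentry *parent,
                                         const char *name, unsigned int len)
{
        struct qstr q = QSTR_INIT(name, len);

        /*
         * salting with the parent makes equal names in different
         * directories hash differently
         */
        q.hash = full_name_hash(parent, q.name, q.len);
        return d_lookup(parent, &q);
}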
*/ + schedule_delayed_work(®->hr_nego_timeout_work, + msecs_to_jiffies(O2HB_NEGO_TIMEOUT_MS)); + memset(reg->hr_nego_node_bitmap, 0, sizeof(reg->hr_nego_node_bitmap)); } -static void o2hb_disarm_write_timeout(struct o2hb_region *reg) +static void o2hb_disarm_timeout(struct o2hb_region *reg) { cancel_delayed_work_sync(®->hr_write_timeout_work); + cancel_delayed_work_sync(®->hr_nego_timeout_work); +} + +static int o2hb_send_nego_msg(int key, int type, u8 target) +{ + struct o2hb_nego_msg msg; + int status, ret; + + msg.node_num = o2nm_this_node(); +again: + ret = o2net_send_message(type, key, &msg, sizeof(msg), + target, &status); + + if (ret == -EAGAIN || ret == -ENOMEM) { + msleep(100); + goto again; + } + + return ret; +} + +static void o2hb_nego_timeout(struct work_struct *work) +{ + unsigned long live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; + int master_node, i, ret; + struct o2hb_region *reg; + + reg = container_of(work, struct o2hb_region, hr_nego_timeout_work.work); + /* don't negotiate timeout if last hb failed since it is very + * possible io failed. Should let write timeout fence self. + */ + if (reg->hr_last_hb_status) + return; + + o2hb_fill_node_map(live_node_bitmap, sizeof(live_node_bitmap)); + /* lowest node as master node to make negotiate decision. */ + master_node = find_next_bit(live_node_bitmap, O2NM_MAX_NODES, 0); + + if (master_node == o2nm_this_node()) { + if (!test_bit(master_node, reg->hr_nego_node_bitmap)) { + printk(KERN_NOTICE "o2hb: node %d hb write hung for %ds on region %s (%s).\n", + o2nm_this_node(), O2HB_NEGO_TIMEOUT_MS/1000, + config_item_name(®->hr_item), reg->hr_dev_name); + set_bit(master_node, reg->hr_nego_node_bitmap); + } + if (memcmp(reg->hr_nego_node_bitmap, live_node_bitmap, + sizeof(reg->hr_nego_node_bitmap))) { + /* check negotiate bitmap every second to do timeout + * approve decision. + */ + schedule_delayed_work(®->hr_nego_timeout_work, + msecs_to_jiffies(1000)); + + return; + } + + printk(KERN_NOTICE "o2hb: all nodes hb write hung, maybe region %s (%s) is down.\n", + config_item_name(®->hr_item), reg->hr_dev_name); + /* approve negotiate timeout request. */ + o2hb_arm_timeout(reg); + + i = -1; + while ((i = find_next_bit(live_node_bitmap, + O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) { + if (i == master_node) + continue; + + mlog(ML_HEARTBEAT, "send NEGO_APPROVE msg to node %d\n", i); + ret = o2hb_send_nego_msg(reg->hr_key, + O2HB_NEGO_APPROVE_MSG, i); + if (ret) + mlog(ML_ERROR, "send NEGO_APPROVE msg to node %d fail %d\n", + i, ret); + } + } else { + /* negotiate timeout with master node. 
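The negotiation logic above elects the lowest live node number as arbiter and only extends the write timeout once every live node has reported a hung write. The decision, condensed to the bitmap operations it rests on (fragment, names from the hunks above):

        unsigned long live[BITS_TO_LONGS(O2NM_MAX_NODES)];
        int master;

        o2hb_fill_node_map(live, sizeof(live));
        master = find_next_bit(live, O2NM_MAX_NODES, 0); /* lowest live node */

        if (master == o2nm_this_node() &&
            !memcmp(reg->hr_nego_node_bitmap, live, sizeof(live)))
                o2hb_arm_timeout(reg);  /* every live node is hung: extend */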
*/ + printk(KERN_NOTICE "o2hb: node %d hb write hung for %ds on region %s (%s), negotiate timeout with node %d.\n", + o2nm_this_node(), O2HB_NEGO_TIMEOUT_MS/1000, config_item_name(®->hr_item), + reg->hr_dev_name, master_node); + ret = o2hb_send_nego_msg(reg->hr_key, O2HB_NEGO_TIMEOUT_MSG, + master_node); + if (ret) + mlog(ML_ERROR, "send NEGO_TIMEOUT msg to node %d fail %d\n", + master_node, ret); + } +} + +static int o2hb_nego_timeout_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) +{ + struct o2hb_region *reg = data; + struct o2hb_nego_msg *nego_msg; + + nego_msg = (struct o2hb_nego_msg *)msg->buf; + printk(KERN_NOTICE "o2hb: receive negotiate timeout message from node %d on region %s (%s).\n", + nego_msg->node_num, config_item_name(®->hr_item), reg->hr_dev_name); + if (nego_msg->node_num < O2NM_MAX_NODES) + set_bit(nego_msg->node_num, reg->hr_nego_node_bitmap); + else + mlog(ML_ERROR, "got nego timeout message from bad node.\n"); + + return 0; +} + +static int o2hb_nego_approve_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) +{ + struct o2hb_region *reg = data; + + printk(KERN_NOTICE "o2hb: negotiate timeout approved by master node on region %s (%s).\n", + config_item_name(®->hr_item), reg->hr_dev_name); + o2hb_arm_timeout(reg); + return 0; } static inline void o2hb_bio_wait_init(struct o2hb_bio_wait_ctxt *wc) @@ -390,7 +530,8 @@ static void o2hb_bio_end_io(struct bio *bio) static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg, struct o2hb_bio_wait_ctxt *wc, unsigned int *current_slot, - unsigned int max_slots) + unsigned int max_slots, int op, + int op_flags) { int len, current_page; unsigned int vec_len, vec_start; @@ -416,6 +557,7 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg, bio->bi_bdev = reg->hr_bdev; bio->bi_private = wc; bio->bi_end_io = o2hb_bio_end_io; + bio_set_op_attrs(bio, op, op_flags); vec_start = (cs << bits) % PAGE_SIZE; while(cs < max_slots) { @@ -451,7 +593,8 @@ static int o2hb_read_slots(struct o2hb_region *reg, o2hb_bio_wait_init(&wc); while(current_slot < max_slots) { - bio = o2hb_setup_one_bio(reg, &wc, ¤t_slot, max_slots); + bio = o2hb_setup_one_bio(reg, &wc, ¤t_slot, max_slots, + REQ_OP_READ, 0); if (IS_ERR(bio)) { status = PTR_ERR(bio); mlog_errno(status); @@ -459,7 +602,7 @@ static int o2hb_read_slots(struct o2hb_region *reg, } atomic_inc(&wc.wc_num_reqs); - submit_bio(READ, bio); + submit_bio(bio); } status = 0; @@ -483,7 +626,8 @@ static int o2hb_issue_node_write(struct o2hb_region *reg, slot = o2nm_this_node(); - bio = o2hb_setup_one_bio(reg, write_wc, &slot, slot+1); + bio = o2hb_setup_one_bio(reg, write_wc, &slot, slot+1, REQ_OP_WRITE, + WRITE_SYNC); if (IS_ERR(bio)) { status = PTR_ERR(bio); mlog_errno(status); @@ -491,7 +635,7 @@ static int o2hb_issue_node_write(struct o2hb_region *reg, } atomic_inc(&write_wc->wc_num_reqs); - submit_bio(WRITE_SYNC, bio); + submit_bio(bio); status = 0; bail: @@ -1032,7 +1176,8 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) /* Skip disarming the timeout if own slot has stale/bad data */ if (own_slot_ok) { o2hb_set_quorum_device(reg); - o2hb_arm_write_timeout(reg); + o2hb_arm_timeout(reg); + reg->hr_last_timeout_start = jiffies; } bail: @@ -1096,6 +1241,7 @@ static int o2hb_thread(void *data) before_hb = ktime_get_real(); ret = o2hb_do_disk_heartbeat(reg); + reg->hr_last_hb_status = ret; after_hb = ktime_get_real(); @@ -1114,7 +1260,7 @@ static int o2hb_thread(void *data) } } - o2hb_disarm_write_timeout(reg); + o2hb_disarm_timeout(reg); 
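The heartbeat I/O hunks reflect the same block-layer transition as the ntfs ones: the operation now travels inside the bio via bio_set_op_attrs(), and submit_bio() loses its rw argument. A minimal write sketch under the new interface (error handling trimmed, hypothetical callback done):

static void example_write_page(struct block_device *bdev, sector_t sector,
                               struct page *page, bio_end_io_t *done)
{
        struct bio *bio = bio_alloc(GFP_NOIO, 1);

        if (!bio)
                return;                 /* real code would report -ENOMEM */

        bio->bi_bdev = bdev;
        bio->bi_iter.bi_sector = sector;
        bio->bi_end_io = done;
        bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_SYNC);
        bio_add_page(bio, page, PAGE_SIZE, 0);
        submit_bio(bio);                /* no rw argument: the op rides in the bio */
}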
/* unclean stop is only used in very bad situation */ for(i = 0; !reg->hr_unclean_stop && i < reg->hr_blocks; i++) @@ -1451,12 +1597,12 @@ static void o2hb_region_release(struct config_item *item) list_del(®->hr_all_item); spin_unlock(&o2hb_live_lock); + o2net_unregister_handler_list(®->hr_handler_list); kfree(reg); } static int o2hb_read_block_input(struct o2hb_region *reg, const char *page, - size_t count, unsigned long *ret_bytes, unsigned int *ret_bits) { @@ -1499,8 +1645,8 @@ static ssize_t o2hb_region_block_bytes_store(struct config_item *item, if (reg->hr_bdev) return -EINVAL; - status = o2hb_read_block_input(reg, page, count, - &block_bytes, &block_bits); + status = o2hb_read_block_input(reg, page, &block_bytes, + &block_bits); if (status) return status; @@ -1763,6 +1909,7 @@ static ssize_t o2hb_region_dev_store(struct config_item *item, } INIT_DELAYED_WORK(®->hr_write_timeout_work, o2hb_write_timeout); + INIT_DELAYED_WORK(®->hr_nego_timeout_work, o2hb_nego_timeout); /* * A node is considered live after it has beat LIVE_THRESHOLD @@ -1996,13 +2143,37 @@ static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *g config_item_init_type_name(®->hr_item, name, &o2hb_region_type); + /* this is the same way to generate msg key as dlm, for local heartbeat, + * name is also the same, so make initial crc value different to avoid + * message key conflict. + */ + reg->hr_key = crc32_le(reg->hr_region_num + O2NM_MAX_REGIONS, + name, strlen(name)); + INIT_LIST_HEAD(®->hr_handler_list); + ret = o2net_register_handler(O2HB_NEGO_TIMEOUT_MSG, reg->hr_key, + sizeof(struct o2hb_nego_msg), + o2hb_nego_timeout_handler, + reg, NULL, ®->hr_handler_list); + if (ret) + goto free; + + ret = o2net_register_handler(O2HB_NEGO_APPROVE_MSG, reg->hr_key, + sizeof(struct o2hb_nego_msg), + o2hb_nego_approve_handler, + reg, NULL, ®->hr_handler_list); + if (ret) + goto unregister_handler; + ret = o2hb_debug_region_init(reg, o2hb_debug_dir); if (ret) { config_item_put(®->hr_item); - goto free; + goto unregister_handler; } return ®->hr_item; + +unregister_handler: + o2net_unregister_handler_list(®->hr_handler_list); free: kfree(reg); return ERR_PTR(ret); diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 2d0acd6678fe..1d67fcbf7160 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -600,10 +600,11 @@ static void o2net_set_nn_state(struct o2net_node *nn, static void o2net_data_ready(struct sock *sk) { void (*ready)(struct sock *sk); + struct o2net_sock_container *sc; - read_lock(&sk->sk_callback_lock); - if (sk->sk_user_data) { - struct o2net_sock_container *sc = sk->sk_user_data; + read_lock_bh(&sk->sk_callback_lock); + sc = sk->sk_user_data; + if (sc) { sclog(sc, "data_ready hit\n"); o2net_set_data_ready_time(sc); o2net_sc_queue_work(sc, &sc->sc_rx_work); @@ -611,7 +612,7 @@ static void o2net_data_ready(struct sock *sk) } else { ready = sk->sk_data_ready; } - read_unlock(&sk->sk_callback_lock); + read_unlock_bh(&sk->sk_callback_lock); ready(sk); } @@ -622,7 +623,7 @@ static void o2net_state_change(struct sock *sk) void (*state_change)(struct sock *sk); struct o2net_sock_container *sc; - read_lock(&sk->sk_callback_lock); + read_lock_bh(&sk->sk_callback_lock); sc = sk->sk_user_data; if (sc == NULL) { state_change = sk->sk_state_change; @@ -649,7 +650,7 @@ static void o2net_state_change(struct sock *sk) break; } out: - read_unlock(&sk->sk_callback_lock); + read_unlock_bh(&sk->sk_callback_lock); state_change(sk); } @@ -1617,16 +1618,12 @@ static void 
o2net_start_connect(struct work_struct *work) /* watch for racing with tearing a node down */ node = o2nm_get_node_by_num(o2net_num_from_nn(nn)); - if (node == NULL) { - ret = 0; + if (node == NULL) goto out; - } mynode = o2nm_get_node_by_num(o2nm_this_node()); - if (mynode == NULL) { - ret = 0; + if (mynode == NULL) goto out; - } spin_lock(&nn->nn_lock); /* @@ -2012,7 +2009,7 @@ static void o2net_listen_data_ready(struct sock *sk) { void (*ready)(struct sock *sk); - read_lock(&sk->sk_callback_lock); + read_lock_bh(&sk->sk_callback_lock); ready = sk->sk_user_data; if (ready == NULL) { /* check for teardown race */ ready = sk->sk_data_ready; @@ -2039,7 +2036,7 @@ static void o2net_listen_data_ready(struct sock *sk) } out: - read_unlock(&sk->sk_callback_lock); + read_unlock_bh(&sk->sk_callback_lock); if (ready != NULL) ready(sk); } diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h index b95e7df5b76a..94b18369b1cc 100644 --- a/fs/ocfs2/cluster/tcp_internal.h +++ b/fs/ocfs2/cluster/tcp_internal.h @@ -44,6 +44,9 @@ * version here in tcp_internal.h should not need to be bumped for * filesystem locking changes. * + * New in version 12 + * - Negotiate hb timeout when storage is down. + * * New in version 11 * - Negotiation of filesystem locking in the dlm join. * @@ -75,7 +78,7 @@ * - full 64 bit i_size in the metadata lock lvbs * - introduction of "rw" lock and pushing meta/data locking down */ -#define O2NET_PROTOCOL_VERSION 11ULL +#define O2NET_PROTOCOL_VERSION 12ULL struct o2net_handshake { __be64 protocol_version; __be64 connector_id; diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index 004f2cbe8f71..8107d0d0c3f6 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -47,7 +47,7 @@ #define DLM_HASH_BUCKETS (DLM_HASH_PAGES * DLM_BUCKETS_PER_PAGE) /* Intended to make it easier for us to switch out hash functions */ -#define dlm_lockid_hash(_n, _l) full_name_hash(_n, _l) +#define dlm_lockid_hash(_n, _l) full_name_hash(NULL, _n, _l) enum dlm_mle_type { DLM_MLE_BLOCK = 0, diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index 825136070d2c..e7b760deefae 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c @@ -347,26 +347,6 @@ static struct dentry *dlm_debugfs_root; #define DLM_DEBUGFS_PURGE_LIST "purge_list" /* begin - utils funcs */ -static void dlm_debug_free(struct kref *kref) -{ - struct dlm_debug_ctxt *dc; - - dc = container_of(kref, struct dlm_debug_ctxt, debug_refcnt); - - kfree(dc); -} - -static void dlm_debug_put(struct dlm_debug_ctxt *dc) -{ - if (dc) - kref_put(&dc->debug_refcnt, dlm_debug_free); -} - -static void dlm_debug_get(struct dlm_debug_ctxt *dc) -{ - kref_get(&dc->debug_refcnt); -} - static int debug_release(struct inode *inode, struct file *file) { free_page((unsigned long)file->private_data); @@ -932,11 +912,9 @@ int dlm_debug_init(struct dlm_ctxt *dlm) goto bail; } - dlm_debug_get(dc); return 0; bail: - dlm_debug_shutdown(dlm); return -ENOMEM; } @@ -949,7 +927,8 @@ void dlm_debug_shutdown(struct dlm_ctxt *dlm) debugfs_remove(dc->debug_mle_dentry); debugfs_remove(dc->debug_lockres_dentry); debugfs_remove(dc->debug_state_dentry); - dlm_debug_put(dc); + kfree(dc); + dc = NULL; } } @@ -969,7 +948,6 @@ int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm) mlog_errno(-ENOMEM); goto bail; } - kref_init(&dlm->dlm_debug_ctxt->debug_refcnt); return 0; bail: diff --git a/fs/ocfs2/dlm/dlmdebug.h b/fs/ocfs2/dlm/dlmdebug.h index 1f27c4812d1a..5ced5482e7d3 100644 --- a/fs/ocfs2/dlm/dlmdebug.h +++ 
b/fs/ocfs2/dlm/dlmdebug.h @@ -30,7 +30,6 @@ void dlm_print_one_mle(struct dlm_master_list_entry *mle); #ifdef CONFIG_DEBUG_FS struct dlm_debug_ctxt { - struct kref debug_refcnt; struct dentry *debug_state_dentry; struct dentry *debug_lockres_dentry; struct dentry *debug_mle_dentry; diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 1eaa9100c889..83d576f6a287 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -1635,7 +1635,6 @@ int ocfs2_create_new_inode_locks(struct inode *inode) int ret; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - BUG_ON(!inode); BUG_ON(!ocfs2_inode_is_new(inode)); mlog(0, "Inode %llu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno); @@ -1665,10 +1664,8 @@ int ocfs2_create_new_inode_locks(struct inode *inode) } ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_open_lockres, 0, 0); - if (ret) { + if (ret) mlog_errno(ret); - goto bail; - } bail: return ret; @@ -1680,8 +1677,6 @@ int ocfs2_rw_lock(struct inode *inode, int write) struct ocfs2_lock_res *lockres; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - BUG_ON(!inode); - mlog(0, "inode %llu take %s RW lock\n", (unsigned long long)OCFS2_I(inode)->ip_blkno, write ? "EXMODE" : "PRMODE"); @@ -1724,8 +1719,6 @@ int ocfs2_open_lock(struct inode *inode) struct ocfs2_lock_res *lockres; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - BUG_ON(!inode); - mlog(0, "inode %llu take PRMODE open lock\n", (unsigned long long)OCFS2_I(inode)->ip_blkno); @@ -1749,8 +1742,6 @@ int ocfs2_try_open_lock(struct inode *inode, int write) struct ocfs2_lock_res *lockres; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - BUG_ON(!inode); - mlog(0, "inode %llu try to take %s open lock\n", (unsigned long long)OCFS2_I(inode)->ip_blkno, write ? "EXMODE" : "PRMODE"); @@ -2328,8 +2319,6 @@ int ocfs2_inode_lock_full_nested(struct inode *inode, struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct buffer_head *local_bh = NULL; - BUG_ON(!inode); - mlog(0, "inode %llu, take %s META lock\n", (unsigned long long)OCFS2_I(inode)->ip_blkno, ex ? 
"EXMODE" : "PRMODE"); diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 0748777f2e2a..c56a7679df93 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -176,12 +176,7 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags, } if (is_bad_inode(inode)) { iput(inode); - if ((flags & OCFS2_FI_FLAG_FILECHECK_CHK) || - (flags & OCFS2_FI_FLAG_FILECHECK_FIX)) - /* Return OCFS2_FILECHECK_ERR_XXX related errno */ - inode = ERR_PTR(rc); - else - inode = ERR_PTR(-ESTALE); + inode = ERR_PTR(rc); goto bail; } diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index d8f3fc8d2551..50cc55047443 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h @@ -145,22 +145,15 @@ int ocfs2_drop_inode(struct inode *inode); struct inode *ocfs2_ilookup(struct super_block *sb, u64 feoff); struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, unsigned flags, int sysfile_type); -int ocfs2_inode_init_private(struct inode *inode); int ocfs2_inode_revalidate(struct dentry *dentry); void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, int create_ino); -void ocfs2_read_inode(struct inode *inode); -void ocfs2_read_inode2(struct inode *inode, void *opaque); -ssize_t ocfs2_rw_direct(int rw, struct file *filp, char *buf, - size_t size, loff_t *offp); void ocfs2_sync_blockdev(struct super_block *sb); void ocfs2_refresh_inode(struct inode *inode, struct ocfs2_dinode *fe); int ocfs2_mark_inode_dirty(handle_t *handle, struct inode *inode, struct buffer_head *bh); -struct buffer_head *ocfs2_bread(struct inode *inode, - int block, int *err, int reada); void ocfs2_set_inode_flags(struct inode *inode); void ocfs2_get_inode_flags(struct ocfs2_inode_info *oi); diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index e607419cdfa4..a244f14c6b87 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -1159,10 +1159,8 @@ static int ocfs2_force_read_journal(struct inode *inode) int status = 0; int i; u64 v_blkno, p_blkno, p_blocks, num_blocks; -#define CONCURRENT_JOURNAL_FILL 32ULL - struct buffer_head *bhs[CONCURRENT_JOURNAL_FILL]; - - memset(bhs, 0, sizeof(struct buffer_head *) * CONCURRENT_JOURNAL_FILL); + struct buffer_head *bh = NULL; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); num_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); v_blkno = 0; @@ -1174,29 +1172,32 @@ static int ocfs2_force_read_journal(struct inode *inode) goto bail; } - if (p_blocks > CONCURRENT_JOURNAL_FILL) - p_blocks = CONCURRENT_JOURNAL_FILL; - - /* We are reading journal data which should not - * be put in the uptodate cache */ - status = ocfs2_read_blocks_sync(OCFS2_SB(inode->i_sb), - p_blkno, p_blocks, bhs); - if (status < 0) { - mlog_errno(status); - goto bail; - } + for (i = 0; i < p_blocks; i++, p_blkno++) { + bh = __find_get_block(osb->sb->s_bdev, p_blkno, + osb->sb->s_blocksize); + /* block not cached. */ + if (!bh) + continue; + + brelse(bh); + bh = NULL; + /* We are reading journal data which should not + * be put in the uptodate cache. 
+ */ + status = ocfs2_read_blocks_sync(osb, p_blkno, 1, &bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } - for(i = 0; i < p_blocks; i++) { - brelse(bhs[i]); - bhs[i] = NULL; + brelse(bh); + bh = NULL; } v_blkno += p_blocks; } bail: - for(i = 0; i < CONCURRENT_JOURNAL_FILL; i++) - brelse(bhs[i]); return status; } diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index f4cd3c3e9fb7..497a4171ef61 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -619,7 +619,7 @@ static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb, static inline int ocfs2_jbd2_file_inode(handle_t *handle, struct inode *inode) { - return jbd2_journal_file_inode(handle, &OCFS2_I(inode)->ip_jinode); + return jbd2_journal_inode_add_write(handle, &OCFS2_I(inode)->ip_jinode); } static inline int ocfs2_begin_ordered_truncate(struct inode *inode, diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index 540ab5b75dbb..44d178b8d1aa 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -580,7 +580,7 @@ struct ocfs2_extended_slot { /*00*/ __u8 es_valid; __u8 es_reserved1[3]; __le32 es_node_num; -/*10*/ +/*08*/ }; /* diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c index 1e09592148ad..d7407994f308 100644 --- a/fs/ocfs2/slot_map.c +++ b/fs/ocfs2/slot_map.c @@ -535,12 +535,8 @@ void ocfs2_put_slot(struct ocfs2_super *osb) spin_unlock(&osb->osb_lock); status = ocfs2_update_disk_slot(osb, si, slot_num); - if (status < 0) { + if (status < 0) mlog_errno(status); - goto bail; - } -bail: ocfs2_free_slot_info(osb); } - diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index 13219ed73e1d..52c07346bea3 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c @@ -735,8 +735,6 @@ static void __exit ocfs2_stack_glue_exit(void) { memset(&locking_max_version, 0, sizeof(struct ocfs2_protocol_version)); - locking_max_version.pv_major = 0; - locking_max_version.pv_minor = 0; ocfs2_sysfs_exit(); if (ocfs2_table_header) unregister_sysctl_table(ocfs2_table_header); diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index d7cae3327de5..603b28d6f008 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1819,7 +1819,7 @@ static int ocfs2_get_sector(struct super_block *sb, if (!buffer_dirty(*bh)) clear_buffer_uptodate(*bh); unlock_buffer(*bh); - ll_rw_block(READ, 1, bh); + ll_rw_block(REQ_OP_READ, 0, 1, bh); wait_on_buffer(*bh); if (!buffer_uptodate(*bh)) { mlog_errno(-EIO); @@ -2072,7 +2072,6 @@ static int ocfs2_initialize_super(struct super_block *sb, osb->osb_dx_seed[3] = le32_to_cpu(di->id2.i_super.s_uuid_hash); osb->sb = sb; - /* Save off for ocfs2_rw_direct */ osb->s_sectsize_bits = blksize_bits(sector_size); BUG_ON(!osb->s_sectsize_bits); diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index ad16995c9e7a..d2053853951e 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -7254,10 +7254,11 @@ static int ocfs2_xattr_security_get(const struct xattr_handler *handler, } static int ocfs2_xattr_security_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { - return ocfs2_xattr_set(d_inode(dentry), OCFS2_XATTR_INDEX_SECURITY, + return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_SECURITY, name, value, size, flags); } @@ -7325,10 +7326,11 @@ static int ocfs2_xattr_trusted_get(const struct xattr_handler *handler, } static int ocfs2_xattr_trusted_set(const struct xattr_handler *handler, - struct dentry 
*dentry, const char *name, - const void *value, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { - return ocfs2_xattr_set(d_inode(dentry), OCFS2_XATTR_INDEX_TRUSTED, + return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_TRUSTED, name, value, size, flags); } @@ -7354,15 +7356,16 @@ static int ocfs2_xattr_user_get(const struct xattr_handler *handler, } static int ocfs2_xattr_user_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { - struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR) return -EOPNOTSUPP; - return ocfs2_xattr_set(d_inode(dentry), OCFS2_XATTR_INDEX_USER, + return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_USER, name, value, size, flags); } diff --git a/fs/open.c b/fs/open.c index 93ae3cdee4ab..bf66cf1a9f5c 100644 --- a/fs/open.c +++ b/fs/open.c @@ -840,13 +840,13 @@ EXPORT_SYMBOL(file_path); int vfs_open(const struct path *path, struct file *file, const struct cred *cred) { - struct inode *inode = vfs_select_inode(path->dentry, file->f_flags); + struct dentry *dentry = d_real(path->dentry, NULL, file->f_flags); - if (IS_ERR(inode)) - return PTR_ERR(inode); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); file->f_path = *path; - return do_dentry_open(file, inode, NULL, cred); + return do_dentry_open(file, d_backing_inode(dentry), NULL, cred); } struct file *dentry_open(const struct path *path, int flags, diff --git a/fs/orangefs/acl.c b/fs/orangefs/acl.c index 03f89dbb2512..28f2195cd798 100644 --- a/fs/orangefs/acl.c +++ b/fs/orangefs/acl.c @@ -18,10 +18,10 @@ struct posix_acl *orangefs_get_acl(struct inode *inode, int type) switch (type) { case ACL_TYPE_ACCESS: - key = ORANGEFS_XATTR_NAME_ACL_ACCESS; + key = XATTR_NAME_POSIX_ACL_ACCESS; break; case ACL_TYPE_DEFAULT: - key = ORANGEFS_XATTR_NAME_ACL_DEFAULT; + key = XATTR_NAME_POSIX_ACL_DEFAULT; break; default: gossip_err("orangefs_get_acl: bogus value of type %d\n", type); @@ -43,11 +43,8 @@ struct posix_acl *orangefs_get_acl(struct inode *inode, int type) get_khandle_from_ino(inode), key, type); - ret = orangefs_inode_getxattr(inode, - "", - key, - value, - ORANGEFS_MAX_XATTR_VALUELEN); + ret = orangefs_inode_getxattr(inode, key, value, + ORANGEFS_MAX_XATTR_VALUELEN); /* if the key exists, convert it to an in-memory rep */ if (ret > 0) { acl = posix_acl_from_xattr(&init_user_ns, value, ret); @@ -74,7 +71,7 @@ int orangefs_set_acl(struct inode *inode, struct posix_acl *acl, int type) switch (type) { case ACL_TYPE_ACCESS: - name = ORANGEFS_XATTR_NAME_ACL_ACCESS; + name = XATTR_NAME_POSIX_ACL_ACCESS; if (acl) { umode_t mode = inode->i_mode; /* @@ -98,7 +95,7 @@ int orangefs_set_acl(struct inode *inode, struct posix_acl *acl, int type) } break; case ACL_TYPE_DEFAULT: - name = ORANGEFS_XATTR_NAME_ACL_DEFAULT; + name = XATTR_NAME_POSIX_ACL_DEFAULT; break; default: gossip_err("%s: invalid type %d!\n", __func__, type); @@ -131,7 +128,7 @@ int orangefs_set_acl(struct inode *inode, struct posix_acl *acl, int type) * will xlate to a removexattr. However, we don't want removexattr * complain if attributes does not exist. 
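The ocfs2 xattr.c hunks above follow the VFS-wide change to xattr_handler->set: the inode is now passed explicitly, and the dentry may be unusable for resolving it since stacking filesystems can hand in a detached one. The new handler shape, with example_setxattr() as a hypothetical backend:

static int example_xattr_user_set(const struct xattr_handler *handler,
                                  struct dentry *unused, struct inode *inode,
                                  const char *name, const void *value,
                                  size_t size, int flags)
{
        /* operate on the inode argument, never on d_inode(unused) */
        return example_setxattr(inode, name, value, size, flags);
}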
*/ - error = orangefs_inode_setxattr(inode, "", name, value, size, 0); + error = orangefs_inode_setxattr(inode, name, value, size, 0); out: kfree(value); diff --git a/fs/orangefs/devorangefs-req.c b/fs/orangefs/devorangefs-req.c index db170beba797..a287a66d94e3 100644 --- a/fs/orangefs/devorangefs-req.c +++ b/fs/orangefs/devorangefs-req.c @@ -116,6 +116,13 @@ static int orangefs_devreq_open(struct inode *inode, struct file *file) { int ret = -EINVAL; + /* in order to ensure that the filesystem driver sees correct UIDs */ + if (file->f_cred->user_ns != &init_user_ns) { + gossip_err("%s: device cannot be opened outside init_user_ns\n", + __func__); + goto out; + } + if (!(file->f_flags & O_NONBLOCK)) { gossip_err("%s: device cannot be opened in blocking mode\n", __func__); diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c index 491e82c6f705..526040e09f78 100644 --- a/fs/orangefs/file.c +++ b/fs/orangefs/file.c @@ -516,7 +516,6 @@ static long orangefs_ioctl(struct file *file, unsigned int cmd, unsigned long ar if (cmd == FS_IOC_GETFLAGS) { val = 0; ret = orangefs_inode_getxattr(file_inode(file), - ORANGEFS_XATTR_NAME_DEFAULT_PREFIX, "user.pvfs2.meta_hint", &val, sizeof(val)); if (ret < 0 && ret != -ENODATA) @@ -549,7 +548,6 @@ static long orangefs_ioctl(struct file *file, unsigned int cmd, unsigned long ar "orangefs_ioctl: FS_IOC_SETFLAGS: %llu\n", (unsigned long long)val); ret = orangefs_inode_setxattr(file_inode(file), - ORANGEFS_XATTR_NAME_DEFAULT_PREFIX, "user.pvfs2.meta_hint", &val, sizeof(val), 0); } diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index 85640e955cde..2e63e6d0a68e 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -80,7 +80,7 @@ static int orangefs_readpages(struct file *file, if (!add_to_page_cache(page, mapping, page->index, - GFP_KERNEL)) { + readahead_gfp_mask(mapping))) { ret = read_one_page(page); gossip_debug(GOSSIP_INODE_DEBUG, "failure adding page to cache, read_one_page returned: %d\n", @@ -124,19 +124,16 @@ static int orangefs_releasepage(struct page *page, gfp_t foo) * will need to be able to use O_DIRECT on open in order to support * AIO. Modeled after NFS, they do this too. 
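The devorangefs-req.c hunk above refuses to open the request device outside the initial user namespace: the userspace-daemon protocol carries raw numeric UIDs and GIDs, so they must all be interpreted in one namespace, which is also why the later hunks convert from_kuid(current_user_ns(), ...) to from_kuid(&init_user_ns, ...). The gate itself is one comparison on the opener's credentials (sketch of the check in isolation):

static int example_devreq_open(struct inode *inode, struct file *file)
{
        /* wire-format UIDs are only meaningful in init_user_ns */
        if (file->f_cred->user_ns != &init_user_ns)
                return -EACCES;

        return 0;       /* the normal open path continues from here */
}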
*/ -/* - * static ssize_t orangefs_direct_IO(int rw, - * struct kiocb *iocb, - * struct iov_iter *iter, - * loff_t offset) - *{ - * gossip_debug(GOSSIP_INODE_DEBUG, - * "orangefs_direct_IO: %s\n", - * iocb->ki_filp->f_path.dentry->d_name.name); - * - * return -EINVAL; - *} - */ + +static ssize_t orangefs_direct_IO(struct kiocb *iocb, + struct iov_iter *iter) +{ + gossip_debug(GOSSIP_INODE_DEBUG, + "orangefs_direct_IO: %s\n", + iocb->ki_filp->f_path.dentry->d_name.name); + + return -EINVAL; +} struct backing_dev_info orangefs_backing_dev_info = { .name = "orangefs", @@ -150,7 +147,7 @@ const struct address_space_operations orangefs_address_operations = { .readpages = orangefs_readpages, .invalidatepage = orangefs_invalidatepage, .releasepage = orangefs_releasepage, -/* .direct_IO = orangefs_direct_IO */ + .direct_IO = orangefs_direct_IO, }; static int orangefs_setattr_size(struct inode *inode, struct iattr *iattr) @@ -294,7 +291,7 @@ int orangefs_permission(struct inode *inode, int mask) } /* ORANGEDS2 implementation of VFS inode operations for files */ -struct inode_operations orangefs_file_inode_operations = { +const struct inode_operations orangefs_file_inode_operations = { .get_acl = orangefs_get_acl, .set_acl = orangefs_set_acl, .setattr = orangefs_setattr, diff --git a/fs/orangefs/namei.c b/fs/orangefs/namei.c index 5a60c508af4e..7e8dfa97c44a 100644 --- a/fs/orangefs/namei.c +++ b/fs/orangefs/namei.c @@ -405,12 +405,8 @@ static int orangefs_rename(struct inode *old_dir, int ret; gossip_debug(GOSSIP_NAME_DEBUG, - "orangefs_rename: called (%s/%s => %s/%s) ct=%d\n", - old_dentry->d_parent->d_name.name, - old_dentry->d_name.name, - new_dentry->d_parent->d_name.name, - new_dentry->d_name.name, - d_count(new_dentry)); + "orangefs_rename: called (%pd2 => %pd2) ct=%d\n", + old_dentry, new_dentry, d_count(new_dentry)); new_op = op_alloc(ORANGEFS_VFS_OP_RENAME); if (!new_op) @@ -442,7 +438,7 @@ static int orangefs_rename(struct inode *old_dir, } /* ORANGEFS implementation of VFS inode operations for directories */ -struct inode_operations orangefs_dir_inode_operations = { +const struct inode_operations orangefs_dir_inode_operations = { .lookup = orangefs_lookup, .get_acl = orangefs_get_acl, .set_acl = orangefs_set_acl, diff --git a/fs/orangefs/orangefs-cache.c b/fs/orangefs/orangefs-cache.c index 900a2e38e11b..b6edbe9fb309 100644 --- a/fs/orangefs/orangefs-cache.c +++ b/fs/orangefs/orangefs-cache.c @@ -136,10 +136,10 @@ struct orangefs_kernel_op_s *op_alloc(__s32 type) llu(new_op->tag), get_opname_string(new_op)); - new_op->upcall.uid = from_kuid(current_user_ns(), + new_op->upcall.uid = from_kuid(&init_user_ns, current_fsuid()); - new_op->upcall.gid = from_kgid(current_user_ns(), + new_op->upcall.gid = from_kgid(&init_user_ns, current_fsgid()); } else { gossip_err("op_alloc: kmem_cache_zalloc failed!\n"); diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h index 2281882f718e..4b6e132d5a0f 100644 --- a/fs/orangefs/orangefs-kernel.h +++ b/fs/orangefs/orangefs-kernel.h @@ -119,17 +119,6 @@ struct client_debug_mask { #define ORANGEFS_CACHE_CREATE_FLAGS 0 #endif /* ((defined ORANGEFS_KERNEL_DEBUG) && (defined CONFIG_DEBUG_SLAB)) */ -/* orangefs xattr and acl related defines */ -#define ORANGEFS_XATTR_INDEX_POSIX_ACL_ACCESS 1 -#define ORANGEFS_XATTR_INDEX_POSIX_ACL_DEFAULT 2 -#define ORANGEFS_XATTR_INDEX_TRUSTED 3 -#define ORANGEFS_XATTR_INDEX_DEFAULT 4 - -#define ORANGEFS_XATTR_NAME_ACL_ACCESS XATTR_NAME_POSIX_ACL_ACCESS -#define ORANGEFS_XATTR_NAME_ACL_DEFAULT 
XATTR_NAME_POSIX_ACL_DEFAULT -#define ORANGEFS_XATTR_NAME_TRUSTED_PREFIX "trusted." -#define ORANGEFS_XATTR_NAME_DEFAULT_PREFIX "" - /* these functions are defined in orangefs-utils.c */ int orangefs_prepare_cdm_array(char *debug_array_string); int orangefs_prepare_debugfs_help_string(int); @@ -528,13 +517,11 @@ __s32 fsid_of_op(struct orangefs_kernel_op_s *op); int orangefs_flush_inode(struct inode *inode); ssize_t orangefs_inode_getxattr(struct inode *inode, - const char *prefix, const char *name, void *buffer, size_t size); int orangefs_inode_setxattr(struct inode *inode, - const char *prefix, const char *name, const void *value, size_t size, @@ -570,10 +557,10 @@ extern int hash_table_size; extern const struct address_space_operations orangefs_address_operations; extern struct backing_dev_info orangefs_backing_dev_info; -extern struct inode_operations orangefs_file_inode_operations; +extern const struct inode_operations orangefs_file_inode_operations; extern const struct file_operations orangefs_file_operations; -extern struct inode_operations orangefs_symlink_inode_operations; -extern struct inode_operations orangefs_dir_inode_operations; +extern const struct inode_operations orangefs_symlink_inode_operations; +extern const struct inode_operations orangefs_dir_inode_operations; extern const struct file_operations orangefs_dir_operations; extern const struct dentry_operations orangefs_dentry_operations; extern const struct file_operations orangefs_devreq_file_operations; @@ -600,8 +587,8 @@ int service_operation(struct orangefs_kernel_op_s *op, #define fill_default_sys_attrs(sys_attr, type, mode) \ do { \ - sys_attr.owner = from_kuid(current_user_ns(), current_fsuid()); \ - sys_attr.group = from_kgid(current_user_ns(), current_fsgid()); \ + sys_attr.owner = from_kuid(&init_user_ns, current_fsuid()); \ + sys_attr.group = from_kgid(&init_user_ns, current_fsgid()); \ sys_attr.perms = ORANGEFS_util_translate_mode(mode); \ sys_attr.mtime = 0; \ sys_attr.atime = 0; \ diff --git a/fs/orangefs/orangefs-utils.c b/fs/orangefs/orangefs-utils.c index 2d129b5886ee..c5fbc62357c6 100644 --- a/fs/orangefs/orangefs-utils.c +++ b/fs/orangefs/orangefs-utils.c @@ -153,12 +153,12 @@ static inline int copy_attributes_from_inode(struct inode *inode, */ attrs->mask = 0; if (iattr->ia_valid & ATTR_UID) { - attrs->owner = from_kuid(current_user_ns(), iattr->ia_uid); + attrs->owner = from_kuid(&init_user_ns, iattr->ia_uid); attrs->mask |= ORANGEFS_ATTR_SYS_UID; gossip_debug(GOSSIP_UTILS_DEBUG, "(UID) %d\n", attrs->owner); } if (iattr->ia_valid & ATTR_GID) { - attrs->group = from_kgid(current_user_ns(), iattr->ia_gid); + attrs->group = from_kgid(&init_user_ns, iattr->ia_gid); attrs->mask |= ORANGEFS_ATTR_SYS_GID; gossip_debug(GOSSIP_UTILS_DEBUG, "(GID) %d\n", attrs->group); } diff --git a/fs/orangefs/symlink.c b/fs/orangefs/symlink.c index 6418dd638680..8fecf823f5ba 100644 --- a/fs/orangefs/symlink.c +++ b/fs/orangefs/symlink.c @@ -8,7 +8,7 @@ #include "orangefs-kernel.h" #include "orangefs-bufmap.h" -struct inode_operations orangefs_symlink_inode_operations = { +const struct inode_operations orangefs_symlink_inode_operations = { .readlink = generic_readlink, .get_link = simple_get_link, .setattr = orangefs_setattr, diff --git a/fs/orangefs/xattr.c b/fs/orangefs/xattr.c index 99c19545752c..2a9f07f06d10 100644 --- a/fs/orangefs/xattr.c +++ b/fs/orangefs/xattr.c @@ -59,8 +59,8 @@ static inline int convert_to_internal_xattr_flags(int setxattr_flags) * unless the key does not exist for the file and/or if * there were 
errors in fetching the attribute value. */ -ssize_t orangefs_inode_getxattr(struct inode *inode, const char *prefix, - const char *name, void *buffer, size_t size) +ssize_t orangefs_inode_getxattr(struct inode *inode, const char *name, + void *buffer, size_t size) { struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); struct orangefs_kernel_op_s *new_op = NULL; @@ -70,17 +70,17 @@ ssize_t orangefs_inode_getxattr(struct inode *inode, const char *prefix, int fsgid; gossip_debug(GOSSIP_XATTR_DEBUG, - "%s: prefix %s name %s, buffer_size %zd\n", - __func__, prefix, name, size); + "%s: name %s, buffer_size %zd\n", + __func__, name, size); - if ((strlen(name) + strlen(prefix)) >= ORANGEFS_MAX_XATTR_NAMELEN) { + if (strlen(name) >= ORANGEFS_MAX_XATTR_NAMELEN) { gossip_err("Invalid key length (%d)\n", - (int)(strlen(name) + strlen(prefix))); + (int)strlen(name)); return -EINVAL; } - fsuid = from_kuid(current_user_ns(), current_fsuid()); - fsgid = from_kgid(current_user_ns(), current_fsgid()); + fsuid = from_kuid(&init_user_ns, current_fsuid()); + fsgid = from_kgid(&init_user_ns, current_fsgid()); gossip_debug(GOSSIP_XATTR_DEBUG, "getxattr on inode %pU, name %s " @@ -97,15 +97,14 @@ ssize_t orangefs_inode_getxattr(struct inode *inode, const char *prefix, goto out_unlock; new_op->upcall.req.getxattr.refn = orangefs_inode->refn; - ret = snprintf((char *)new_op->upcall.req.getxattr.key, - ORANGEFS_MAX_XATTR_NAMELEN, "%s%s", prefix, name); + strcpy(new_op->upcall.req.getxattr.key, name); /* * NOTE: Although keys are meant to be NULL terminated textual * strings, I am going to explicitly pass the length just in case * we change this later on... */ - new_op->upcall.req.getxattr.key_sz = ret + 1; + new_op->upcall.req.getxattr.key_sz = strlen(name) + 1; ret = service_operation(new_op, "orangefs_inode_getxattr", get_interruptible_flag(inode)); @@ -163,10 +162,8 @@ out_unlock: return ret; } -static int orangefs_inode_removexattr(struct inode *inode, - const char *prefix, - const char *name, - int flags) +static int orangefs_inode_removexattr(struct inode *inode, const char *name, + int flags) { struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); struct orangefs_kernel_op_s *new_op = NULL; @@ -183,12 +180,8 @@ static int orangefs_inode_removexattr(struct inode *inode, * textual strings, I am going to explicitly pass the * length just in case we change this later on... */ - ret = snprintf((char *)new_op->upcall.req.removexattr.key, - ORANGEFS_MAX_XATTR_NAMELEN, - "%s%s", - (prefix ? prefix : ""), - name); - new_op->upcall.req.removexattr.key_sz = ret + 1; + strcpy(new_op->upcall.req.removexattr.key, name); + new_op->upcall.req.removexattr.key_sz = strlen(name) + 1; gossip_debug(GOSSIP_XATTR_DEBUG, "orangefs_inode_removexattr: key %s, key_sz %d\n", @@ -223,8 +216,8 @@ out_unlock: * Returns a -ve number on error and 0 on success. Key is text, but value * can be binary! 
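The pattern the series converges on — reject names of ORANGEFS_MAX_XATTR_NAMELEN or longer up front, then plain-copy the key and send strlen(name) + 1 so the NUL travels with it — looks like this in isolation. The constants and struct below are stand-ins for the real upcall fields, not the kernel definitions:

#include <stdio.h>
#include <string.h>

#define MAX_KEY_LEN 64			/* stand-in for ORANGEFS_MAX_XATTR_NAMELEN */

struct fake_upcall {
	char key[MAX_KEY_LEN];
	int key_sz;
};

static int pack_key(struct fake_upcall *up, const char *name)
{
	/* ">=" keeps room for the terminating NUL that key_sz counts. */
	if (strlen(name) >= MAX_KEY_LEN)
		return -1;

	strcpy(up->key, name);			/* safe only after the check above */
	up->key_sz = (int)strlen(name) + 1;	/* NUL included, as in the patch */
	return 0;
}

int main(void)
{
	struct fake_upcall up;

	if (pack_key(&up, "user.demo") == 0)
		printf("key \"%s\", key_sz %d\n", up.key, up.key_sz);
	return 0;
}
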
*/ -int orangefs_inode_setxattr(struct inode *inode, const char *prefix, - const char *name, const void *value, size_t size, int flags) +int orangefs_inode_setxattr(struct inode *inode, const char *name, + const void *value, size_t size, int flags) { struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); struct orangefs_kernel_op_s *new_op; @@ -232,8 +225,8 @@ int orangefs_inode_setxattr(struct inode *inode, const char *prefix, int ret = -ENOMEM; gossip_debug(GOSSIP_XATTR_DEBUG, - "%s: prefix %s, name %s, buffer_size %zd\n", - __func__, prefix, name, size); + "%s: name %s, buffer_size %zd\n", + __func__, name, size); if (size >= ORANGEFS_MAX_XATTR_VALUELEN || flags < 0) { @@ -245,29 +238,19 @@ int orangefs_inode_setxattr(struct inode *inode, const char *prefix, internal_flag = convert_to_internal_xattr_flags(flags); - if (prefix) { - if (strlen(name) + strlen(prefix) >= ORANGEFS_MAX_XATTR_NAMELEN) { - gossip_err - ("orangefs_inode_setxattr: bogus key size (%d)\n", - (int)(strlen(name) + strlen(prefix))); - return -EINVAL; - } - } else { - if (strlen(name) >= ORANGEFS_MAX_XATTR_NAMELEN) { - gossip_err - ("orangefs_inode_setxattr: bogus key size (%d)\n", - (int)(strlen(name))); - return -EINVAL; - } + if (strlen(name) >= ORANGEFS_MAX_XATTR_NAMELEN) { + gossip_err + ("orangefs_inode_setxattr: bogus key size (%d)\n", + (int)(strlen(name))); + return -EINVAL; } /* This is equivalent to a removexattr */ if (size == 0 && value == NULL) { gossip_debug(GOSSIP_XATTR_DEBUG, - "removing xattr (%s%s)\n", - prefix, + "removing xattr (%s)\n", name); - return orangefs_inode_removexattr(inode, prefix, name, flags); + return orangefs_inode_removexattr(inode, name, flags); } gossip_debug(GOSSIP_XATTR_DEBUG, @@ -288,11 +271,8 @@ int orangefs_inode_setxattr(struct inode *inode, const char *prefix, * strings, I am going to explicitly pass the length just in * case we change this later on... 
*/ - ret = snprintf((char *)new_op->upcall.req.setxattr.keyval.key, - ORANGEFS_MAX_XATTR_NAMELEN, - "%s%s", - prefix, name); - new_op->upcall.req.setxattr.keyval.key_sz = ret + 1; + strcpy(new_op->upcall.req.setxattr.keyval.key, name); + new_op->upcall.req.setxattr.keyval.key_sz = strlen(name) + 1; memcpy(new_op->upcall.req.setxattr.keyval.val, value, size); new_op->upcall.req.setxattr.keyval.val_sz = size; @@ -448,18 +428,14 @@ out_unlock: } static int orangefs_xattr_set_default(const struct xattr_handler *handler, - struct dentry *dentry, + struct dentry *unused, + struct inode *inode, const char *name, const void *buffer, size_t size, int flags) { - return orangefs_inode_setxattr(dentry->d_inode, - ORANGEFS_XATTR_NAME_DEFAULT_PREFIX, - name, - buffer, - size, - flags); + return orangefs_inode_setxattr(inode, name, buffer, size, flags); } static int orangefs_xattr_get_default(const struct xattr_handler *handler, @@ -469,56 +445,12 @@ static int orangefs_xattr_get_default(const struct xattr_handler *handler, void *buffer, size_t size) { - return orangefs_inode_getxattr(inode, - ORANGEFS_XATTR_NAME_DEFAULT_PREFIX, - name, - buffer, - size); + return orangefs_inode_getxattr(inode, name, buffer, size); } -static int orangefs_xattr_set_trusted(const struct xattr_handler *handler, - struct dentry *dentry, - const char *name, - const void *buffer, - size_t size, - int flags) -{ - return orangefs_inode_setxattr(dentry->d_inode, - ORANGEFS_XATTR_NAME_TRUSTED_PREFIX, - name, - buffer, - size, - flags); -} - -static int orangefs_xattr_get_trusted(const struct xattr_handler *handler, - struct dentry *unused, - struct inode *inode, - const char *name, - void *buffer, - size_t size) -{ - return orangefs_inode_getxattr(inode, - ORANGEFS_XATTR_NAME_TRUSTED_PREFIX, - name, - buffer, - size); -} - -static struct xattr_handler orangefs_xattr_trusted_handler = { - .prefix = ORANGEFS_XATTR_NAME_TRUSTED_PREFIX, - .get = orangefs_xattr_get_trusted, - .set = orangefs_xattr_set_trusted, -}; - static struct xattr_handler orangefs_xattr_default_handler = { - /* - * NOTE: this is set to be the empty string. - * so that all un-prefixed xattrs keys get caught - * here! 
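The empty .prefix below makes the generic xattr code match every attribute name and hand it to the handler unmodified, which is why the prefix parameter could be dropped from orangefs_inode_getxattr()/setxattr(). From user space nothing changes: names travel whole. An illustrative sketch — "user.demo" and the path are arbitrary, and any mount supporting user xattrs will do:

#include <stdio.h>
#include <sys/types.h>
#include <sys/xattr.h>

int main(void)
{
	const char *path = "testfile";	/* any file on the mount */
	char buf[64];
	ssize_t len;

	/* The name reaches the filesystem exactly as written here. */
	if (setxattr(path, "user.demo", "value", 5, 0) != 0) {
		perror("setxattr");
		return 1;
	}
	len = getxattr(path, "user.demo", buf, sizeof(buf));
	if (len < 0) {
		perror("getxattr");
		return 1;
	}
	printf("user.demo = %.*s\n", (int)len, buf);
	return 0;
}
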
- */ - .prefix = ORANGEFS_XATTR_NAME_DEFAULT_PREFIX, + .prefix = "", /* match any name => handlers called with full name */ .get = orangefs_xattr_get_default, .set = orangefs_xattr_set_default, }; @@ -526,7 +458,6 @@ static struct xattr_handler orangefs_xattr_default_handler = { const struct xattr_handler *orangefs_xattr_handlers[] = { &posix_acl_access_xattr_handler, &posix_acl_default_xattr_handler, - &orangefs_xattr_trusted_handler, &orangefs_xattr_default_handler, NULL }; diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index cc514da6f3e7..80aa6f1eb336 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -336,7 +336,6 @@ int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry, struct dentry *upperdir; struct dentry *upperdentry; const struct cred *old_cred; - struct cred *override_cred; char *link = NULL; if (WARN_ON(!workdir)) @@ -357,28 +356,7 @@ int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry, return PTR_ERR(link); } - err = -ENOMEM; - override_cred = prepare_creds(); - if (!override_cred) - goto out_free_link; - - override_cred->fsuid = stat->uid; - override_cred->fsgid = stat->gid; - /* - * CAP_SYS_ADMIN for copying up extended attributes - * CAP_DAC_OVERRIDE for create - * CAP_FOWNER for chmod, timestamp update - * CAP_FSETID for chmod - * CAP_CHOWN for chown - * CAP_MKNOD for mknod - */ - cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN); - cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE); - cap_raise(override_cred->cap_effective, CAP_FOWNER); - cap_raise(override_cred->cap_effective, CAP_FSETID); - cap_raise(override_cred->cap_effective, CAP_CHOWN); - cap_raise(override_cred->cap_effective, CAP_MKNOD); - old_cred = override_creds(override_cred); + old_cred = ovl_override_creds(dentry->d_sb); err = -EIO; if (lock_rename(workdir, upperdir) != NULL) { @@ -401,9 +379,7 @@ int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry, out_unlock: unlock_rename(workdir, upperdir); revert_creds(old_cred); - put_cred(override_cred); -out_free_link: if (link) free_page((unsigned long) link); diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index b3fc0a35bf62..5c9d2d80ff70 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -407,26 +407,20 @@ static int ovl_create_or_link(struct dentry *dentry, int mode, dev_t rdev, const struct cred *old_cred; struct cred *override_cred; + old_cred = ovl_override_creds(dentry->d_sb); + err = -ENOMEM; override_cred = prepare_creds(); - if (!override_cred) - goto out_iput; - - /* - * CAP_SYS_ADMIN for setting opaque xattr - * CAP_DAC_OVERRIDE for create in workdir, rename - * CAP_FOWNER for removing whiteout from sticky dir - */ - cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN); - cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE); - cap_raise(override_cred->cap_effective, CAP_FOWNER); - old_cred = override_creds(override_cred); - - err = ovl_create_over_whiteout(dentry, inode, &stat, link, - hardlink); - + if (override_cred) { + override_cred->fsuid = old_cred->fsuid; + override_cred->fsgid = old_cred->fsgid; + put_cred(override_creds(override_cred)); + put_cred(override_cred); + + err = ovl_create_over_whiteout(dentry, inode, &stat, + link, hardlink); + } revert_creds(old_cred); - put_cred(override_cred); } if (!err) @@ -511,6 +505,7 @@ static int ovl_remove_and_whiteout(struct dentry *dentry, bool is_dir) struct dentry *upper; struct dentry *opaquedir = NULL; int err; + int flags = 0; if (WARN_ON(!workdir)) return -EROFS; @@ -540,46 +535,39 @@ static int 
ovl_remove_and_whiteout(struct dentry *dentry, bool is_dir) if (err) goto out_dput; - whiteout = ovl_whiteout(workdir, dentry); - err = PTR_ERR(whiteout); - if (IS_ERR(whiteout)) + upper = lookup_one_len(dentry->d_name.name, upperdir, + dentry->d_name.len); + err = PTR_ERR(upper); + if (IS_ERR(upper)) goto out_unlock; - upper = ovl_dentry_upper(dentry); - if (!upper) { - upper = lookup_one_len(dentry->d_name.name, upperdir, - dentry->d_name.len); - err = PTR_ERR(upper); - if (IS_ERR(upper)) - goto kill_whiteout; - - err = ovl_do_rename(wdir, whiteout, udir, upper, 0); - dput(upper); - if (err) - goto kill_whiteout; - } else { - int flags = 0; + err = -ESTALE; + if ((opaquedir && upper != opaquedir) || + (!opaquedir && ovl_dentry_upper(dentry) && + upper != ovl_dentry_upper(dentry))) { + goto out_dput_upper; + } - if (opaquedir) - upper = opaquedir; - err = -ESTALE; - if (upper->d_parent != upperdir) - goto kill_whiteout; + whiteout = ovl_whiteout(workdir, dentry); + err = PTR_ERR(whiteout); + if (IS_ERR(whiteout)) + goto out_dput_upper; - if (is_dir) - flags |= RENAME_EXCHANGE; + if (d_is_dir(upper)) + flags = RENAME_EXCHANGE; - err = ovl_do_rename(wdir, whiteout, udir, upper, flags); - if (err) - goto kill_whiteout; + err = ovl_do_rename(wdir, whiteout, udir, upper, flags); + if (err) + goto kill_whiteout; + if (flags) + ovl_cleanup(wdir, upper); - if (is_dir) - ovl_cleanup(wdir, upper); - } ovl_dentry_version_inc(dentry->d_parent); out_d_drop: d_drop(dentry); dput(whiteout); +out_dput_upper: + dput(upper); out_unlock: unlock_rename(workdir, upperdir); out_dput: @@ -662,32 +650,11 @@ static int ovl_do_remove(struct dentry *dentry, bool is_dir) if (OVL_TYPE_PURE_UPPER(type)) { err = ovl_remove_upper(dentry, is_dir); } else { - const struct cred *old_cred; - struct cred *override_cred; - - err = -ENOMEM; - override_cred = prepare_creds(); - if (!override_cred) - goto out_drop_write; - - /* - * CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir - * CAP_DAC_OVERRIDE for create in workdir, rename - * CAP_FOWNER for removing whiteout from sticky dir - * CAP_FSETID for chmod of opaque dir - * CAP_CHOWN for chown of opaque dir - */ - cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN); - cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE); - cap_raise(override_cred->cap_effective, CAP_FOWNER); - cap_raise(override_cred->cap_effective, CAP_FSETID); - cap_raise(override_cred->cap_effective, CAP_CHOWN); - old_cred = override_creds(override_cred); + const struct cred *old_cred = ovl_override_creds(dentry->d_sb); err = ovl_remove_and_whiteout(dentry, is_dir); revert_creds(old_cred); - put_cred(override_cred); } out_drop_write: ovl_drop_write(dentry); @@ -725,7 +692,6 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old, bool new_is_dir = false; struct dentry *opaquedir = NULL; const struct cred *old_cred = NULL; - struct cred *override_cred = NULL; err = -EINVAL; if (flags & ~(RENAME_EXCHANGE | RENAME_NOREPLACE)) @@ -794,26 +760,8 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old, old_opaque = !OVL_TYPE_PURE_UPPER(old_type); new_opaque = !OVL_TYPE_PURE_UPPER(new_type); - if (old_opaque || new_opaque) { - err = -ENOMEM; - override_cred = prepare_creds(); - if (!override_cred) - goto out_drop_write; - - /* - * CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir - * CAP_DAC_OVERRIDE for create in workdir - * CAP_FOWNER for removing whiteout from sticky dir - * CAP_FSETID for chmod of opaque dir - * CAP_CHOWN for chown of opaque dir - */ - 
cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN); - cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE); - cap_raise(override_cred->cap_effective, CAP_FOWNER); - cap_raise(override_cred->cap_effective, CAP_FSETID); - cap_raise(override_cred->cap_effective, CAP_CHOWN); - old_cred = override_creds(override_cred); - } + if (old_opaque || new_opaque) + old_cred = ovl_override_creds(old->d_sb); if (overwrite && OVL_TYPE_MERGE_OR_LOWER(new_type) && new_is_dir) { opaquedir = ovl_check_empty_and_clear(new); @@ -943,10 +891,8 @@ out_dput_old: out_unlock: unlock_rename(new_upperdir, old_upperdir); out_revert_creds: - if (old_opaque || new_opaque) { + if (old_opaque || new_opaque) revert_creds(old_cred); - put_cred(override_cred); - } out_drop_write: ovl_drop_write(old); out: diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index c7b31a03dc9c..d554e86abbe3 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -59,16 +59,40 @@ int ovl_setattr(struct dentry *dentry, struct iattr *attr) if (err) goto out; + if (attr->ia_valid & ATTR_SIZE) { + struct inode *realinode = d_inode(ovl_dentry_real(dentry)); + + err = -ETXTBSY; + if (atomic_read(&realinode->i_writecount) < 0) + goto out_drop_write; + } + err = ovl_copy_up(dentry); if (!err) { + struct inode *winode = NULL; + upperdentry = ovl_dentry_upper(dentry); + if (attr->ia_valid & ATTR_SIZE) { + winode = d_inode(upperdentry); + err = get_write_access(winode); + if (err) + goto out_drop_write; + } + + if (attr->ia_valid & (ATTR_KILL_SUID|ATTR_KILL_SGID)) + attr->ia_valid &= ~ATTR_MODE; + inode_lock(upperdentry->d_inode); err = notify_change(upperdentry, attr, NULL); if (!err) ovl_copyattr(upperdentry->d_inode, dentry->d_inode); inode_unlock(upperdentry->d_inode); + + if (winode) + put_write_access(winode); } +out_drop_write: ovl_drop_write(dentry); out: return err; @@ -121,16 +145,18 @@ int ovl_permission(struct inode *inode, int mask) err = vfs_getattr(&realpath, &stat); if (err) - return err; + goto out_dput; + err = -ESTALE; if ((stat.mode ^ inode->i_mode) & S_IFMT) - return -ESTALE; + goto out_dput; inode->i_mode = stat.mode; inode->i_uid = stat.uid; inode->i_gid = stat.gid; - return generic_permission(inode, mask); + err = generic_permission(inode, mask); + goto out_dput; } /* Careful in RCU walk mode */ @@ -210,8 +236,9 @@ static bool ovl_is_private_xattr(const char *name) return strncmp(name, OVL_XATTR_PRE_NAME, OVL_XATTR_PRE_LEN) == 0; } -int ovl_setxattr(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) +int ovl_setxattr(struct dentry *dentry, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { int err; struct dentry *upperdentry; @@ -237,41 +264,27 @@ out: return err; } -static bool ovl_need_xattr_filter(struct dentry *dentry, - enum ovl_path_type type) -{ - if ((type & (__OVL_PATH_PURE | __OVL_PATH_UPPER)) == __OVL_PATH_UPPER) - return S_ISDIR(dentry->d_inode->i_mode); - else - return false; -} - ssize_t ovl_getxattr(struct dentry *dentry, struct inode *inode, const char *name, void *value, size_t size) { - struct path realpath; - enum ovl_path_type type = ovl_path_real(dentry, &realpath); + struct dentry *realdentry = ovl_dentry_real(dentry); - if (ovl_need_xattr_filter(dentry, type) && ovl_is_private_xattr(name)) + if (ovl_is_private_xattr(name)) return -ENODATA; - return vfs_getxattr(realpath.dentry, name, value, size); + return vfs_getxattr(realdentry, name, value, size); } ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t 
size) { - struct path realpath; - enum ovl_path_type type = ovl_path_real(dentry, &realpath); + struct dentry *realdentry = ovl_dentry_real(dentry); ssize_t res; int off; - res = vfs_listxattr(realpath.dentry, list, size); + res = vfs_listxattr(realdentry, list, size); if (res <= 0 || size == 0) return res; - if (!ovl_need_xattr_filter(dentry, type)) - return res; - /* filter out private xattrs */ for (off = 0; off < res;) { char *s = list + off; @@ -301,7 +314,7 @@ int ovl_removexattr(struct dentry *dentry, const char *name) goto out; err = -ENODATA; - if (ovl_need_xattr_filter(dentry, type) && ovl_is_private_xattr(name)) + if (ovl_is_private_xattr(name)) goto out_drop_write; if (!OVL_TYPE_UPPER(type)) { @@ -338,36 +351,25 @@ static bool ovl_open_need_copy_up(int flags, enum ovl_path_type type, return true; } -struct inode *ovl_d_select_inode(struct dentry *dentry, unsigned file_flags) +int ovl_open_maybe_copy_up(struct dentry *dentry, unsigned int file_flags) { - int err; + int err = 0; struct path realpath; enum ovl_path_type type; - if (d_is_dir(dentry)) - return d_backing_inode(dentry); - type = ovl_path_real(dentry, &realpath); if (ovl_open_need_copy_up(file_flags, type, realpath.dentry)) { err = ovl_want_write(dentry); - if (err) - return ERR_PTR(err); - - if (file_flags & O_TRUNC) - err = ovl_copy_up_truncate(dentry); - else - err = ovl_copy_up(dentry); - ovl_drop_write(dentry); - if (err) - return ERR_PTR(err); - - ovl_path_upper(dentry, &realpath); + if (!err) { + if (file_flags & O_TRUNC) + err = ovl_copy_up_truncate(dentry); + else + err = ovl_copy_up(dentry); + ovl_drop_write(dentry); + } } - if (realpath.dentry->d_flags & DCACHE_OP_SELECT_INODE) - return realpath.dentry->d_op->d_select_inode(realpath.dentry, file_flags); - - return d_backing_inode(realpath.dentry); + return err; } static const struct inode_operations ovl_file_inode_operations = { @@ -400,12 +402,11 @@ struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, if (!inode) return NULL; - mode &= S_IFMT; - inode->i_ino = get_next_ino(); inode->i_mode = mode; inode->i_flags |= S_NOATIME | S_NOCMTIME; + mode &= S_IFMT; switch (mode) { case S_IFDIR: inode->i_private = oe; diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 99ec4b035237..0d3f2ad45708 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -153,6 +153,7 @@ void ovl_drop_write(struct dentry *dentry); bool ovl_dentry_is_opaque(struct dentry *dentry); void ovl_dentry_set_opaque(struct dentry *dentry, bool opaque); bool ovl_is_whiteout(struct dentry *dentry); +const struct cred *ovl_override_creds(struct super_block *sb); void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry); struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags); @@ -171,13 +172,14 @@ int ovl_check_d_type_supported(struct path *realpath); /* inode.c */ int ovl_setattr(struct dentry *dentry, struct iattr *attr); int ovl_permission(struct inode *inode, int mask); -int ovl_setxattr(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags); +int ovl_setxattr(struct dentry *dentry, struct inode *inode, + const char *name, const void *value, + size_t size, int flags); ssize_t ovl_getxattr(struct dentry *dentry, struct inode *inode, const char *name, void *value, size_t size); ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size); int ovl_removexattr(struct dentry *dentry, const char *name); -struct inode *ovl_d_select_inode(struct dentry *dentry, unsigned 
file_flags); +int ovl_open_maybe_copy_up(struct dentry *dentry, unsigned int file_flags); struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, struct ovl_entry *oe); @@ -185,6 +187,7 @@ static inline void ovl_copyattr(struct inode *from, struct inode *to) { to->i_uid = from->i_uid; to->i_gid = from->i_gid; + to->i_mode = from->i_mode; } /* dir.c */ diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index da186ee4f846..cf37fc76fc9f 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -36,6 +36,7 @@ struct ovl_dir_cache { struct ovl_readdir_data { struct dir_context ctx; + struct dentry *dentry; bool is_lowest; struct rb_root root; struct list_head *list; @@ -206,21 +207,10 @@ static int ovl_check_whiteouts(struct dentry *dir, struct ovl_readdir_data *rdd) struct ovl_cache_entry *p; struct dentry *dentry; const struct cred *old_cred; - struct cred *override_cred; - - override_cred = prepare_creds(); - if (!override_cred) - return -ENOMEM; - /* - * CAP_DAC_OVERRIDE for lookup - */ - cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE); - old_cred = override_creds(override_cred); + old_cred = ovl_override_creds(rdd->dentry->d_sb); - inode_lock(dir->d_inode); - err = 0; - // XXX: err = mutex_lock_killable(&dir->d_inode->i_mutex); + err = down_write_killable(&dir->d_inode->i_rwsem); if (!err) { while (rdd->first_maybe_whiteout) { p = rdd->first_maybe_whiteout; @@ -234,7 +224,6 @@ static int ovl_check_whiteouts(struct dentry *dir, struct ovl_readdir_data *rdd) inode_unlock(dir->d_inode); } revert_creds(old_cred); - put_cred(override_cred); return err; } @@ -290,6 +279,7 @@ static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list) struct path realpath; struct ovl_readdir_data rdd = { .ctx.actor = ovl_fill_merge, + .dentry = dentry, .list = list, .root = RB_ROOT, .is_lowest = false, diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index ed53ae0fe868..5e254b3a8c56 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -42,6 +42,8 @@ struct ovl_fs { long lower_namelen; /* pathnames of lower and upper dirs, for show_options */ struct ovl_config config; + /* creds of process who forced instantiation of super block */ + const struct cred *creator_cred; }; struct ovl_dir_cache; @@ -265,6 +267,13 @@ bool ovl_is_whiteout(struct dentry *dentry) return inode && IS_WHITEOUT(inode); } +const struct cred *ovl_override_creds(struct super_block *sb) +{ + struct ovl_fs *ofs = sb->s_fs_info; + + return override_creds(ofs->creator_cred); +} + static bool ovl_is_opaquedir(struct dentry *dentry) { int res; @@ -295,7 +304,9 @@ static void ovl_dentry_release(struct dentry *dentry) } } -static struct dentry *ovl_d_real(struct dentry *dentry, struct inode *inode) +static struct dentry *ovl_d_real(struct dentry *dentry, + const struct inode *inode, + unsigned int open_flags) { struct dentry *real; @@ -305,6 +316,16 @@ static struct dentry *ovl_d_real(struct dentry *dentry, struct inode *inode) goto bug; } + if (d_is_negative(dentry)) + return dentry; + + if (open_flags) { + int err = ovl_open_maybe_copy_up(dentry, open_flags); + + if (err) + return ERR_PTR(err); + } + real = ovl_dentry_upper(dentry); if (real && (!inode || inode == d_inode(real))) return real; @@ -317,9 +338,7 @@ static struct dentry *ovl_d_real(struct dentry *dentry, struct inode *inode) return real; /* Handle recursion */ - if (real->d_flags & DCACHE_OP_REAL) - return real->d_op->d_real(real, inode); - + return d_real(real, inode, open_flags); bug: WARN(1, "ovl_d_real(%pd4, 
%s:%lu): real dentry not found\n", dentry, inode ? inode->i_sb->s_id : "NULL", inode ? inode->i_ino : 0); @@ -369,13 +388,11 @@ static int ovl_dentry_weak_revalidate(struct dentry *dentry, unsigned int flags) static const struct dentry_operations ovl_dentry_operations = { .d_release = ovl_dentry_release, - .d_select_inode = ovl_d_select_inode, .d_real = ovl_d_real, }; static const struct dentry_operations ovl_reval_dentry_operations = { .d_release = ovl_dentry_release, - .d_select_inode = ovl_d_select_inode, .d_real = ovl_d_real, .d_revalidate = ovl_dentry_revalidate, .d_weak_revalidate = ovl_dentry_weak_revalidate, @@ -603,6 +620,7 @@ static void ovl_put_super(struct super_block *sb) kfree(ufs->config.lowerdir); kfree(ufs->config.upperdir); kfree(ufs->config.workdir); + put_cred(ufs->creator_cred); kfree(ufs); } @@ -1064,16 +1082,21 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) /* * Upper should support d_type, else whiteouts are visible. * Given workdir and upper are on same fs, we can do - * iterate_dir() on workdir. + * iterate_dir() on workdir. This check requires successful + * creation of workdir in previous step. */ - err = ovl_check_d_type_supported(&workpath); - if (err < 0) - goto out_put_workdir; + if (ufs->workdir) { + err = ovl_check_d_type_supported(&workpath); + if (err < 0) + goto out_put_workdir; - if (!err) { - pr_err("overlayfs: upper fs needs to support d_type.\n"); - err = -EINVAL; - goto out_put_workdir; + /* + * We allowed this configuration and don't want to + * break users over kernel upgrade. So warn instead + * of erroring out. + */ + if (!err) + pr_warn("overlayfs: upper fs needs to support d_type.\n"); } } @@ -1108,10 +1131,14 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) else sb->s_d_op = &ovl_dentry_operations; + ufs->creator_cred = prepare_creds(); + if (!ufs->creator_cred) + goto out_put_lower_mnt; + err = -ENOMEM; oe = ovl_alloc_entry(numlower); if (!oe) - goto out_put_lower_mnt; + goto out_put_cred; root_dentry = d_make_root(ovl_new_inode(sb, S_IFDIR, oe)); if (!root_dentry) @@ -1144,6 +1171,8 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) out_free_oe: kfree(oe); +out_put_cred: + put_cred(ufs->creator_cred); out_put_lower_mnt: for (i = 0; i < ufs->numlower; i++) mntput(ufs->lower_mnt[i]); diff --git a/fs/pipe.c b/fs/pipe.c index 0d3f5165cb0b..4b32928f5426 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -21,6 +21,7 @@ #include <linux/audit.h> #include <linux/syscalls.h> #include <linux/fcntl.h> +#include <linux/memcontrol.h> #include <asm/uaccess.h> #include <asm/ioctls.h> @@ -137,6 +138,22 @@ static void anon_pipe_buf_release(struct pipe_inode_info *pipe, put_page(page); } +static int anon_pipe_buf_steal(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + struct page *page = buf->page; + + if (page_count(page) == 1) { + if (memcg_kmem_enabled()) { + memcg_kmem_uncharge(page, 0); + __ClearPageKmemcg(page); + } + __SetPageLocked(page); + return 0; + } + return 1; +} + /** * generic_pipe_buf_steal - attempt to take ownership of a &pipe_buffer * @pipe: the pipe that the buffer belongs to @@ -219,7 +236,7 @@ static const struct pipe_buf_operations anon_pipe_buf_ops = { .can_merge = 1, .confirm = generic_pipe_buf_confirm, .release = anon_pipe_buf_release, - .steal = generic_pipe_buf_steal, + .steal = anon_pipe_buf_steal, .get = generic_pipe_buf_get, }; @@ -227,7 +244,7 @@ static const struct pipe_buf_operations packet_pipe_buf_ops = { .can_merge = 0, .confirm = 
generic_pipe_buf_confirm, .release = anon_pipe_buf_release, - .steal = generic_pipe_buf_steal, + .steal = anon_pipe_buf_steal, .get = generic_pipe_buf_get, }; @@ -405,7 +422,7 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) int copied; if (!page) { - page = alloc_page(GFP_HIGHUSER); + page = alloc_page(GFP_HIGHUSER | __GFP_ACCOUNT); if (unlikely(!page)) { ret = ret ? : -ENOMEM; break; @@ -611,7 +628,7 @@ struct pipe_inode_info *alloc_pipe_info(void) { struct pipe_inode_info *pipe; - pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL); + pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL_ACCOUNT); if (pipe) { unsigned long pipe_bufs = PIPE_DEF_BUFFERS; struct user_struct *user = get_current_user(); @@ -619,7 +636,9 @@ struct pipe_inode_info *alloc_pipe_info(void) if (!too_many_pipe_buffers_hard(user)) { if (too_many_pipe_buffers_soft(user)) pipe_bufs = 1; - pipe->bufs = kzalloc(sizeof(struct pipe_buffer) * pipe_bufs, GFP_KERNEL); + pipe->bufs = kcalloc(pipe_bufs, + sizeof(struct pipe_buffer), + GFP_KERNEL_ACCOUNT); } if (pipe->bufs) { @@ -1010,7 +1029,8 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages) if (nr_pages < pipe->nrbufs) return -EBUSY; - bufs = kcalloc(nr_pages, sizeof(*bufs), GFP_KERNEL | __GFP_NOWARN); + bufs = kcalloc(nr_pages, sizeof(*bufs), + GFP_KERNEL_ACCOUNT | __GFP_NOWARN); if (unlikely(!bufs)) return -ENOMEM; diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 2c60f17e7d92..edc452c2a563 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -820,39 +820,43 @@ posix_acl_xattr_get(const struct xattr_handler *handler, return error; } -static int -posix_acl_xattr_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) +int +set_posix_acl(struct inode *inode, int type, struct posix_acl *acl) { - struct inode *inode = d_backing_inode(dentry); - struct posix_acl *acl = NULL; - int ret; - if (!IS_POSIXACL(inode)) return -EOPNOTSUPP; if (!inode->i_op->set_acl) return -EOPNOTSUPP; - if (handler->flags == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode)) - return value ? -EACCES : 0; + if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode)) + return acl ? 
-EACCES : 0; if (!inode_owner_or_capable(inode)) return -EPERM; + if (acl) { + int ret = posix_acl_valid(acl); + if (ret) + return ret; + } + return inode->i_op->set_acl(inode, acl, type); +} +EXPORT_SYMBOL(set_posix_acl); + +static int +posix_acl_xattr_set(const struct xattr_handler *handler, + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) +{ + struct posix_acl *acl = NULL; + int ret; + if (value) { acl = posix_acl_from_xattr(&init_user_ns, value, size); if (IS_ERR(acl)) return PTR_ERR(acl); - - if (acl) { - ret = posix_acl_valid(acl); - if (ret) - goto out; - } } - - ret = inode->i_op->set_acl(inode, acl, handler->flags); -out: + ret = set_posix_acl(inode, handler->flags, acl); posix_acl_release(acl); return ret; } diff --git a/fs/proc/array.c b/fs/proc/array.c index b6c00ce0e29e..88c7de12197b 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -83,6 +83,7 @@ #include <linux/tracehook.h> #include <linux/string_helpers.h> #include <linux/user_namespace.h> +#include <linux/fs_struct.h> #include <asm/pgtable.h> #include <asm/processor.h> @@ -139,12 +140,25 @@ static inline const char *get_task_state(struct task_struct *tsk) return task_state_array[fls(state)]; } +static inline int get_task_umask(struct task_struct *tsk) +{ + struct fs_struct *fs; + int umask = -ENOENT; + + task_lock(tsk); + fs = tsk->fs; + if (fs) + umask = fs->umask; + task_unlock(tsk); + return umask; +} + static inline void task_state(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *p) { struct user_namespace *user_ns = seq_user_ns(m); struct group_info *group_info; - int g; + int g, umask; struct task_struct *tracer; const struct cred *cred; pid_t ppid, tpid = 0, tgid, ngid; @@ -162,6 +176,10 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, ngid = task_numa_group_id(p); cred = get_task_cred(p); + umask = get_task_umask(p); + if (umask >= 0) + seq_printf(m, "Umask:\t%#04o\n", umask); + task_lock(p); if (p->files) max_fds = files_fdtable(p->files)->max_fds; diff --git a/fs/proc/base.c b/fs/proc/base.c index ff4527dd69b7..a11eb7196ec8 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3163,6 +3163,44 @@ int proc_pid_readdir(struct file *file, struct dir_context *ctx) } /* + * proc_tid_comm_permission is a special permission function exclusively + * used for the node /proc/<pid>/task/<tid>/comm. + * It bypasses generic permission checks in the case where a task of the same + * task group attempts to access the node. + * The rationale behind this is that glibc and bionic access this node for + * cross thread naming (pthread_set/getname_np(!self)). However, if + * PR_SET_DUMPABLE gets set to 0 this node among others becomes uid=0 gid=0, + * which locks out the cross thread naming implementation. + * This function makes sure that the node is always accessible for members of + * same thread group. + */ +static int proc_tid_comm_permission(struct inode *inode, int mask) +{ + bool is_same_tgroup; + struct task_struct *task; + + task = get_proc_task(inode); + if (!task) + return -ESRCH; + is_same_tgroup = same_thread_group(current, task); + put_task_struct(task); + + if (likely(is_same_tgroup && !(mask & MAY_EXEC))) { + /* This file (/proc/<pid>/task/<tid>/comm) can always be + * read or written by the members of the corresponding + * thread group. 
+ */ + return 0; + } + + return generic_permission(inode, mask); +} + +static const struct inode_operations proc_tid_comm_inode_operations = { + .permission = proc_tid_comm_permission, +}; + +/* * Tasks */ static const struct pid_entry tid_base_stuff[] = { @@ -3180,7 +3218,9 @@ static const struct pid_entry tid_base_stuff[] = { #ifdef CONFIG_SCHED_DEBUG REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), #endif - REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations), + NOD("comm", S_IFREG|S_IRUGO|S_IWUSR, + &proc_tid_comm_inode_operations, + &proc_pid_set_comm_operations, {}), #ifdef CONFIG_HAVE_ARCH_TRACEHOOK ONE("syscall", S_IRUSR, proc_pid_syscall), #endif diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 83720460c5bc..cf301a9ef512 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -105,6 +105,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v) #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE "AnonHugePages: %8lu kB\n" + "ShmemHugePages: %8lu kB\n" + "ShmemPmdMapped: %8lu kB\n" #endif #ifdef CONFIG_CMA "CmaTotal: %8lu kB\n" @@ -162,8 +164,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v) , atomic_long_read(&num_poisoned_pages) << (PAGE_SHIFT - 10) #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE - , K(global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) * - HPAGE_PMD_NR) + , K(global_page_state(NR_ANON_THPS) * HPAGE_PMD_NR) + , K(global_page_state(NR_SHMEM_THPS) * HPAGE_PMD_NR) + , K(global_page_state(NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR) #endif #ifdef CONFIG_CMA , K(totalcma_pages) diff --git a/fs/proc/page.c b/fs/proc/page.c index 712f1b9992cc..3ecd445e830d 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -142,7 +142,7 @@ u64 stable_page_flags(struct page *page) /* - * Caveats on high order pages: page->_count will only be set + * Caveats on high order pages: page->_refcount will only be set * -1 on the head page; SLUB/SLQB do the same for PG_slab; * SLOB won't set PG_slab at all on compound pages. 
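A user-space sketch of the cross-thread naming case the proc_tid_comm_permission() comment above describes — glibc's pthread_set/getname_np() operating on /proc/self/task/<tid>/comm. Build with -pthread; the prctl(PR_SET_DUMPABLE, 0) call mimics the uid=0/gid=0 fallback the comment worries about:

#define _GNU_SOURCE
#include <pthread.h>
#include <stdio.h>
#include <string.h>
#include <sys/prctl.h>
#include <unistd.h>

static void *worker(void *arg)
{
	(void)arg;
	sleep(1);		/* stay alive long enough to be renamed */
	return NULL;
}

int main(void)
{
	pthread_t t;
	char name[16];
	int rc;

	prctl(PR_SET_DUMPABLE, 0);	/* the case the comment describes */
	pthread_create(&t, NULL, worker, NULL);

	/* Cross-thread write to /proc/self/task/<tid>/comm. */
	rc = pthread_setname_np(t, "demo-worker");
	if (rc)
		fprintf(stderr, "pthread_setname_np: %s\n", strerror(rc));

	rc = pthread_getname_np(t, name, sizeof(name));
	if (rc == 0)
		printf("thread name: %s\n", name);

	pthread_join(t, NULL);
	return 0;
}
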
*/ diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 5e57c3e46e1d..b59db94d2ff4 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -623,7 +623,7 @@ static bool proc_sys_fill_cache(struct file *file, qname.name = table->procname; qname.len = strlen(table->procname); - qname.hash = full_name_hash(qname.name, qname.len); + qname.hash = full_name_hash(dir, qname.name, qname.len); child = d_lookup(dir, &qname); if (!child) { diff --git a/fs/proc/root.c b/fs/proc/root.c index 55bc7d6c8aac..06702783bf40 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -121,6 +121,13 @@ static struct dentry *proc_mount(struct file_system_type *fs_type, if (IS_ERR(sb)) return ERR_CAST(sb); + /* + * procfs isn't actually a stacking filesystem; however, there is + * too much magic going on inside it to permit stacking things on + * top of it + */ + sb->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH; + if (!proc_parse_options(options, ns)) { deactivate_locked_super(sb); return ERR_PTR(-EINVAL); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 541583510cfb..187d84ef9de9 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -448,6 +448,7 @@ struct mem_size_stats { unsigned long referenced; unsigned long anonymous; unsigned long anonymous_thp; + unsigned long shmem_thp; unsigned long swap; unsigned long shared_hugetlb; unsigned long private_hugetlb; @@ -576,7 +577,12 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, page = follow_trans_huge_pmd(vma, addr, pmd, FOLL_DUMP); if (IS_ERR_OR_NULL(page)) return; - mss->anonymous_thp += HPAGE_PMD_SIZE; + if (PageAnon(page)) + mss->anonymous_thp += HPAGE_PMD_SIZE; + else if (PageSwapBacked(page)) + mss->shmem_thp += HPAGE_PMD_SIZE; + else + VM_BUG_ON_PAGE(1, page); smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd)); } #else @@ -770,6 +776,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid) "Referenced: %8lu kB\n" "Anonymous: %8lu kB\n" "AnonHugePages: %8lu kB\n" + "ShmemPmdMapped: %8lu kB\n" "Shared_Hugetlb: %8lu kB\n" "Private_Hugetlb: %7lu kB\n" "Swap: %8lu kB\n" @@ -787,6 +794,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid) mss.referenced >> 10, mss.anonymous >> 10, mss.anonymous_thp >> 10, + mss.shmem_thp >> 10, mss.shared_hugetlb >> 10, mss.private_hugetlb >> 10, mss.swap >> 10, @@ -1027,11 +1035,15 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, }; if (type == CLEAR_REFS_MM_HIWATER_RSS) { + if (down_write_killable(&mm->mmap_sem)) { + count = -EINTR; + goto out_mm; + } + /* * Writing 5 to /proc/pid/clear_refs resets the peak * resident set size to this mm's current rss value. */ - down_write(&mm->mmap_sem); reset_mm_hiwater_rss(mm); up_write(&mm->mmap_sem); goto out_mm; @@ -1043,7 +1055,10 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, if (!(vma->vm_flags & VM_SOFTDIRTY)) continue; up_read(&mm->mmap_sem); - down_write(&mm->mmap_sem); + if (down_write_killable(&mm->mmap_sem)) { + count = -EINTR; + goto out_mm; + } for (vma = mm->mmap; vma; vma = vma->vm_next) { vma->vm_flags &= ~VM_SOFTDIRTY; vma_set_page_prot(vma); diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 8afe10cf7df8..8ab782d8b33d 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -1071,7 +1071,7 @@ static int __init parse_crash_elf32_headers(void) /* Do some basic Verification. 
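The clear_refs hunks above swap an unconditional down_write(&mm->mmap_sem) for down_write_killable(), returning -EINTR instead of blocking unkillably; the user-visible interface is unchanged. A sketch of the documented usage the comment mentions — writing 5 resets the peak RSS, which shows up as VmHWM in /proc/<pid>/status (assumes CONFIG_PROC_PAGE_MONITOR):

#include <stdio.h>
#include <string.h>

static void show_hiwater(void)
{
	FILE *f = fopen("/proc/self/status", "r");
	char line[128];

	while (f && fgets(line, sizeof(line), f))
		if (!strncmp(line, "VmHWM:", 6))
			fputs(line, stdout);
	if (f)
		fclose(f);
}

int main(void)
{
	FILE *f = fopen("/proc/self/clear_refs", "w");

	show_hiwater();
	if (!f) {
		perror("fopen clear_refs");
		return 1;
	}
	fputs("5\n", f);	/* CLEAR_REFS_MM_HIWATER_RSS */
	fclose(f);
	show_hiwater();
	return 0;
}
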
*/ if (memcmp(ehdr.e_ident, ELFMAG, SELFMAG) != 0 || (ehdr.e_type != ET_CORE) || - !elf_check_arch(&ehdr) || + !vmcore_elf32_check_arch(&ehdr) || ehdr.e_ident[EI_CLASS] != ELFCLASS32|| ehdr.e_ident[EI_VERSION] != EV_CURRENT || ehdr.e_version != EV_CURRENT || diff --git a/fs/pstore/Kconfig b/fs/pstore/Kconfig index 360ae43f590c..be40813eff52 100644 --- a/fs/pstore/Kconfig +++ b/fs/pstore/Kconfig @@ -1,8 +1,6 @@ config PSTORE tristate "Persistent store support" default n - select ZLIB_DEFLATE - select ZLIB_INFLATE help This option enables generic access to platform level persistent storage via "pstore" filesystem that can @@ -14,6 +12,35 @@ config PSTORE If you don't have a platform persistent store driver, say N. +choice + prompt "Choose compression algorithm" + depends on PSTORE + default PSTORE_ZLIB_COMPRESS + help + This option chooses compression algorithm. + +config PSTORE_ZLIB_COMPRESS + bool "ZLIB" + select ZLIB_DEFLATE + select ZLIB_INFLATE + help + This option enables ZLIB compression algorithm support. + +config PSTORE_LZO_COMPRESS + bool "LZO" + select LZO_COMPRESS + select LZO_DECOMPRESS + help + This option enables LZO compression algorithm support. + +config PSTORE_LZ4_COMPRESS + bool "LZ4" + select LZ4_COMPRESS + select LZ4_DECOMPRESS + help + This option enables LZ4 compression algorithm support. +endchoice + config PSTORE_CONSOLE bool "Log kernel console messages" depends on PSTORE diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index 45d6110744cb..ec9ddef5ae75 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -178,7 +178,6 @@ static loff_t pstore_file_llseek(struct file *file, loff_t off, int whence) } static const struct file_operations pstore_file_operations = { - .owner = THIS_MODULE, .open = pstore_file_open, .read = pstore_file_read, .llseek = pstore_file_llseek, diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 588461bb2dd4..16ecca5b72d8 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -28,7 +28,15 @@ #include <linux/console.h> #include <linux/module.h> #include <linux/pstore.h> +#ifdef CONFIG_PSTORE_ZLIB_COMPRESS #include <linux/zlib.h> +#endif +#ifdef CONFIG_PSTORE_LZO_COMPRESS +#include <linux/lzo.h> +#endif +#ifdef CONFIG_PSTORE_LZ4_COMPRESS +#include <linux/lz4.h> +#endif #include <linux/string.h> #include <linux/timer.h> #include <linux/slab.h> @@ -69,10 +77,23 @@ struct pstore_info *psinfo; static char *backend; /* Compression parameters */ +#ifdef CONFIG_PSTORE_ZLIB_COMPRESS #define COMPR_LEVEL 6 #define WINDOW_BITS 12 #define MEM_LEVEL 4 static struct z_stream_s stream; +#else +static unsigned char *workspace; +#endif + +struct pstore_zbackend { + int (*compress)(const void *in, void *out, size_t inlen, size_t outlen); + int (*decompress)(void *in, void *out, size_t inlen, size_t outlen); + void (*allocate)(void); + void (*free)(void); + + const char *name; +}; static char *big_oops_buf; static size_t big_oops_buf_sz; @@ -129,9 +150,9 @@ bool pstore_cannot_block_path(enum kmsg_dump_reason reason) } EXPORT_SYMBOL_GPL(pstore_cannot_block_path); +#ifdef CONFIG_PSTORE_ZLIB_COMPRESS /* Derived from logfs_compress() */ -static int pstore_compress(const void *in, void *out, size_t inlen, - size_t outlen) +static int compress_zlib(const void *in, void *out, size_t inlen, size_t outlen) { int err, ret; @@ -165,7 +186,7 @@ error: } /* Derived from logfs_uncompress */ -static int pstore_decompress(void *in, void *out, size_t inlen, size_t outlen) +static int decompress_zlib(void *in, void *out, size_t inlen, size_t outlen) { int 
err, ret; @@ -194,7 +215,7 @@ error: return ret; } -static void allocate_buf_for_compression(void) +static void allocate_zlib(void) { size_t size; size_t cmpr; @@ -237,12 +258,190 @@ static void allocate_buf_for_compression(void) } -static void free_buf_for_compression(void) +static void free_zlib(void) { kfree(stream.workspace); stream.workspace = NULL; kfree(big_oops_buf); big_oops_buf = NULL; + big_oops_buf_sz = 0; +} + +static struct pstore_zbackend backend_zlib = { + .compress = compress_zlib, + .decompress = decompress_zlib, + .allocate = allocate_zlib, + .free = free_zlib, + .name = "zlib", +}; +#endif + +#ifdef CONFIG_PSTORE_LZO_COMPRESS +static int compress_lzo(const void *in, void *out, size_t inlen, size_t outlen) +{ + int ret; + + ret = lzo1x_1_compress(in, inlen, out, &outlen, workspace); + if (ret != LZO_E_OK) { + pr_err("lzo_compress error, ret = %d!\n", ret); + return -EIO; + } + + return outlen; +} + +static int decompress_lzo(void *in, void *out, size_t inlen, size_t outlen) +{ + int ret; + + ret = lzo1x_decompress_safe(in, inlen, out, &outlen); + if (ret != LZO_E_OK) { + pr_err("lzo_decompress error, ret = %d!\n", ret); + return -EIO; + } + + return outlen; +} + +static void allocate_lzo(void) +{ + big_oops_buf_sz = lzo1x_worst_compress(psinfo->bufsize); + big_oops_buf = kmalloc(big_oops_buf_sz, GFP_KERNEL); + if (big_oops_buf) { + workspace = kmalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL); + if (!workspace) { + pr_err("No memory for compression workspace; skipping compression\n"); + kfree(big_oops_buf); + big_oops_buf = NULL; + } + } else { + pr_err("No memory for uncompressed data; skipping compression\n"); + workspace = NULL; + } +} + +static void free_lzo(void) +{ + kfree(workspace); + kfree(big_oops_buf); + big_oops_buf = NULL; + big_oops_buf_sz = 0; +} + +static struct pstore_zbackend backend_lzo = { + .compress = compress_lzo, + .decompress = decompress_lzo, + .allocate = allocate_lzo, + .free = free_lzo, + .name = "lzo", +}; +#endif + +#ifdef CONFIG_PSTORE_LZ4_COMPRESS +static int compress_lz4(const void *in, void *out, size_t inlen, size_t outlen) +{ + int ret; + + ret = lz4_compress(in, inlen, out, &outlen, workspace); + if (ret) { + pr_err("lz4_compress error, ret = %d!\n", ret); + return -EIO; + } + + return outlen; +} + +static int decompress_lz4(void *in, void *out, size_t inlen, size_t outlen) +{ + int ret; + + ret = lz4_decompress_unknownoutputsize(in, inlen, out, &outlen); + if (ret) { + pr_err("lz4_decompress error, ret = %d!\n", ret); + return -EIO; + } + + return outlen; +} + +static void allocate_lz4(void) +{ + big_oops_buf_sz = lz4_compressbound(psinfo->bufsize); + big_oops_buf = kmalloc(big_oops_buf_sz, GFP_KERNEL); + if (big_oops_buf) { + workspace = kmalloc(LZ4_MEM_COMPRESS, GFP_KERNEL); + if (!workspace) { + pr_err("No memory for compression workspace; skipping compression\n"); + kfree(big_oops_buf); + big_oops_buf = NULL; + } + } else { + pr_err("No memory for uncompressed data; skipping compression\n"); + workspace = NULL; + } +} + +static void free_lz4(void) +{ + kfree(workspace); + kfree(big_oops_buf); + big_oops_buf = NULL; + big_oops_buf_sz = 0; +} + +static struct pstore_zbackend backend_lz4 = { + .compress = compress_lz4, + .decompress = decompress_lz4, + .allocate = allocate_lz4, + .free = free_lz4, + .name = "lz4", +}; +#endif + +static struct pstore_zbackend *zbackend = +#if defined(CONFIG_PSTORE_ZLIB_COMPRESS) + &backend_zlib; +#elif defined(CONFIG_PSTORE_LZO_COMPRESS) + &backend_lzo; +#elif defined(CONFIG_PSTORE_LZ4_COMPRESS) + 
&backend_lz4; +#else + NULL; +#endif + +static int pstore_compress(const void *in, void *out, + size_t inlen, size_t outlen) +{ + if (zbackend) + return zbackend->compress(in, out, inlen, outlen); + else + return -EIO; +} + +static int pstore_decompress(void *in, void *out, size_t inlen, size_t outlen) +{ + if (zbackend) + return zbackend->decompress(in, out, inlen, outlen); + else + return -EIO; +} + +static void allocate_buf_for_compression(void) +{ + if (zbackend) { + pr_info("using %s compression\n", zbackend->name); + zbackend->allocate(); + } else { + pr_err("allocate compression buffer error!\n"); + } +} + +static void free_buf_for_compression(void) +{ + if (zbackend) + zbackend->free(); + else + pr_err("free compression buffer error!\n"); } /* @@ -284,7 +483,7 @@ static void pstore_dump(struct kmsg_dumper *dumper, u64 id; unsigned int part = 1; unsigned long flags = 0; - int is_locked = 0; + int is_locked; int ret; why = get_reason_str(reason); @@ -295,8 +494,10 @@ static void pstore_dump(struct kmsg_dumper *dumper, pr_err("pstore dump routine blocked in %s path, may corrupt error record\n" , in_nmi() ? "NMI" : why); } - } else + } else { spin_lock_irqsave(&psinfo->buf_lock, flags); + is_locked = 1; + } oopscount++; while (total < kmsg_bytes) { char *dst; @@ -304,19 +505,25 @@ static void pstore_dump(struct kmsg_dumper *dumper, int hsize; int zipped_len = -1; size_t len; - bool compressed; + bool compressed = false; size_t total_len; if (big_oops_buf && is_locked) { dst = big_oops_buf; - hsize = sprintf(dst, "%s#%d Part%u\n", why, - oopscount, part); - size = big_oops_buf_sz - hsize; + size = big_oops_buf_sz; + } else { + dst = psinfo->buf; + size = psinfo->bufsize; + } - if (!kmsg_dump_get_buffer(dumper, true, dst + hsize, - size, &len)) - break; + hsize = sprintf(dst, "%s#%d Part%u\n", why, oopscount, part); + size -= hsize; + + if (!kmsg_dump_get_buffer(dumper, true, dst + hsize, + size, &len)) + break; + if (big_oops_buf && is_locked) { zipped_len = pstore_compress(dst, psinfo->buf, hsize + len, psinfo->bufsize); @@ -324,21 +531,9 @@ static void pstore_dump(struct kmsg_dumper *dumper, compressed = true; total_len = zipped_len; } else { - compressed = false; total_len = copy_kmsg_to_buffer(hsize, len); } } else { - dst = psinfo->buf; - hsize = sprintf(dst, "%s#%d Part%u\n", why, oopscount, - part); - size = psinfo->bufsize - hsize; - dst += hsize; - - if (!kmsg_dump_get_buffer(dumper, true, dst, - size, &len)) - break; - - compressed = false; total_len = hsize + len; } @@ -350,10 +545,7 @@ static void pstore_dump(struct kmsg_dumper *dumper, total += total_len; part++; } - if (pstore_cannot_block_path(reason)) { - if (is_locked) - spin_unlock_irqrestore(&psinfo->buf_lock, flags); - } else + if (is_locked) spin_unlock_irqrestore(&psinfo->buf_lock, flags); } @@ -497,9 +689,11 @@ EXPORT_SYMBOL_GPL(pstore_register); void pstore_unregister(struct pstore_info *psi) { - pstore_unregister_pmsg(); - pstore_unregister_ftrace(); - pstore_unregister_console(); + if ((psi->flags & PSTORE_FLAGS_FRAGILE) == 0) { + pstore_unregister_pmsg(); + pstore_unregister_ftrace(); + pstore_unregister_console(); + } pstore_unregister_kmsg(); free_buf_for_compression(); @@ -527,6 +721,7 @@ void pstore_get_records(int quiet) int failed = 0, rc; bool compressed; int unzipped_len = -1; + ssize_t ecc_notice_size = 0; if (!psi) return; @@ -536,7 +731,7 @@ void pstore_get_records(int quiet) goto out; while ((size = psi->read(&id, &type, &count, &time, &buf, &compressed, - psi)) > 0) { + &ecc_notice_size, psi)) > 
0) { if (compressed && (type == PSTORE_TYPE_DMESG)) { if (big_oops_buf) unzipped_len = pstore_decompress(buf, @@ -544,6 +739,9 @@ void pstore_get_records(int quiet) big_oops_buf_sz); if (unzipped_len > 0) { + if (ecc_notice_size) + memcpy(big_oops_buf + unzipped_len, + buf + size, ecc_notice_size); kfree(buf); buf = big_oops_buf; size = unzipped_len; @@ -555,7 +753,8 @@ void pstore_get_records(int quiet) } } rc = pstore_mkfile(type, psi->name, id, count, buf, - compressed, (size_t)size, time, psi); + compressed, size + ecc_notice_size, + time, psi); if (unzipped_len < 0) { /* Free buffer other than big oops */ kfree(buf); diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index bd9812e83461..47516a794011 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -34,6 +34,8 @@ #include <linux/slab.h> #include <linux/compiler.h> #include <linux/pstore_ram.h> +#include <linux/of.h> +#include <linux/of_address.h> #define RAMOOPS_KERNMSG_HDR "====" #define MIN_MEM_SIZE 4096UL @@ -181,10 +183,10 @@ static bool prz_ok(struct persistent_ram_zone *prz) static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type, int *count, struct timespec *time, char **buf, bool *compressed, + ssize_t *ecc_notice_size, struct pstore_info *psi) { ssize_t size; - ssize_t ecc_notice_size; struct ramoops_context *cxt = psi->data; struct persistent_ram_zone *prz = NULL; int header_length = 0; @@ -229,16 +231,16 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type, size = persistent_ram_old_size(prz) - header_length; /* ECC correction notice */ - ecc_notice_size = persistent_ram_ecc_string(prz, NULL, 0); + *ecc_notice_size = persistent_ram_ecc_string(prz, NULL, 0); - *buf = kmalloc(size + ecc_notice_size + 1, GFP_KERNEL); + *buf = kmalloc(size + *ecc_notice_size + 1, GFP_KERNEL); if (*buf == NULL) return -ENOMEM; memcpy(*buf, (char *)persistent_ram_old(prz) + header_length, size); - persistent_ram_ecc_string(prz, *buf + size, ecc_notice_size + 1); + persistent_ram_ecc_string(prz, *buf + size, *ecc_notice_size + 1); - return size + ecc_notice_size; + return size; } static size_t ramoops_write_kmsg_hdr(struct persistent_ram_zone *prz, @@ -458,15 +460,98 @@ static int ramoops_init_prz(struct device *dev, struct ramoops_context *cxt, return 0; } +static int ramoops_parse_dt_size(struct platform_device *pdev, + const char *propname, u32 *value) +{ + u32 val32 = 0; + int ret; + + ret = of_property_read_u32(pdev->dev.of_node, propname, &val32); + if (ret < 0 && ret != -EINVAL) { + dev_err(&pdev->dev, "failed to parse property %s: %d\n", + propname, ret); + return ret; + } + + if (val32 > INT_MAX) { + dev_err(&pdev->dev, "%s %u > INT_MAX\n", propname, val32); + return -EOVERFLOW; + } + + *value = val32; + return 0; +} + +static int ramoops_parse_dt(struct platform_device *pdev, + struct ramoops_platform_data *pdata) +{ + struct device_node *of_node = pdev->dev.of_node; + struct device_node *mem_region; + struct resource res; + u32 value; + int ret; + + dev_dbg(&pdev->dev, "using Device Tree\n"); + + mem_region = of_parse_phandle(of_node, "memory-region", 0); + if (!mem_region) { + dev_err(&pdev->dev, "no memory-region phandle\n"); + return -ENODEV; + } + + ret = of_address_to_resource(mem_region, 0, &res); + of_node_put(mem_region); + if (ret) { + dev_err(&pdev->dev, + "failed to translate memory-region to resource: %d\n", + ret); + return ret; + } + + pdata->mem_size = resource_size(&res); + pdata->mem_address = res.start; + pdata->mem_type = of_property_read_bool(of_node, "unbuffered"); + 
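Device tree is only one way to hand ramoops its region; boards without DT keep using platform data carrying the same fields ramoops_parse_dt() fills in above. A board-code sketch (kernel context, so a module or board file rather than a standalone program; the address, sizes and ECC value are placeholders, not values taken from this patch):

#include <linux/init.h>
#include <linux/platform_device.h>
#include <linux/pstore_ram.h>
#include <linux/sizes.h>

static struct ramoops_platform_data ramoops_data = {
	.mem_size	= SZ_1M,		/* placeholder */
	.mem_address	= 0x8f000000,		/* placeholder: reserved RAM */
	.record_size	= SZ_64K,
	.dump_oops	= 1,
	.ecc_info	= { .ecc_size = 16 },	/* same field the DT parser fills */
};

static struct platform_device ramoops_dev = {
	.name	= "ramoops",
	.dev	= {
		.platform_data = &ramoops_data,
	},
};

static int __init board_ramoops_init(void)
{
	return platform_device_register(&ramoops_dev);
}
device_initcall(board_ramoops_init);
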
pdata->dump_oops = !of_property_read_bool(of_node, "no-dump-oops"); + +#define parse_size(name, field) { \ + ret = ramoops_parse_dt_size(pdev, name, &value); \ + if (ret < 0) \ + return ret; \ + field = value; \ + } + + parse_size("record-size", pdata->record_size); + parse_size("console-size", pdata->console_size); + parse_size("ftrace-size", pdata->ftrace_size); + parse_size("pmsg-size", pdata->pmsg_size); + parse_size("ecc-size", pdata->ecc_info.ecc_size); + +#undef parse_size + + return 0; +} + static int ramoops_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; - struct ramoops_platform_data *pdata = pdev->dev.platform_data; + struct ramoops_platform_data *pdata = dev->platform_data; struct ramoops_context *cxt = &oops_cxt; size_t dump_mem_sz; phys_addr_t paddr; int err = -EINVAL; + if (dev_of_node(dev) && !pdata) { + pdata = devm_kzalloc(&pdev->dev, sizeof(*pdata), GFP_KERNEL); + if (!pdata) { + err = -ENOMEM; + goto fail_out; + } + + err = ramoops_parse_dt(pdev, pdata); + if (err < 0) + goto fail_out; + } + /* Only a single ramoops area allowed at a time, so fail extra * probes. */ @@ -596,11 +681,17 @@ static int ramoops_remove(struct platform_device *pdev) return 0; } +static const struct of_device_id dt_match[] = { + { .compatible = "ramoops" }, + {} +}; + static struct platform_driver ramoops_driver = { .probe = ramoops_probe, .remove = ramoops_remove, .driver = { - .name = "ramoops", + .name = "ramoops", + .of_match_table = dt_match, }, }; diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index a586467f6ff6..be3ddd189cd4 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -211,14 +211,11 @@ static unsigned long ramfs_nommu_get_unmapped_area(struct file *file, struct page **pages = NULL, **ptr, *page; loff_t isize; - if (!(flags & MAP_SHARED)) - return addr; - /* the mapping mustn't extend beyond the EOF */ lpages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT; isize = i_size_read(inode); - ret = -EINVAL; + ret = -ENOSYS; maxpages = (isize + PAGE_SIZE - 1) >> PAGE_SHIFT; if (pgoff >= maxpages) goto out; @@ -227,7 +224,6 @@ static unsigned long ramfs_nommu_get_unmapped_area(struct file *file, goto out; /* gang-find the pages */ - ret = -ENOMEM; pages = kcalloc(lpages, sizeof(struct page *), GFP_KERNEL); if (!pages) goto out_free; @@ -263,7 +259,7 @@ out: */ static int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma) { - if (!(vma->vm_flags & VM_SHARED)) + if (!(vma->vm_flags & (VM_SHARED | VM_MAYSHARE))) return -ENOSYS; file_accessed(file); diff --git a/fs/read_write.c b/fs/read_write.c index 933b53a375b4..66215a7b17cf 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1168,6 +1168,15 @@ COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd, return do_compat_preadv64(fd, vec, vlen, pos, 0); } +#ifdef __ARCH_WANT_COMPAT_SYS_PREADV64V2 +COMPAT_SYSCALL_DEFINE5(preadv64v2, unsigned long, fd, + const struct compat_iovec __user *,vec, + unsigned long, vlen, loff_t, pos, int, flags) +{ + return do_compat_preadv64(fd, vec, vlen, pos, flags); +} +#endif + COMPAT_SYSCALL_DEFINE6(preadv2, compat_ulong_t, fd, const struct compat_iovec __user *,vec, compat_ulong_t, vlen, u32, pos_low, u32, pos_high, @@ -1265,6 +1274,15 @@ COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd, return do_compat_pwritev64(fd, vec, vlen, pos, 0); } +#ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64V2 +COMPAT_SYSCALL_DEFINE5(pwritev64v2, unsigned long, fd, + const struct compat_iovec __user *,vec, + unsigned long, vlen, loff_t, pos, int, flags) +{ + return 
do_compat_pwritev64(fd, vec, vlen, pos, flags); +} +#endif + COMPAT_SYSCALL_DEFINE6(pwritev2, compat_ulong_t, fd, const struct compat_iovec __user *,vec, compat_ulong_t, vlen, u32, pos_low, u32, pos_high, int, flags) diff --git a/fs/readdir.c b/fs/readdir.c index a86c6c04b9bc..9d0212c374d6 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -35,13 +35,13 @@ int iterate_dir(struct file *file, struct dir_context *ctx) if (res) goto out; - if (shared) + if (shared) { inode_lock_shared(inode); - else - inode_lock(inode); - // res = mutex_lock_killable(&inode->i_mutex); - // if (res) - // goto out; + } else { + res = down_write_killable(&inode->i_rwsem); + if (res) + goto out; + } res = -ENOENT; if (!IS_DEADDIR(inode)) { @@ -182,6 +182,8 @@ static int filldir(struct dir_context *ctx, const char *name, int namlen, } dirent = buf->previous; if (dirent) { + if (signal_pending(current)) + return -EINTR; if (__put_user(offset, &dirent->d_off)) goto efault; } @@ -261,6 +263,8 @@ static int filldir64(struct dir_context *ctx, const char *name, int namlen, return -EINVAL; dirent = buf->previous; if (dirent) { + if (signal_pending(current)) + return -EINTR; if (__put_user(offset, &dirent->d_off)) goto efault; } diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 825455d3e4ba..c2c59f9ff04b 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -2668,7 +2668,7 @@ static int reiserfs_write_full_page(struct page *page, do { struct buffer_head *next = bh->b_this_page; if (buffer_async_write(bh)) { - submit_bh(WRITE, bh); + submit_bh(REQ_OP_WRITE, 0, bh); nr++; } put_bh(bh); @@ -2728,7 +2728,7 @@ fail: struct buffer_head *next = bh->b_this_page; if (buffer_async_write(bh)) { clear_buffer_dirty(bh); - submit_bh(WRITE, bh); + submit_bh(REQ_OP_WRITE, 0, bh); nr++; } put_bh(bh); diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 2ace90e981f0..bc2dde2423c2 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -652,7 +652,7 @@ static void submit_logged_buffer(struct buffer_head *bh) BUG(); if (!buffer_uptodate(bh)) BUG(); - submit_bh(WRITE, bh); + submit_bh(REQ_OP_WRITE, 0, bh); } static void submit_ordered_buffer(struct buffer_head *bh) @@ -662,7 +662,7 @@ static void submit_ordered_buffer(struct buffer_head *bh) clear_buffer_dirty(bh); if (!buffer_uptodate(bh)) BUG(); - submit_bh(WRITE, bh); + submit_bh(REQ_OP_WRITE, 0, bh); } #define CHUNK_SIZE 32 @@ -870,7 +870,7 @@ loop_next: */ if (buffer_dirty(bh) && unlikely(bh->b_page->mapping == NULL)) { spin_unlock(lock); - ll_rw_block(WRITE, 1, &bh); + ll_rw_block(REQ_OP_WRITE, 0, 1, &bh); spin_lock(lock); } put_bh(bh); @@ -1057,7 +1057,7 @@ static int flush_commit_list(struct super_block *s, if (tbh) { if (buffer_dirty(tbh)) { depth = reiserfs_write_unlock_nested(s); - ll_rw_block(WRITE, 1, &tbh); + ll_rw_block(REQ_OP_WRITE, 0, 1, &tbh); reiserfs_write_lock_nested(s, depth); } put_bh(tbh) ; @@ -2244,7 +2244,7 @@ abort_replay: } } /* read in the log blocks, memcpy to the corresponding real block */ - ll_rw_block(READ, get_desc_trans_len(desc), log_blocks); + ll_rw_block(REQ_OP_READ, 0, get_desc_trans_len(desc), log_blocks); for (i = 0; i < get_desc_trans_len(desc); i++) { wait_on_buffer(log_blocks[i]); @@ -2269,7 +2269,7 @@ abort_replay: /* flush out the real blocks */ for (i = 0; i < get_desc_trans_len(desc); i++) { set_buffer_dirty(real_blocks[i]); - write_dirty_buffer(real_blocks[i], WRITE); + write_dirty_buffer(real_blocks[i], 0); } for (i = 0; i < get_desc_trans_len(desc); i++) { wait_on_buffer(real_blocks[i]); @@ -2346,7 
+2346,7 @@ static struct buffer_head *reiserfs_breada(struct block_device *dev, } else bhlist[j++] = bh; } - ll_rw_block(READ, j, bhlist); + ll_rw_block(REQ_OP_READ, 0, j, bhlist); for (i = 1; i < j; i++) brelse(bhlist[i]); bh = bhlist[0]; diff --git a/fs/reiserfs/objectid.c b/fs/reiserfs/objectid.c index 99a5d5dae46a..415d66ca87d1 100644 --- a/fs/reiserfs/objectid.c +++ b/fs/reiserfs/objectid.c @@ -3,8 +3,8 @@ */ #include <linux/string.h> -#include <linux/random.h> #include <linux/time.h> +#include <linux/uuid.h> #include "reiserfs.h" /* find where objectid map starts */ diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c index 5feacd689241..4032d1e87c8f 100644 --- a/fs/reiserfs/stree.c +++ b/fs/reiserfs/stree.c @@ -551,7 +551,7 @@ static int search_by_key_reada(struct super_block *s, if (!buffer_uptodate(bh[j])) { if (depth == -1) depth = reiserfs_write_unlock_nested(s); - ll_rw_block(READA, 1, bh + j); + ll_rw_block(REQ_OP_READ, REQ_RAHEAD, 1, bh + j); } brelse(bh[j]); } @@ -660,7 +660,7 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key, if (!buffer_uptodate(bh) && depth == -1) depth = reiserfs_write_unlock_nested(sb); - ll_rw_block(READ, 1, &bh); + ll_rw_block(REQ_OP_READ, 0, 1, &bh); wait_on_buffer(bh); if (depth != -1) diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index c72c16c5a60f..7a4a85a6821e 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -1666,7 +1666,7 @@ static int read_super_block(struct super_block *s, int offset) /* after journal replay, reread all bitmap and super blocks */ static int reread_meta_blocks(struct super_block *s) { - ll_rw_block(READ, 1, &SB_BUFFER_WITH_SB(s)); + ll_rw_block(REQ_OP_READ, 0, 1, &SB_BUFFER_WITH_SB(s)); wait_on_buffer(SB_BUFFER_WITH_SB(s)); if (!buffer_uptodate(SB_BUFFER_WITH_SB(s))) { reiserfs_warning(s, "reiserfs-2504", "error reading the super"); diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c index 86aeb9dd805a..e4cbb7719906 100644 --- a/fs/reiserfs/xattr_security.c +++ b/fs/reiserfs/xattr_security.c @@ -20,13 +20,14 @@ security_get(const struct xattr_handler *handler, struct dentry *unused, } static int -security_set(const struct xattr_handler *handler, struct dentry *dentry, - const char *name, const void *buffer, size_t size, int flags) +security_set(const struct xattr_handler *handler, struct dentry *unused, + struct inode *inode, const char *name, const void *buffer, + size_t size, int flags) { - if (IS_PRIVATE(d_inode(dentry))) + if (IS_PRIVATE(inode)) return -EPERM; - return reiserfs_xattr_set(d_inode(dentry), + return reiserfs_xattr_set(inode, xattr_full_name(handler, name), buffer, size, flags); } diff --git a/fs/reiserfs/xattr_trusted.c b/fs/reiserfs/xattr_trusted.c index 31837f031f59..f15a5f9e84ce 100644 --- a/fs/reiserfs/xattr_trusted.c +++ b/fs/reiserfs/xattr_trusted.c @@ -19,13 +19,14 @@ trusted_get(const struct xattr_handler *handler, struct dentry *unused, } static int -trusted_set(const struct xattr_handler *handler, struct dentry *dentry, - const char *name, const void *buffer, size_t size, int flags) +trusted_set(const struct xattr_handler *handler, struct dentry *unused, + struct inode *inode, const char *name, const void *buffer, + size_t size, int flags) { - if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(d_inode(dentry))) + if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(inode)) return -EPERM; - return reiserfs_xattr_set(d_inode(dentry), + return reiserfs_xattr_set(inode, xattr_full_name(handler, name), buffer, size, flags); } diff --git 
a/fs/reiserfs/xattr_user.c b/fs/reiserfs/xattr_user.c index f7c39731684b..dc59df43b2db 100644 --- a/fs/reiserfs/xattr_user.c +++ b/fs/reiserfs/xattr_user.c @@ -17,12 +17,13 @@ user_get(const struct xattr_handler *handler, struct dentry *unused, } static int -user_set(const struct xattr_handler *handler, struct dentry *dentry, - const char *name, const void *buffer, size_t size, int flags) +user_set(const struct xattr_handler *handler, struct dentry *unused, + struct inode *inode, const char *name, const void *buffer, + size_t size, int flags) { - if (!reiserfs_xattrs_user(dentry->d_sb)) + if (!reiserfs_xattrs_user(inode->i_sb)) return -EOPNOTSUPP; - return reiserfs_xattr_set(d_inode(dentry), + return reiserfs_xattr_set(inode, xattr_full_name(handler, name), buffer, size, flags); } diff --git a/fs/select.c b/fs/select.c index 869293988c2a..8ed9da50896a 100644 --- a/fs/select.c +++ b/fs/select.c @@ -47,7 +47,7 @@ #define MAX_SLACK (100 * NSEC_PER_MSEC) -static long __estimate_accuracy(struct timespec *tv) +static long __estimate_accuracy(struct timespec64 *tv) { long slack; int divfactor = 1000; @@ -70,10 +70,10 @@ static long __estimate_accuracy(struct timespec *tv) return slack; } -u64 select_estimate_accuracy(struct timespec *tv) +u64 select_estimate_accuracy(struct timespec64 *tv) { u64 ret; - struct timespec now; + struct timespec64 now; /* * Realtime tasks get a slack of 0 for obvious reasons. @@ -82,8 +82,8 @@ u64 select_estimate_accuracy(struct timespec *tv) if (rt_task(current)) return 0; - ktime_get_ts(&now); - now = timespec_sub(*tv, now); + ktime_get_ts64(&now); + now = timespec64_sub(*tv, now); ret = __estimate_accuracy(&now); if (ret < current->timer_slack_ns) return current->timer_slack_ns; @@ -260,7 +260,7 @@ EXPORT_SYMBOL(poll_schedule_timeout); /** * poll_select_set_timeout - helper function to setup the timeout value - * @to: pointer to timespec variable for the final timeout + * @to: pointer to timespec64 variable for the final timeout * @sec: seconds (from user space) * @nsec: nanoseconds (from user space) * @@ -269,26 +269,28 @@ EXPORT_SYMBOL(poll_schedule_timeout); * * Returns -EINVAL if sec/nsec are not normalized. Otherwise 0. 
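The select_estimate_accuracy() conversion above keeps the same heuristic, just on timespec64: real-time tasks get zero slack, everyone else gets roughly 0.1% of the time remaining to the deadline, clamped to MAX_SLACK and floored at the task's timer_slack_ns. A simplified userspace sketch of the non-RT path, assuming the divide-by-1000 divfactor shown in __estimate_accuracy():

#include <stdint.h>
#include <stdio.h>
#include <time.h>

#define NSEC_PER_SEC 1000000000L
#define MAX_SLACK (100 * 1000000L) /* 100 ms, as in fs/select.c */

/* Slack scales with how far away the absolute deadline is. */
static long estimate_accuracy(const struct timespec *deadline)
{
    struct timespec now;
    int64_t left;

    clock_gettime(CLOCK_MONOTONIC, &now);
    left = (int64_t)(deadline->tv_sec - now.tv_sec) * NSEC_PER_SEC +
           (deadline->tv_nsec - now.tv_nsec);
    if (left <= 0)
        return 0; /* deadline already passed: no slack */
    left /= 1000; /* the divfactor in __estimate_accuracy() */
    return left > MAX_SLACK ? MAX_SLACK : (long)left;
}

int main(void)
{
    struct timespec deadline;

    clock_gettime(CLOCK_MONOTONIC, &deadline);
    deadline.tv_sec += 5; /* deadline 5 s out */
    printf("slack: %ld ns\n", estimate_accuracy(&deadline)); /* ~5000000 */
    return 0;
}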
*/ -int poll_select_set_timeout(struct timespec *to, long sec, long nsec) +int poll_select_set_timeout(struct timespec64 *to, time64_t sec, long nsec) { - struct timespec ts = {.tv_sec = sec, .tv_nsec = nsec}; + struct timespec64 ts = {.tv_sec = sec, .tv_nsec = nsec}; - if (!timespec_valid(&ts)) + if (!timespec64_valid(&ts)) return -EINVAL; /* Optimize for the zero timeout value here */ if (!sec && !nsec) { to->tv_sec = to->tv_nsec = 0; } else { - ktime_get_ts(to); - *to = timespec_add_safe(*to, ts); + ktime_get_ts64(to); + *to = timespec64_add_safe(*to, ts); } return 0; } -static int poll_select_copy_remaining(struct timespec *end_time, void __user *p, +static int poll_select_copy_remaining(struct timespec64 *end_time, + void __user *p, int timeval, int ret) { + struct timespec64 rts64; struct timespec rts; struct timeval rtv; @@ -302,16 +304,18 @@ static int poll_select_copy_remaining(struct timespec *end_time, void __user *p, if (!end_time->tv_sec && !end_time->tv_nsec) return ret; - ktime_get_ts(&rts); - rts = timespec_sub(*end_time, rts); - if (rts.tv_sec < 0) - rts.tv_sec = rts.tv_nsec = 0; + ktime_get_ts64(&rts64); + rts64 = timespec64_sub(*end_time, rts64); + if (rts64.tv_sec < 0) + rts64.tv_sec = rts64.tv_nsec = 0; + + rts = timespec64_to_timespec(rts64); if (timeval) { if (sizeof(rtv) > sizeof(rtv.tv_sec) + sizeof(rtv.tv_usec)) memset(&rtv, 0, sizeof(rtv)); - rtv.tv_sec = rts.tv_sec; - rtv.tv_usec = rts.tv_nsec / NSEC_PER_USEC; + rtv.tv_sec = rts64.tv_sec; + rtv.tv_usec = rts64.tv_nsec / NSEC_PER_USEC; if (!copy_to_user(p, &rtv, sizeof(rtv))) return ret; @@ -396,7 +400,7 @@ static inline void wait_key_set(poll_table *wait, unsigned long in, wait->_key |= POLLOUT_SET; } -int do_select(int n, fd_set_bits *fds, struct timespec *end_time) +int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time) { ktime_t expire, *to = NULL; struct poll_wqueues table; @@ -522,7 +526,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) * pointer to the expiry value. */ if (end_time && !to) { - expire = timespec_to_ktime(*end_time); + expire = timespec64_to_ktime(*end_time); to = &expire; } @@ -545,7 +549,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) * I'm trying ERESTARTNOHAND which restart only when you want to. 
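poll_select_copy_remaining() above now does all of its arithmetic in timespec64: subtract the current time from the absolute deadline, clamp a negative result to zero, and only at the end convert to the user-visible timeval. The same sequence in userspace:

#include <stdio.h>
#include <sys/time.h>
#include <time.h>

/* Time left until an absolute CLOCK_MONOTONIC deadline, as a timeval. */
static struct timeval time_left(const struct timespec *end)
{
    struct timespec now, left;
    struct timeval tv;

    clock_gettime(CLOCK_MONOTONIC, &now);
    left.tv_sec = end->tv_sec - now.tv_sec;
    left.tv_nsec = end->tv_nsec - now.tv_nsec;
    if (left.tv_nsec < 0) { /* borrow, as timespec64_sub() would */
        left.tv_sec--;
        left.tv_nsec += 1000000000L;
    }
    if (left.tv_sec < 0) /* deadline already passed */
        left.tv_sec = left.tv_nsec = 0;
    tv.tv_sec = left.tv_sec;
    tv.tv_usec = left.tv_nsec / 1000; /* NSEC_PER_USEC */
    return tv;
}

int main(void)
{
    struct timespec end;
    struct timeval tv;

    clock_gettime(CLOCK_MONOTONIC, &end);
    end.tv_sec += 2;
    tv = time_left(&end);
    printf("%ld.%06ld s left\n", (long)tv.tv_sec, (long)tv.tv_usec);
    return 0;
}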
*/ int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, - fd_set __user *exp, struct timespec *end_time) + fd_set __user *exp, struct timespec64 *end_time) { fd_set_bits fds; void *bits; @@ -622,7 +626,7 @@ out_nofds: SYSCALL_DEFINE5(select, int, n, fd_set __user *, inp, fd_set __user *, outp, fd_set __user *, exp, struct timeval __user *, tvp) { - struct timespec end_time, *to = NULL; + struct timespec64 end_time, *to = NULL; struct timeval tv; int ret; @@ -648,15 +652,17 @@ static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp, const sigset_t __user *sigmask, size_t sigsetsize) { sigset_t ksigmask, sigsaved; - struct timespec ts, end_time, *to = NULL; + struct timespec ts; + struct timespec64 ts64, end_time, *to = NULL; int ret; if (tsp) { if (copy_from_user(&ts, tsp, sizeof(ts))) return -EFAULT; + ts64 = timespec_to_timespec64(ts); to = &end_time; - if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) + if (poll_select_set_timeout(to, ts64.tv_sec, ts64.tv_nsec)) return -EINVAL; } @@ -779,7 +785,7 @@ static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait, } static int do_poll(struct poll_list *list, struct poll_wqueues *wait, - struct timespec *end_time) + struct timespec64 *end_time) { poll_table* pt = &wait->pt; ktime_t expire, *to = NULL; @@ -854,7 +860,7 @@ static int do_poll(struct poll_list *list, struct poll_wqueues *wait, * pointer to the expiry value. */ if (end_time && !to) { - expire = timespec_to_ktime(*end_time); + expire = timespec64_to_ktime(*end_time); to = &expire; } @@ -868,7 +874,7 @@ static int do_poll(struct poll_list *list, struct poll_wqueues *wait, sizeof(struct pollfd)) int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, - struct timespec *end_time) + struct timespec64 *end_time) { struct poll_wqueues table; int err = -EFAULT, fdcount, len, size; @@ -936,7 +942,7 @@ static long do_restart_poll(struct restart_block *restart_block) { struct pollfd __user *ufds = restart_block->poll.ufds; int nfds = restart_block->poll.nfds; - struct timespec *to = NULL, end_time; + struct timespec64 *to = NULL, end_time; int ret; if (restart_block->poll.has_timeout) { @@ -957,7 +963,7 @@ static long do_restart_poll(struct restart_block *restart_block) SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds, int, timeout_msecs) { - struct timespec end_time, *to = NULL; + struct timespec64 end_time, *to = NULL; int ret; if (timeout_msecs >= 0) { @@ -993,7 +999,8 @@ SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds, size_t, sigsetsize) { sigset_t ksigmask, sigsaved; - struct timespec ts, end_time, *to = NULL; + struct timespec ts; + struct timespec64 end_time, *to = NULL; int ret; if (tsp) { diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index 2c2618410d51..ce62a380314f 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -124,7 +124,7 @@ int squashfs_read_data(struct super_block *sb, u64 index, int length, goto block_release; bytes += msblk->devblksize; } - ll_rw_block(READ, b, bh); + ll_rw_block(REQ_OP_READ, 0, b, bh); } else { /* * Metadata block. 
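For poll(2), the millisecond timeout seen in SYSCALL_DEFINE3(poll, ...) above is converted into the same kind of absolute timespec64 deadline through poll_select_set_timeout(). The split into seconds and nanoseconds is elided in this hunk, but in userspace terms the conversion is roughly (normalization included):

#include <stdio.h>
#include <time.h>

/* Turn a relative timeout in milliseconds into an absolute deadline. */
static struct timespec ms_to_deadline(int timeout_msecs)
{
    struct timespec to;

    clock_gettime(CLOCK_MONOTONIC, &to);
    to.tv_sec += timeout_msecs / 1000;
    to.tv_nsec += (long)(timeout_msecs % 1000) * 1000000L;
    if (to.tv_nsec >= 1000000000L) { /* keep tv_nsec normalized */
        to.tv_sec++;
        to.tv_nsec -= 1000000000L;
    }
    return to;
}

int main(void)
{
    struct timespec to = ms_to_deadline(1500);

    printf("deadline: %ld.%09ld\n", (long)to.tv_sec, to.tv_nsec);
    return 0;
}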
@@ -156,7 +156,7 @@ int squashfs_read_data(struct super_block *sb, u64 index, int length, goto block_release; bytes += msblk->devblksize; } - ll_rw_block(READ, b - 1, bh + 1); + ll_rw_block(REQ_OP_READ, 0, b - 1, bh + 1); } for (i = 0; i < b; i++) { diff --git a/fs/super.c b/fs/super.c index d78b9847e6cb..5806ffd45563 100644 --- a/fs/super.c +++ b/fs/super.c @@ -206,6 +206,8 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags) mutex_init(&s->s_sync_lock); INIT_LIST_HEAD(&s->s_inodes); spin_lock_init(&s->s_inode_list_lock); + INIT_LIST_HEAD(&s->s_inodes_wb); + spin_lock_init(&s->s_inode_wblist_lock); if (list_lru_init_memcg(&s->s_dentry_lru)) goto fail; diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c index 90b60c03b588..a42de45ce40d 100644 --- a/fs/sysv/namei.c +++ b/fs/sysv/namei.c @@ -33,7 +33,7 @@ static int sysv_hash(const struct dentry *dentry, struct qstr *qstr) function. */ if (qstr->len > SYSV_NAMELEN) { qstr->len = SYSV_NAMELEN; - qstr->hash = full_name_hash(qstr->name, qstr->len); + qstr->hash = full_name_hash(dentry, qstr->name, qstr->len); } return 0; } diff --git a/fs/timerfd.c b/fs/timerfd.c index 053818dd6c18..9ae4abb4110b 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -390,6 +390,11 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags) clockid != CLOCK_BOOTTIME_ALARM)) return -EINVAL; + if (!capable(CAP_WAKE_ALARM) && + (clockid == CLOCK_REALTIME_ALARM || + clockid == CLOCK_BOOTTIME_ALARM)) + return -EPERM; + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) return -ENOMEM; @@ -433,6 +438,11 @@ static int do_timerfd_settime(int ufd, int flags, return ret; ctx = f.file->private_data; + if (!capable(CAP_WAKE_ALARM) && isalarm(ctx)) { + fdput(f); + return -EPERM; + } + timerfd_setup_cancel(ctx, flags); /* diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index 4a0e48f92104..ad40b64c5e2f 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -541,9 +541,6 @@ void tracefs_remove(struct dentry *dentry) return; parent = dentry->d_parent; - if (!parent || !parent->d_inode) - return; - inode_lock(parent->d_inode); ret = __tracefs_remove(dentry, parent); inode_unlock(parent->d_inode); @@ -566,10 +563,6 @@ void tracefs_remove_recursive(struct dentry *dentry) if (IS_ERR_OR_NULL(dentry)) return; - parent = dentry->d_parent; - if (!parent || !parent->d_inode) - return; - parent = dentry; down: inode_lock(parent->d_inode); diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index 595ca0debe11..69e287e20732 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -260,7 +260,7 @@ void ubifs_dump_inode(struct ubifs_info *c, const struct inode *inode) pr_err("\txattr_names %u\n", ui->xattr_names); pr_err("\tdirty %u\n", ui->dirty); pr_err("\txattr %u\n", ui->xattr); - pr_err("\tbulk_read %u\n", ui->xattr); + pr_err("\tbulk_read %u\n", ui->bulk_read); pr_err("\tsynced_i_size %llu\n", (unsigned long long)ui->synced_i_size); pr_err("\tui_size %llu\n", diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 08316972ff93..7bbf420d1289 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -52,6 +52,7 @@ #include "ubifs.h" #include <linux/mount.h> #include <linux/slab.h> +#include <linux/migrate.h> static int read_block(struct inode *inode, void *addr, unsigned int block, struct ubifs_data_node *dn) @@ -1452,6 +1453,26 @@ static int ubifs_set_page_dirty(struct page *page) return ret; } +#ifdef CONFIG_MIGRATION +static int ubifs_migrate_page(struct address_space *mapping, + struct page *newpage, struct page *page, enum migrate_mode mode) +{ + int rc; + + rc = 
migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0); + if (rc != MIGRATEPAGE_SUCCESS) + return rc; + + if (PagePrivate(page)) { + ClearPagePrivate(page); + SetPagePrivate(newpage); + } + + migrate_page_copy(newpage, page); + return MIGRATEPAGE_SUCCESS; +} +#endif + static int ubifs_releasepage(struct page *page, gfp_t unused_gfp_flags) { /* @@ -1591,6 +1612,9 @@ const struct address_space_operations ubifs_file_address_operations = { .write_end = ubifs_write_end, .invalidatepage = ubifs_invalidatepage, .set_page_dirty = ubifs_set_page_dirty, +#ifdef CONFIG_MIGRATION + .migratepage = ubifs_migrate_page, +#endif .releasepage = ubifs_releasepage, }; diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c index f4fbc7b6b794..3cbb904a6d7d 100644 --- a/fs/ubifs/sb.c +++ b/fs/ubifs/sb.c @@ -28,8 +28,8 @@ #include "ubifs.h" #include <linux/slab.h> -#include <linux/random.h> #include <linux/math64.h> +#include <linux/uuid.h> /* * Default journal size in logical eraseblocks as a percent of total diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index 6c277eb6aef9..b5fc27969e9d 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -579,11 +579,10 @@ static int ubifs_xattr_get(const struct xattr_handler *handler, } static int ubifs_xattr_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) + struct dentry *dentry, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { - struct inode *inode = d_inode(dentry); - dbg_gen("xattr '%s', host ino %lu ('%pd'), size %zd", name, inode->i_ino, dentry, size); diff --git a/fs/udf/dir.c b/fs/udf/dir.c index 4c5593abc553..aaec13c95253 100644 --- a/fs/udf/dir.c +++ b/fs/udf/dir.c @@ -113,7 +113,7 @@ static int udf_readdir(struct file *file, struct dir_context *ctx) brelse(tmp); } if (num) { - ll_rw_block(READA, num, bha); + ll_rw_block(REQ_OP_READ, REQ_RAHEAD, num, bha); for (i = 0; i < num; i++) brelse(bha[i]); } diff --git a/fs/udf/directory.c b/fs/udf/directory.c index c763fda257bf..988d5352bdb8 100644 --- a/fs/udf/directory.c +++ b/fs/udf/directory.c @@ -87,7 +87,7 @@ struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos, brelse(tmp); } if (num) { - ll_rw_block(READA, num, bha); + ll_rw_block(REQ_OP_READ, REQ_RAHEAD, num, bha); for (i = 0; i < num; i++) brelse(bha[i]); } diff --git a/fs/udf/inode.c b/fs/udf/inode.c index f323aff740ef..55aa587bbc38 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -1199,7 +1199,7 @@ struct buffer_head *udf_bread(struct inode *inode, int block, if (buffer_uptodate(bh)) return bh; - ll_rw_block(READ, 1, &bh); + ll_rw_block(REQ_OP_READ, 0, 1, &bh); wait_on_buffer(bh); if (buffer_uptodate(bh)) diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c index 0447b949c7f5..67e085d591d8 100644 --- a/fs/ufs/balloc.c +++ b/fs/ufs/balloc.c @@ -292,7 +292,7 @@ static void ufs_change_blocknr(struct inode *inode, sector_t beg, if (!buffer_mapped(bh)) map_bh(bh, inode->i_sb, oldb + pos); if (!buffer_uptodate(bh)) { - ll_rw_block(READ, 1, &bh); + ll_rw_block(REQ_OP_READ, 0, 1, &bh); wait_on_buffer(bh); if (!buffer_uptodate(bh)) { ufs_error(inode->i_sb, __func__, diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c index 57dcceda17d6..fa3bda1a860f 100644 --- a/fs/ufs/dir.c +++ b/fs/ufs/dir.c @@ -279,12 +279,6 @@ struct ufs_dir_entry *ufs_find_entry(struct inode *dir, const struct qstr *qstr, de = (struct ufs_dir_entry *) kaddr; kaddr += ufs_last_byte(dir, n) - reclen; while ((char *) de <= kaddr) { - if (de->d_reclen == 0) { - 
ufs_error(dir->i_sb, __func__, - "zero-length directory entry"); - ufs_put_page(page); - goto out; - } if (ufs_match(sb, namelen, name, de)) goto found; de = ufs_next_entry(sb, de); @@ -414,11 +408,8 @@ ufs_validate_entry(struct super_block *sb, char *base, { struct ufs_dir_entry *de = (struct ufs_dir_entry*)(base + offset); struct ufs_dir_entry *p = (struct ufs_dir_entry*)(base + (offset&mask)); - while ((char*)p < (char*)de) { - if (p->d_reclen == 0) - break; + while ((char*)p < (char*)de) p = ufs_next_entry(sb, p); - } return (char *)p - base; } @@ -469,12 +460,6 @@ ufs_readdir(struct file *file, struct dir_context *ctx) de = (struct ufs_dir_entry *)(kaddr+offset); limit = kaddr + ufs_last_byte(inode, n) - UFS_DIR_REC_LEN(1); for ( ;(char*)de <= limit; de = ufs_next_entry(sb, de)) { - if (de->d_reclen == 0) { - ufs_error(sb, __func__, - "zero-length directory entry"); - ufs_put_page(page); - return -EIO; - } if (de->d_ino) { unsigned char d_type = DT_UNKNOWN; diff --git a/fs/ufs/util.c b/fs/ufs/util.c index a409e3e7827a..f41ad0a6106f 100644 --- a/fs/ufs/util.c +++ b/fs/ufs/util.c @@ -118,7 +118,7 @@ void ubh_sync_block(struct ufs_buffer_head *ubh) unsigned i; for (i = 0; i < ubh->count; i++) - write_dirty_buffer(ubh->bh[i], WRITE); + write_dirty_buffer(ubh->bh[i], 0); for (i = 0; i < ubh->count; i++) wait_on_buffer(ubh->bh[i]); diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 66cdb44616d5..85959d8324df 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -137,7 +137,7 @@ static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx) VM_BUG_ON(waitqueue_active(&ctx->fault_wqh)); VM_BUG_ON(spin_is_locked(&ctx->fd_wqh.lock)); VM_BUG_ON(waitqueue_active(&ctx->fd_wqh)); - mmput(ctx->mm); + mmdrop(ctx->mm); kmem_cache_free(userfaultfd_ctx_cachep, ctx); } } @@ -257,10 +257,9 @@ out: * fatal_signal_pending()s, and the mmap_sem must be released before * returning it. */ -int handle_userfault(struct vm_area_struct *vma, unsigned long address, - unsigned int flags, unsigned long reason) +int handle_userfault(struct fault_env *fe, unsigned long reason) { - struct mm_struct *mm = vma->vm_mm; + struct mm_struct *mm = fe->vma->vm_mm; struct userfaultfd_ctx *ctx; struct userfaultfd_wait_queue uwq; int ret; @@ -269,7 +268,7 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address, BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); ret = VM_FAULT_SIGBUS; - ctx = vma->vm_userfaultfd_ctx.ctx; + ctx = fe->vma->vm_userfaultfd_ctx.ctx; if (!ctx) goto out; @@ -302,17 +301,17 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address, * without first stopping userland access to the memory. For * VM_UFFD_MISSING userfaults this is enough for now. */ - if (unlikely(!(flags & FAULT_FLAG_ALLOW_RETRY))) { + if (unlikely(!(fe->flags & FAULT_FLAG_ALLOW_RETRY))) { /* * Validate the invariant that nowait must allow retry * to be sure not to return SIGBUS erroneously on * nowait invocations. */ - BUG_ON(flags & FAULT_FLAG_RETRY_NOWAIT); + BUG_ON(fe->flags & FAULT_FLAG_RETRY_NOWAIT); #ifdef CONFIG_DEBUG_VM if (printk_ratelimit()) { printk(KERN_WARNING - "FAULT_FLAG_ALLOW_RETRY missing %x\n", flags); + "FAULT_FLAG_ALLOW_RETRY missing %x\n", fe->flags); dump_stack(); } #endif @@ -324,7 +323,7 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address, * and wait. 
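The return_to_userland computation just below relies on the all-bits-set idiom: ANDing fe->flags with (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE) and comparing against that same mask is true only when both flags are present, not merely one of them. In isolation, with hypothetical flag values:

#include <stdbool.h>
#include <stdio.h>

#define FLAG_USER     0x01 /* hypothetical stand-ins for FAULT_FLAG_* */
#define FLAG_KILLABLE 0x02

/* True only if every bit in mask is set in flags. */
static bool all_set(unsigned int flags, unsigned int mask)
{
    return (flags & mask) == mask;
}

int main(void)
{
    unsigned int flags = FLAG_USER;

    printf("user only: %d\n", all_set(flags, FLAG_USER | FLAG_KILLABLE)); /* 0 */
    flags |= FLAG_KILLABLE;
    printf("both:      %d\n", all_set(flags, FLAG_USER | FLAG_KILLABLE)); /* 1 */
    return 0;
}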
*/ ret = VM_FAULT_RETRY; - if (flags & FAULT_FLAG_RETRY_NOWAIT) + if (fe->flags & FAULT_FLAG_RETRY_NOWAIT) goto out; /* take the reference before dropping the mmap_sem */ @@ -332,10 +331,11 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address, init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function); uwq.wq.private = current; - uwq.msg = userfault_msg(address, flags, reason); + uwq.msg = userfault_msg(fe->address, fe->flags, reason); uwq.ctx = ctx; - return_to_userland = (flags & (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE)) == + return_to_userland = + (fe->flags & (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE)) == (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE); spin_lock(&ctx->fault_pending_wqh.lock); @@ -353,7 +353,7 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address, TASK_KILLABLE); spin_unlock(&ctx->fault_pending_wqh.lock); - must_wait = userfaultfd_must_wait(ctx, address, flags, reason); + must_wait = userfaultfd_must_wait(ctx, fe->address, fe->flags, reason); up_read(&mm->mmap_sem); if (likely(must_wait && !ACCESS_ONCE(ctx->released) && @@ -434,6 +434,9 @@ static int userfaultfd_release(struct inode *inode, struct file *file) ACCESS_ONCE(ctx->released) = true; + if (!mmget_not_zero(mm)) + goto wakeup; + /* * Flush page faults out of all CPUs. NOTE: all page faults * must be retried without returning VM_FAULT_SIGBUS if @@ -466,7 +469,8 @@ static int userfaultfd_release(struct inode *inode, struct file *file) vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; } up_write(&mm->mmap_sem); - + mmput(mm); +wakeup: /* * After no new page faults can wait on this fault_*wqh, flush * the last page faults that may have been already waiting on @@ -760,10 +764,12 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, start = uffdio_register.range.start; end = start + uffdio_register.range.len; + ret = -ENOMEM; + if (!mmget_not_zero(mm)) + goto out; + down_write(&mm->mmap_sem); vma = find_vma_prev(mm, start, &prev); - - ret = -ENOMEM; if (!vma) goto out_unlock; @@ -864,6 +870,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, } while (vma && vma->vm_start < end); out_unlock: up_write(&mm->mmap_sem); + mmput(mm); if (!ret) { /* * Now that we scanned all vmas we can already tell @@ -902,10 +909,12 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, start = uffdio_unregister.start; end = start + uffdio_unregister.len; + ret = -ENOMEM; + if (!mmget_not_zero(mm)) + goto out; + down_write(&mm->mmap_sem); vma = find_vma_prev(mm, start, &prev); - - ret = -ENOMEM; if (!vma) goto out_unlock; @@ -998,6 +1007,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, } while (vma && vma->vm_start < end); out_unlock: up_write(&mm->mmap_sem); + mmput(mm); out: return ret; } @@ -1067,9 +1077,11 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx, goto out; if (uffdio_copy.mode & ~UFFDIO_COPY_MODE_DONTWAKE) goto out; - - ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src, - uffdio_copy.len); + if (mmget_not_zero(ctx->mm)) { + ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src, + uffdio_copy.len); + mmput(ctx->mm); + } if (unlikely(put_user(ret, &user_uffdio_copy->copy))) return -EFAULT; if (ret < 0) @@ -1110,8 +1122,11 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx, if (uffdio_zeropage.mode & ~UFFDIO_ZEROPAGE_MODE_DONTWAKE) goto out; - ret = mfill_zeropage(ctx->mm, uffdio_zeropage.range.start, - uffdio_zeropage.range.len); + if (mmget_not_zero(ctx->mm)) { + ret = mfill_zeropage(ctx->mm, 
uffdio_zeropage.range.start, + uffdio_zeropage.range.len); + mmput(ctx->mm); + } if (unlikely(put_user(ret, &user_uffdio_zeropage->zeropage))) return -EFAULT; if (ret < 0) @@ -1289,12 +1304,12 @@ static struct file *userfaultfd_file_create(int flags) ctx->released = false; ctx->mm = current->mm; /* prevent the mm struct to be freed */ - atomic_inc(&ctx->mm->mm_users); + atomic_inc(&ctx->mm->mm_count); file = anon_inode_getfile("[userfaultfd]", &userfaultfd_fops, ctx, O_RDWR | (flags & UFFD_SHARED_FCNTL_FLAGS)); if (IS_ERR(file)) { - mmput(ctx->mm); + mmdrop(ctx->mm); kmem_cache_free(userfaultfd_ctx_cachep, ctx); } out: diff --git a/fs/xattr.c b/fs/xattr.c index b11945e15fde..4beafc43daa5 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -100,7 +100,7 @@ int __vfs_setxattr_noperm(struct dentry *dentry, const char *name, if (issec) inode->i_flags &= ~S_NOSEC; if (inode->i_op->setxattr) { - error = inode->i_op->setxattr(dentry, name, value, size, flags); + error = inode->i_op->setxattr(dentry, inode, name, value, size, flags); if (!error) { fsnotify_xattr(dentry); security_inode_post_setxattr(dentry, name, value, @@ -655,6 +655,7 @@ strcmp_prefix(const char *a, const char *a_prefix) * operations to the correct xattr_handler. */ #define for_each_xattr_handler(handlers, handler) \ + if (handlers) \ for ((handler) = *(handlers)++; \ (handler) != NULL; \ (handler) = *(handlers)++) @@ -668,7 +669,7 @@ xattr_resolve_name(const struct xattr_handler **handlers, const char **name) const struct xattr_handler *handler; if (!*name) - return NULL; + return ERR_PTR(-EINVAL); for_each_xattr_handler(handlers, handler) { const char *n; @@ -744,7 +745,8 @@ generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) * Find the handler for the prefix and dispatch its set() operation. */ int -generic_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) +generic_setxattr(struct dentry *dentry, struct inode *inode, const char *name, + const void *value, size_t size, int flags) { const struct xattr_handler *handler; @@ -753,7 +755,7 @@ generic_setxattr(struct dentry *dentry, const char *name, const void *value, siz handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name); if (IS_ERR(handler)) return PTR_ERR(handler); - return handler->set(handler, dentry, name, value, size, flags); + return handler->set(handler, dentry, inode, name, value, size, flags); } /* @@ -768,7 +770,8 @@ generic_removexattr(struct dentry *dentry, const char *name) handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name); if (IS_ERR(handler)) return PTR_ERR(handler); - return handler->set(handler, dentry, name, NULL, 0, XATTR_REPLACE); + return handler->set(handler, dentry, d_inode(dentry), name, NULL, + 0, XATTR_REPLACE); } EXPORT_SYMBOL(generic_getxattr); diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig index 5d47b4df61ea..35faf128f36d 100644 --- a/fs/xfs/Kconfig +++ b/fs/xfs/Kconfig @@ -4,6 +4,7 @@ config XFS_FS depends on (64BIT || LBDAF) select EXPORTFS select LIBCRC32C + select FS_IOMAP help XFS is a high performance journaling filesystem which originated on the SGI IRIX platform. 
It is completely multi-threaded, can diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c index 686ba6fb20dd..339c696bbc01 100644 --- a/fs/xfs/kmem.c +++ b/fs/xfs/kmem.c @@ -93,19 +93,23 @@ kmem_zalloc_large(size_t size, xfs_km_flags_t flags) } void * -kmem_realloc(const void *ptr, size_t newsize, size_t oldsize, - xfs_km_flags_t flags) +kmem_realloc(const void *old, size_t newsize, xfs_km_flags_t flags) { - void *new; + int retries = 0; + gfp_t lflags = kmem_flags_convert(flags); + void *ptr; - new = kmem_alloc(newsize, flags); - if (ptr) { - if (new) - memcpy(new, ptr, - ((oldsize < newsize) ? oldsize : newsize)); - kmem_free(ptr); - } - return new; + do { + ptr = krealloc(old, newsize, lflags); + if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP))) + return ptr; + if (!(++retries % 100)) + xfs_err(NULL, + "%s(%u) possible memory allocation deadlock size %zu in %s (mode:0x%x)", + current->comm, current->pid, + newsize, __func__, lflags); + congestion_wait(BLK_RW_ASYNC, HZ/50); + } while (1); } void * diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h index d1c66e465ca5..689f746224e7 100644 --- a/fs/xfs/kmem.h +++ b/fs/xfs/kmem.h @@ -62,7 +62,7 @@ kmem_flags_convert(xfs_km_flags_t flags) extern void *kmem_alloc(size_t, xfs_km_flags_t); extern void *kmem_zalloc_large(size_t size, xfs_km_flags_t); -extern void *kmem_realloc(const void *, size_t, size_t, xfs_km_flags_t); +extern void *kmem_realloc(const void *, size_t, xfs_km_flags_t); static inline void kmem_free(const void *ptr) { kvfree(ptr); diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index a708e38b494c..88c26b827a2d 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -84,7 +84,7 @@ xfs_alloc_lookup_ge( * Lookup the first record less than or equal to [bno, len] * in the btree given by cur. */ -int /* error */ +static int /* error */ xfs_alloc_lookup_le( struct xfs_btree_cur *cur, /* btree cursor */ xfs_agblock_t bno, /* starting block of extent */ @@ -1839,19 +1839,8 @@ void xfs_alloc_compute_maxlevels( xfs_mount_t *mp) /* file system mount structure */ { - int level; - uint maxblocks; - uint maxleafents; - int minleafrecs; - int minnoderecs; - - maxleafents = (mp->m_sb.sb_agblocks + 1) / 2; - minleafrecs = mp->m_alloc_mnr[0]; - minnoderecs = mp->m_alloc_mnr[1]; - maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs; - for (level = 1; maxblocks > 1; level++) - maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs; - mp->m_ag_maxlevels = level; + mp->m_ag_maxlevels = xfs_btree_compute_maxlevels(mp, mp->m_alloc_mnr, + (mp->m_sb.sb_agblocks + 1) / 2); } /* @@ -2658,55 +2647,79 @@ error0: return error; } -/* - * Free an extent. - * Just break up the extent address and hand off to xfs_free_ag_extent - * after fixing up the freelist. - */ -int /* error */ -xfs_free_extent( - xfs_trans_t *tp, /* transaction pointer */ - xfs_fsblock_t bno, /* starting block number of extent */ - xfs_extlen_t len) /* length of extent */ +/* Ensure that the freelist is at full capacity. */ +int +xfs_free_extent_fix_freelist( + struct xfs_trans *tp, + xfs_agnumber_t agno, + struct xfs_buf **agbp) { - xfs_alloc_arg_t args; - int error; + struct xfs_alloc_arg args; + int error; - ASSERT(len != 0); - memset(&args, 0, sizeof(xfs_alloc_arg_t)); + memset(&args, 0, sizeof(struct xfs_alloc_arg)); args.tp = tp; args.mp = tp->t_mountp; + args.agno = agno; /* * validate that the block number is legal - the enables us to detect * and handle a silent filesystem corruption rather than crashing. 
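The rewritten kmem_realloc() above is a classic forgiving-allocator loop: return at once on success or when the caller opted into failure (KM_MAYFAIL|KM_NOSLEEP), warn on every hundredth attempt, and back off briefly between tries (congestion_wait(BLK_RW_ASYNC, HZ/50) is roughly a 20 ms sleep). A userspace analogue of that loop:

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/* Retry realloc() until it succeeds, unless the caller can tolerate
 * failure; log every 100th failure and sleep ~20 ms between tries. */
static void *realloc_retry(void *old, size_t newsize, int mayfail)
{
    const struct timespec backoff = { 0, 20 * 1000 * 1000 };
    int retries = 0;
    void *p;

    do {
        p = realloc(old, newsize); /* failure leaves old intact */
        if (p || mayfail)
            return p;
        if (!(++retries % 100))
            fprintf(stderr, "possible allocation deadlock, size %zu\n",
                    newsize);
        nanosleep(&backoff, NULL);
    } while (1);
}

int main(void)
{
    char *p = realloc_retry(NULL, 64, 0);

    free(p);
    return 0;
}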
*/ - args.agno = XFS_FSB_TO_AGNO(args.mp, bno); if (args.agno >= args.mp->m_sb.sb_agcount) return -EFSCORRUPTED; - args.agbno = XFS_FSB_TO_AGBNO(args.mp, bno); - if (args.agbno >= args.mp->m_sb.sb_agblocks) - return -EFSCORRUPTED; - args.pag = xfs_perag_get(args.mp, args.agno); ASSERT(args.pag); error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING); if (error) - goto error0; + goto out; + + *agbp = args.agbp; +out: + xfs_perag_put(args.pag); + return error; +} + +/* + * Free an extent. + * Just break up the extent address and hand off to xfs_free_ag_extent + * after fixing up the freelist. + */ +int /* error */ +xfs_free_extent( + struct xfs_trans *tp, /* transaction pointer */ + xfs_fsblock_t bno, /* starting block number of extent */ + xfs_extlen_t len) /* length of extent */ +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_buf *agbp; + xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, bno); + xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp, bno); + int error; + + ASSERT(len != 0); + + error = xfs_free_extent_fix_freelist(tp, agno, &agbp); + if (error) + return error; + + XFS_WANT_CORRUPTED_GOTO(mp, agbno < mp->m_sb.sb_agblocks, err); /* validate the extent size is legal now we have the agf locked */ - if (args.agbno + len > - be32_to_cpu(XFS_BUF_TO_AGF(args.agbp)->agf_length)) { - error = -EFSCORRUPTED; - goto error0; - } + XFS_WANT_CORRUPTED_GOTO(mp, + agbno + len <= be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_length), + err); - error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0); - if (!error) - xfs_extent_busy_insert(tp, args.agno, args.agbno, len, 0); -error0: - xfs_perag_put(args.pag); + error = xfs_free_ag_extent(tp, agbp, agno, agbno, len, 0); + if (error) + goto err; + + xfs_extent_busy_insert(tp, agno, agbno, len, 0); + return 0; + +err: + xfs_trans_brelse(tp, agbp); return error; } diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index 135eb3d24db7..cf268b2d0b6c 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -212,13 +212,6 @@ xfs_free_extent( xfs_fsblock_t bno, /* starting block number of extent */ xfs_extlen_t len); /* length of extent */ -int /* error */ -xfs_alloc_lookup_le( - struct xfs_btree_cur *cur, /* btree cursor */ - xfs_agblock_t bno, /* starting block of extent */ - xfs_extlen_t len, /* length of extent */ - int *stat); /* success/failure */ - int /* error */ xfs_alloc_lookup_ge( struct xfs_btree_cur *cur, /* btree cursor */ @@ -236,5 +229,7 @@ xfs_alloc_get_rec( int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, int flags, struct xfs_buf **bpp); int xfs_alloc_fix_freelist(struct xfs_alloc_arg *args, int flags); +int xfs_free_extent_fix_freelist(struct xfs_trans *tp, xfs_agnumber_t agno, + struct xfs_buf **agbp); #endif /* __XFS_ALLOC_H__ */ diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index fa3b948ef9c2..4e126f41a0aa 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -242,37 +242,21 @@ xfs_attr_set( return error; } - /* - * Start our first transaction of the day. - * - * All future transactions during this code must be "chained" off - * this one via the trans_dup() call. All transactions will contain - * the inode, and the inode will always be marked with trans_ihold(). - * Since the inode will be locked in all transactions, we must log - * the inode in every transaction to let it float upward through - * the log. 
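The xfs_trans_alloc() conversions in these attr hunks collapse the old allocate, reserve, cancel-on-error sequence into one call that cleans up after itself, so callers no longer need an error path that remembers to cancel. The shape of that API, sketched with hypothetical types:

#include <stdlib.h>

struct txn {
    int reserved;
};

/* Pretend reservation that can fail, like xfs_trans_reserve(). */
static int txn_reserve(struct txn *t, int blocks)
{
    if (blocks > 1024) /* pretend the log cannot cover this much */
        return -1;
    t->reserved = blocks;
    return 0;
}

/* Allocate a transaction and take its reservation in one step; on
 * failure the caller gets NULL back and has nothing to clean up. */
static struct txn *txn_alloc_reserved(int blocks)
{
    struct txn *t = calloc(1, sizeof(*t));

    if (!t)
        return NULL;
    if (txn_reserve(t, blocks) != 0) {
        free(t); /* internal cleanup, the caller never sees t */
        return NULL;
    }
    return t;
}

int main(void)
{
    struct txn *t = txn_alloc_reserved(128);

    free(t);
    return 0;
}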
- */ - args.trans = xfs_trans_alloc(mp, XFS_TRANS_ATTR_SET); + tres.tr_logres = M_RES(mp)->tr_attrsetm.tr_logres + + M_RES(mp)->tr_attrsetrt.tr_logres * args.total; + tres.tr_logcount = XFS_ATTRSET_LOG_COUNT; + tres.tr_logflags = XFS_TRANS_PERM_LOG_RES; /* * Root fork attributes can use reserved data blocks for this * operation if necessary */ - - if (rsvd) - args.trans->t_flags |= XFS_TRANS_RESERVE; - - tres.tr_logres = M_RES(mp)->tr_attrsetm.tr_logres + - M_RES(mp)->tr_attrsetrt.tr_logres * args.total; - tres.tr_logcount = XFS_ATTRSET_LOG_COUNT; - tres.tr_logflags = XFS_TRANS_PERM_LOG_RES; - error = xfs_trans_reserve(args.trans, &tres, args.total, 0); - if (error) { - xfs_trans_cancel(args.trans); + error = xfs_trans_alloc(mp, &tres, args.total, 0, + rsvd ? XFS_TRANS_RESERVE : 0, &args.trans); + if (error) return error; - } - xfs_ilock(dp, XFS_ILOCK_EXCL); + xfs_ilock(dp, XFS_ILOCK_EXCL); error = xfs_trans_reserve_quota_nblks(args.trans, dp, args.total, 0, rsvd ? XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES : XFS_QMOPT_RES_REGBLKS); @@ -429,31 +413,15 @@ xfs_attr_remove( return error; /* - * Start our first transaction of the day. - * - * All future transactions during this code must be "chained" off - * this one via the trans_dup() call. All transactions will contain - * the inode, and the inode will always be marked with trans_ihold(). - * Since the inode will be locked in all transactions, we must log - * the inode in every transaction to let it float upward through - * the log. - */ - args.trans = xfs_trans_alloc(mp, XFS_TRANS_ATTR_RM); - - /* * Root fork attributes can use reserved data blocks for this * operation if necessary */ - - if (flags & ATTR_ROOT) - args.trans->t_flags |= XFS_TRANS_RESERVE; - - error = xfs_trans_reserve(args.trans, &M_RES(mp)->tr_attrrm, - XFS_ATTRRM_SPACE_RES(mp), 0); - if (error) { - xfs_trans_cancel(args.trans); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_attrrm, + XFS_ATTRRM_SPACE_RES(mp), 0, + (flags & ATTR_ROOT) ? XFS_TRANS_RESERVE : 0, + &args.trans); + if (error) return error; - } xfs_ilock(dp, XFS_ILOCK_EXCL); /* diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h index 882c8d338891..4f2aed04f827 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.h +++ b/fs/xfs/libxfs/xfs_attr_leaf.h @@ -50,7 +50,6 @@ int xfs_attr_shortform_lookup(struct xfs_da_args *args); int xfs_attr_shortform_getvalue(struct xfs_da_args *args); int xfs_attr_shortform_to_leaf(struct xfs_da_args *args); int xfs_attr_shortform_remove(struct xfs_da_args *args); -int xfs_attr_shortform_list(struct xfs_attr_list_context *context); int xfs_attr_shortform_allfit(struct xfs_buf *bp, struct xfs_inode *dp); int xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes); void xfs_attr_fork_remove(struct xfs_inode *ip, struct xfs_trans *tp); @@ -88,8 +87,6 @@ int xfs_attr3_leaf_toosmall(struct xfs_da_state *state, int *retval); void xfs_attr3_leaf_unbalance(struct xfs_da_state *state, struct xfs_da_state_blk *drop_blk, struct xfs_da_state_blk *save_blk); -int xfs_attr3_root_inactive(struct xfs_trans **trans, struct xfs_inode *dp); - /* * Utility routines. 
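The xfs_bmap.c hunks just below replace the hand-rolled xbfi_next pointer chain with the kernel's intrusive list_head, where the link lives inside the payload struct and insertion and removal are O(1) with no separate node allocation. A self-contained miniature of that pattern, including the container_of-style recovery of the payload (list_node here is a stand-in, not the kernel type):

#include <stddef.h>
#include <stdio.h>

/* Minimal list_head-style intrusive doubly linked list. */
struct list_node {
    struct list_node *next, *prev;
};

static void list_init(struct list_node *h) { h->next = h->prev = h; }

static void list_add(struct list_node *n, struct list_node *h)
{
    n->next = h->next;
    n->prev = h;
    h->next->prev = n;
    h->next = n;
}

static void list_del(struct list_node *n)
{
    n->prev->next = n->next;
    n->next->prev = n->prev;
}

struct free_item { /* cf. xfs_bmap_free_item */
    unsigned long startblock;
    struct list_node list; /* cf. xbfi_list */
};

int main(void)
{
    struct list_node flist;
    struct free_item a = { 10, { NULL, NULL } };
    struct free_item b = { 20, { NULL, NULL } };
    struct free_item *first;

    list_init(&flist);
    list_add(&a.list, &flist);
    list_add(&b.list, &flist);
    list_del(&a.list);
    first = (struct free_item *)((char *)flist.next -
                                 offsetof(struct free_item, list));
    printf("%lu\n", first->startblock); /* 20 */
    return 0;
}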
*/ diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index ce41d7fe753c..2f2c85cc8117 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -570,14 +570,12 @@ xfs_bmap_validate_ret( */ void xfs_bmap_add_free( + struct xfs_mount *mp, /* mount point structure */ + struct xfs_bmap_free *flist, /* list of extents */ xfs_fsblock_t bno, /* fs block number of extent */ - xfs_filblks_t len, /* length of extent */ - xfs_bmap_free_t *flist, /* list of extents */ - xfs_mount_t *mp) /* mount point structure */ + xfs_filblks_t len) /* length of extent */ { - xfs_bmap_free_item_t *cur; /* current (next) element */ - xfs_bmap_free_item_t *new; /* new element */ - xfs_bmap_free_item_t *prev; /* previous element */ + struct xfs_bmap_free_item *new; /* new element */ #ifdef DEBUG xfs_agnumber_t agno; xfs_agblock_t agbno; @@ -597,17 +595,7 @@ xfs_bmap_add_free( new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP); new->xbfi_startblock = bno; new->xbfi_blockcount = (xfs_extlen_t)len; - for (prev = NULL, cur = flist->xbf_first; - cur != NULL; - prev = cur, cur = cur->xbfi_next) { - if (cur->xbfi_startblock >= bno) - break; - } - if (prev) - prev->xbfi_next = new; - else - flist->xbf_first = new; - new->xbfi_next = cur; + list_add(&new->xbfi_list, &flist->xbf_flist); flist->xbf_count++; } @@ -617,14 +605,10 @@ xfs_bmap_add_free( */ void xfs_bmap_del_free( - xfs_bmap_free_t *flist, /* free item list header */ - xfs_bmap_free_item_t *prev, /* previous item on list, if any */ - xfs_bmap_free_item_t *free) /* list item to be freed */ + struct xfs_bmap_free *flist, /* free item list header */ + struct xfs_bmap_free_item *free) /* list item to be freed */ { - if (prev) - prev->xbfi_next = free->xbfi_next; - else - flist->xbf_first = free->xbfi_next; + list_del(&free->xbfi_list); flist->xbf_count--; kmem_zone_free(xfs_bmap_free_item_zone, free); } @@ -634,17 +618,16 @@ xfs_bmap_del_free( */ void xfs_bmap_cancel( - xfs_bmap_free_t *flist) /* list of bmap_free_items */ + struct xfs_bmap_free *flist) /* list of bmap_free_items */ { - xfs_bmap_free_item_t *free; /* free list item */ - xfs_bmap_free_item_t *next; + struct xfs_bmap_free_item *free; /* free list item */ if (flist->xbf_count == 0) return; - ASSERT(flist->xbf_first != NULL); - for (free = flist->xbf_first; free; free = next) { - next = free->xbfi_next; - xfs_bmap_del_free(flist, NULL, free); + while (!list_empty(&flist->xbf_flist)) { + free = list_first_entry(&flist->xbf_flist, + struct xfs_bmap_free_item, xbfi_list); + xfs_bmap_del_free(flist, free); } ASSERT(flist->xbf_count == 0); } @@ -699,7 +682,7 @@ xfs_bmap_btree_to_extents( cblock = XFS_BUF_TO_BLOCK(cbp); if ((error = xfs_btree_check_block(cur, cblock, 0, cbp))) return error; - xfs_bmap_add_free(cbno, 1, cur->bc_private.b.flist, mp); + xfs_bmap_add_free(mp, cur->bc_private.b.flist, cbno, 1); ip->i_d.di_nblocks--; xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L); xfs_trans_binval(tp, cbp); @@ -1121,15 +1104,14 @@ xfs_bmap_add_attrfork( mp = ip->i_mount; ASSERT(!XFS_NOT_DQATTACHED(mp, ip)); - tp = xfs_trans_alloc(mp, XFS_TRANS_ADDAFORK); + blks = XFS_ADDAFORK_SPACE_RES(mp); - if (rsvd) - tp->t_flags |= XFS_TRANS_RESERVE; - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_addafork, blks, 0); - if (error) { - xfs_trans_cancel(tp); + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_addafork, blks, 0, + rsvd ? XFS_TRANS_RESERVE : 0, &tp); + if (error) return error; - } + xfs_ilock(ip, XFS_ILOCK_EXCL); error = xfs_trans_reserve_quota_nblks(tp, ip, blks, 0, rsvd ? 
XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES : @@ -5074,8 +5056,8 @@ xfs_bmap_del_extent( * If we need to, add to list of extents to delete. */ if (do_fx) - xfs_bmap_add_free(del->br_startblock, del->br_blockcount, flist, - mp); + xfs_bmap_add_free(mp, flist, del->br_startblock, + del->br_blockcount); /* * Adjust inode # blocks in the file. */ @@ -6026,13 +6008,10 @@ xfs_bmap_split_extent( xfs_fsblock_t firstfsb; int error; - tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, - XFS_DIOSTRAT_SPACE_RES(mp, 0), 0); - if (error) { - xfs_trans_cancel(tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, + XFS_DIOSTRAT_SPACE_RES(mp, 0), 0, 0, &tp); + if (error) return error; - } xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index 423a34e832bd..f1f3ae6c0a3f 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -62,12 +62,12 @@ struct xfs_bmalloca { * List of extents to be free "later". * The list is kept sorted on xbf_startblock. */ -typedef struct xfs_bmap_free_item +struct xfs_bmap_free_item { xfs_fsblock_t xbfi_startblock;/* starting fs block number */ xfs_extlen_t xbfi_blockcount;/* number of blocks in extent */ - struct xfs_bmap_free_item *xbfi_next; /* link to next entry */ -} xfs_bmap_free_item_t; + struct list_head xbfi_list; +}; /* * Header for free extent list. @@ -85,7 +85,7 @@ typedef struct xfs_bmap_free_item */ typedef struct xfs_bmap_free { - xfs_bmap_free_item_t *xbf_first; /* list of to-be-free extents */ + struct list_head xbf_flist; /* list of to-be-free extents */ int xbf_count; /* count of items on list */ int xbf_low; /* alloc in low mode */ } xfs_bmap_free_t; @@ -141,8 +141,10 @@ static inline int xfs_bmapi_aflag(int w) static inline void xfs_bmap_init(xfs_bmap_free_t *flp, xfs_fsblock_t *fbp) { - ((flp)->xbf_first = NULL, (flp)->xbf_count = 0, \ - (flp)->xbf_low = 0, *(fbp) = NULLFSBLOCK); + INIT_LIST_HEAD(&flp->xbf_flist); + flp->xbf_count = 0; + flp->xbf_low = 0; + *fbp = NULLFSBLOCK; } /* @@ -191,8 +193,8 @@ void xfs_bmap_trace_exlist(struct xfs_inode *ip, xfs_extnum_t cnt, int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd); void xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork); -void xfs_bmap_add_free(xfs_fsblock_t bno, xfs_filblks_t len, - struct xfs_bmap_free *flist, struct xfs_mount *mp); +void xfs_bmap_add_free(struct xfs_mount *mp, struct xfs_bmap_free *flist, + xfs_fsblock_t bno, xfs_filblks_t len); void xfs_bmap_cancel(struct xfs_bmap_free *flist); int xfs_bmap_finish(struct xfs_trans **tp, struct xfs_bmap_free *flist, struct xfs_inode *ip); diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index 6282f6e708af..db0c71e470c9 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -526,7 +526,7 @@ xfs_bmbt_free_block( struct xfs_trans *tp = cur->bc_tp; xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(bp)); - xfs_bmap_add_free(fsbno, 1, cur->bc_private.b.flist, mp); + xfs_bmap_add_free(mp, cur->bc_private.b.flist, fsbno, 1); ip->i_d.di_nblocks--; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 1f88e1ce770f..07eeb0b4ca74 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -543,12 +543,12 @@ xfs_btree_ptr_addr( */ STATIC struct xfs_btree_block * xfs_btree_get_iroot( - struct xfs_btree_cur *cur) + struct xfs_btree_cur *cur) { 
- struct xfs_ifork *ifp; + struct xfs_ifork *ifp; - ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, cur->bc_private.b.whichfork); - return (struct xfs_btree_block *)ifp->if_broot; + ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, cur->bc_private.b.whichfork); + return (struct xfs_btree_block *)ifp->if_broot; } /* @@ -4152,3 +4152,22 @@ xfs_btree_sblock_verify( return true; } + +/* + * Calculate the number of btree levels needed to store a given number of + * records in a short-format btree. + */ +uint +xfs_btree_compute_maxlevels( + struct xfs_mount *mp, + uint *limits, + unsigned long len) +{ + uint level; + unsigned long maxblocks; + + maxblocks = (len + limits[0] - 1) / limits[0]; + for (level = 1; maxblocks > 1; level++) + maxblocks = (maxblocks + limits[1] - 1) / limits[1]; + return level; +} diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 2e874be70209..785a99682159 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -474,5 +474,7 @@ static inline int xfs_btree_get_level(struct xfs_btree_block *block) bool xfs_btree_sblock_v5hdr_verify(struct xfs_buf *bp); bool xfs_btree_sblock_verify(struct xfs_buf *bp, unsigned int max_recs); +uint xfs_btree_compute_maxlevels(struct xfs_mount *mp, uint *limits, + unsigned long len); #endif /* __XFS_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index 097bf7717d80..0f1f165f4048 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -356,7 +356,6 @@ xfs_da3_split( struct xfs_da_state_blk *newblk; struct xfs_da_state_blk *addblk; struct xfs_da_intnode *node; - struct xfs_buf *bp; int max; int action = 0; int error; @@ -397,7 +396,9 @@ xfs_da3_split( break; } /* - * Entry wouldn't fit, split the leaf again. + * Entry wouldn't fit, split the leaf again. The new + * extrablk will be consumed by xfs_da3_node_split if + * the node is split. */ state->extravalid = 1; if (state->inleaf) { @@ -446,6 +447,14 @@ xfs_da3_split( return 0; /* + * xfs_da3_node_split() should have consumed any extra blocks we added + * during a double leaf split in the attr fork. This is guaranteed as + * we can't be here if the attr fork only has a single leaf block. + */ + ASSERT(state->extravalid == 0 || + state->path.blk[max].magic == XFS_DIR2_LEAFN_MAGIC); + + /* * Split the root node. */ ASSERT(state->path.active == 0); @@ -457,43 +466,33 @@ xfs_da3_split( } /* - * Update pointers to the node which used to be block 0 and - * just got bumped because of the addition of a new root node. - * There might be three blocks involved if a double split occurred, - * and the original block 0 could be at any position in the list. + * Update pointers to the node which used to be block 0 and just got + * bumped because of the addition of a new root node. Note that the + * original block 0 could be at any position in the list of blocks in + * the tree. * - * Note: the magic numbers and sibling pointers are in the same - * physical place for both v2 and v3 headers (by design). Hence it - * doesn't matter which version of the xfs_da_intnode structure we use - * here as the result will be the same using either structure. + * Note: the magic numbers and sibling pointers are in the same physical + * place for both v2 and v3 headers (by design). Hence it doesn't matter + * which version of the xfs_da_intnode structure we use here as the + * result will be the same using either structure. 
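xfs_btree_compute_maxlevels() above is pure ceiling-division arithmetic: how many blocks are needed if a leaf holds at least limits[0] records and an interior node at least limits[1] children, iterated until a single root remains. Lifted out and exercised standalone:

#include <stdio.h>

/* Levels needed for a btree with at least limits[0] records per leaf
 * and limits[1] keys per interior node; mirrors the kernel helper. */
static unsigned int compute_maxlevels(const unsigned int limits[2],
                                      unsigned long len)
{
    unsigned int level;
    unsigned long maxblocks = (len + limits[0] - 1) / limits[0];

    for (level = 1; maxblocks > 1; level++)
        maxblocks = (maxblocks + limits[1] - 1) / limits[1];
    return level;
}

int main(void)
{
    unsigned int limits[2] = { 16, 8 }; /* hypothetical minimum fanouts */

    /* 100000 records -> 6250 leaves -> 782 -> 98 -> 13 -> 2 -> 1 */
    printf("%u levels\n", compute_maxlevels(limits, 100000)); /* 6 */
    return 0;
}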
*/ node = oldblk->bp->b_addr; if (node->hdr.info.forw) { - if (be32_to_cpu(node->hdr.info.forw) == addblk->blkno) { - bp = addblk->bp; - } else { - ASSERT(state->extravalid); - bp = state->extrablk.bp; - } - node = bp->b_addr; + ASSERT(be32_to_cpu(node->hdr.info.forw) == addblk->blkno); + node = addblk->bp->b_addr; node->hdr.info.back = cpu_to_be32(oldblk->blkno); - xfs_trans_log_buf(state->args->trans, bp, - XFS_DA_LOGRANGE(node, &node->hdr.info, - sizeof(node->hdr.info))); + xfs_trans_log_buf(state->args->trans, addblk->bp, + XFS_DA_LOGRANGE(node, &node->hdr.info, + sizeof(node->hdr.info))); } node = oldblk->bp->b_addr; if (node->hdr.info.back) { - if (be32_to_cpu(node->hdr.info.back) == addblk->blkno) { - bp = addblk->bp; - } else { - ASSERT(state->extravalid); - bp = state->extrablk.bp; - } - node = bp->b_addr; + ASSERT(be32_to_cpu(node->hdr.info.back) == addblk->blkno); + node = addblk->bp->b_addr; node->hdr.info.forw = cpu_to_be32(oldblk->blkno); - xfs_trans_log_buf(state->args->trans, bp, - XFS_DA_LOGRANGE(node, &node->hdr.info, - sizeof(node->hdr.info))); + xfs_trans_log_buf(state->args->trans, addblk->bp, + XFS_DA_LOGRANGE(node, &node->hdr.info, + sizeof(node->hdr.info))); } addblk->bp = NULL; return 0; diff --git a/fs/xfs/libxfs/xfs_da_format.c b/fs/xfs/libxfs/xfs_da_format.c index 9d624a622946..f1e8d4dbb600 100644 --- a/fs/xfs/libxfs/xfs_da_format.c +++ b/fs/xfs/libxfs/xfs_da_format.c @@ -40,8 +40,7 @@ xfs_dir2_sf_entsize( int count = sizeof(struct xfs_dir2_sf_entry); /* namelen + offset */ count += len; /* name */ - count += hdr->i8count ? sizeof(xfs_dir2_ino8_t) : - sizeof(xfs_dir2_ino4_t); /* ino # */ + count += hdr->i8count ? XFS_INO64_SIZE : XFS_INO32_SIZE; /* ino # */ return count; } @@ -125,33 +124,33 @@ xfs_dir3_sfe_put_ftype( static xfs_ino_t xfs_dir2_sf_get_ino( struct xfs_dir2_sf_hdr *hdr, - xfs_dir2_inou_t *from) + __uint8_t *from) { if (hdr->i8count) - return get_unaligned_be64(&from->i8.i) & 0x00ffffffffffffffULL; + return get_unaligned_be64(from) & 0x00ffffffffffffffULL; else - return get_unaligned_be32(&from->i4.i); + return get_unaligned_be32(from); } static void xfs_dir2_sf_put_ino( struct xfs_dir2_sf_hdr *hdr, - xfs_dir2_inou_t *to, + __uint8_t *to, xfs_ino_t ino) { ASSERT((ino & 0xff00000000000000ULL) == 0); if (hdr->i8count) - put_unaligned_be64(ino, &to->i8.i); + put_unaligned_be64(ino, to); else - put_unaligned_be32(ino, &to->i4.i); + put_unaligned_be32(ino, to); } static xfs_ino_t xfs_dir2_sf_get_parent_ino( struct xfs_dir2_sf_hdr *hdr) { - return xfs_dir2_sf_get_ino(hdr, &hdr->parent); + return xfs_dir2_sf_get_ino(hdr, hdr->parent); } static void @@ -159,7 +158,7 @@ xfs_dir2_sf_put_parent_ino( struct xfs_dir2_sf_hdr *hdr, xfs_ino_t ino) { - xfs_dir2_sf_put_ino(hdr, &hdr->parent, ino); + xfs_dir2_sf_put_ino(hdr, hdr->parent, ino); } /* @@ -173,8 +172,7 @@ xfs_dir2_sfe_get_ino( struct xfs_dir2_sf_hdr *hdr, struct xfs_dir2_sf_entry *sfep) { - return xfs_dir2_sf_get_ino(hdr, - (xfs_dir2_inou_t *)&sfep->name[sfep->namelen]); + return xfs_dir2_sf_get_ino(hdr, &sfep->name[sfep->namelen]); } static void @@ -183,8 +181,7 @@ xfs_dir2_sfe_put_ino( struct xfs_dir2_sf_entry *sfep, xfs_ino_t ino) { - xfs_dir2_sf_put_ino(hdr, - (xfs_dir2_inou_t *)&sfep->name[sfep->namelen], ino); + xfs_dir2_sf_put_ino(hdr, &sfep->name[sfep->namelen], ino); } static xfs_ino_t @@ -192,8 +189,7 @@ xfs_dir3_sfe_get_ino( struct xfs_dir2_sf_hdr *hdr, struct xfs_dir2_sf_entry *sfep) { - return xfs_dir2_sf_get_ino(hdr, - (xfs_dir2_inou_t *)&sfep->name[sfep->namelen + 1]); + return 
xfs_dir2_sf_get_ino(hdr, &sfep->name[sfep->namelen + 1]); } static void @@ -202,8 +198,7 @@ xfs_dir3_sfe_put_ino( struct xfs_dir2_sf_entry *sfep, xfs_ino_t ino) { - xfs_dir2_sf_put_ino(hdr, - (xfs_dir2_inou_t *)&sfep->name[sfep->namelen + 1], ino); + xfs_dir2_sf_put_ino(hdr, &sfep->name[sfep->namelen + 1], ino); } diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h index 8d4d8bce41bf..685f23b67056 100644 --- a/fs/xfs/libxfs/xfs_da_format.h +++ b/fs/xfs/libxfs/xfs_da_format.h @@ -192,12 +192,6 @@ typedef __uint16_t xfs_dir2_data_off_t; typedef uint xfs_dir2_data_aoff_t; /* argument form */ /* - * Normalized offset (in a data block) of the entry, really xfs_dir2_data_off_t. - * Only need 16 bits, this is the byte offset into the single block form. - */ -typedef struct { __uint8_t i[2]; } __arch_pack xfs_dir2_sf_off_t; - -/* * Offset in data space of a data entry. */ typedef __uint32_t xfs_dir2_dataptr_t; @@ -214,22 +208,10 @@ typedef xfs_off_t xfs_dir2_off_t; */ typedef __uint32_t xfs_dir2_db_t; -/* - * Inode number stored as 8 8-bit values. - */ -typedef struct { __uint8_t i[8]; } xfs_dir2_ino8_t; - -/* - * Inode number stored as 4 8-bit values. - * Works a lot of the time, when all the inode numbers in a directory - * fit in 32 bits. - */ -typedef struct { __uint8_t i[4]; } xfs_dir2_ino4_t; +#define XFS_INO32_SIZE 4 +#define XFS_INO64_SIZE 8 +#define XFS_INO64_DIFF (XFS_INO64_SIZE - XFS_INO32_SIZE) -typedef union { - xfs_dir2_ino8_t i8; - xfs_dir2_ino4_t i4; -} xfs_dir2_inou_t; #define XFS_DIR2_MAX_SHORT_INUM ((xfs_ino_t)0xffffffffULL) /* @@ -246,39 +228,38 @@ typedef union { typedef struct xfs_dir2_sf_hdr { __uint8_t count; /* count of entries */ __uint8_t i8count; /* count of 8-byte inode #s */ - xfs_dir2_inou_t parent; /* parent dir inode number */ -} __arch_pack xfs_dir2_sf_hdr_t; + __uint8_t parent[8]; /* parent dir inode number */ +} __packed xfs_dir2_sf_hdr_t; typedef struct xfs_dir2_sf_entry { __u8 namelen; /* actual name length */ - xfs_dir2_sf_off_t offset; /* saved offset */ + __u8 offset[2]; /* saved offset */ __u8 name[]; /* name, variable size */ /* * A single byte containing the file type field follows the inode * number for version 3 directory entries. * - * A xfs_dir2_ino8_t or xfs_dir2_ino4_t follows here, at a - * variable offset after the name. + * A 64-bit or 32-bit inode number follows here, at a variable offset + * after the name. */ -} __arch_pack xfs_dir2_sf_entry_t; +} xfs_dir2_sf_entry_t; static inline int xfs_dir2_sf_hdr_size(int i8count) { return sizeof(struct xfs_dir2_sf_hdr) - - (i8count == 0) * - (sizeof(xfs_dir2_ino8_t) - sizeof(xfs_dir2_ino4_t)); + (i8count == 0) * XFS_INO64_DIFF; } static inline xfs_dir2_data_aoff_t xfs_dir2_sf_get_offset(xfs_dir2_sf_entry_t *sfep) { - return get_unaligned_be16(&sfep->offset.i); + return get_unaligned_be16(sfep->offset); } static inline void xfs_dir2_sf_put_offset(xfs_dir2_sf_entry_t *sfep, xfs_dir2_data_aoff_t off) { - put_unaligned_be16(off, &sfep->offset.i); + put_unaligned_be16(off, sfep->offset); } static inline struct xfs_dir2_sf_entry * diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c index 974d62e677f4..c6809ff41197 100644 --- a/fs/xfs/libxfs/xfs_dir2_sf.c +++ b/fs/xfs/libxfs/xfs_dir2_sf.c @@ -126,13 +126,12 @@ xfs_dir2_block_sfsize( /* * Calculate the new size, see if we should give up yet. */ - size = xfs_dir2_sf_hdr_size(i8count) + /* header */ - count + /* namelen */ - count * (uint)sizeof(xfs_dir2_sf_off_t) + /* offset */ - namelen + /* name */ - (i8count ? 
/* inumber */ - (uint)sizeof(xfs_dir2_ino8_t) * count : - (uint)sizeof(xfs_dir2_ino4_t) * count); + size = xfs_dir2_sf_hdr_size(i8count) + /* header */ + count * 3 * sizeof(u8) + /* namelen + offset */ + namelen + /* name */ + (i8count ? /* inumber */ + count * XFS_INO64_SIZE : + count * XFS_INO32_SIZE); if (size > XFS_IFORK_DSIZE(dp)) return size; /* size value is a failure */ } @@ -257,15 +256,12 @@ xfs_dir2_block_to_sf( * * Convert the inode to local format and copy the data in. */ - dp->i_df.if_flags &= ~XFS_IFEXTENTS; - dp->i_df.if_flags |= XFS_IFINLINE; - dp->i_d.di_format = XFS_DINODE_FMT_LOCAL; ASSERT(dp->i_df.if_bytes == 0); - xfs_idata_realloc(dp, size, XFS_DATA_FORK); + xfs_init_local_fork(dp, XFS_DATA_FORK, dst, size); + dp->i_d.di_format = XFS_DINODE_FMT_LOCAL; + dp->i_d.di_size = size; logflags |= XFS_ILOG_DDATA; - memcpy(dp->i_df.if_u1.if_data, dst, size); - dp->i_d.di_size = size; xfs_dir2_sf_check(args); out: xfs_trans_log_inode(args->trans, dp, logflags); @@ -322,10 +318,7 @@ xfs_dir2_sf_addname( /* * Yes, adjust the inode size. old count + (parent + new) */ - incr_isize += - (sfp->count + 2) * - ((uint)sizeof(xfs_dir2_ino8_t) - - (uint)sizeof(xfs_dir2_ino4_t)); + incr_isize += (sfp->count + 2) * XFS_INO64_DIFF; objchange = 1; } @@ -900,11 +893,7 @@ xfs_dir2_sf_replace( int error; /* error return value */ int newsize; /* new inode size */ - newsize = - dp->i_df.if_bytes + - (sfp->count + 1) * - ((uint)sizeof(xfs_dir2_ino8_t) - - (uint)sizeof(xfs_dir2_ino4_t)); + newsize = dp->i_df.if_bytes + (sfp->count + 1) * XFS_INO64_DIFF; /* * Won't fit as shortform, convert to block then do replace. */ @@ -1025,10 +1014,7 @@ xfs_dir2_sf_toino4( /* * Compute the new inode size. */ - newsize = - oldsize - - (oldsfp->count + 1) * - ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t)); + newsize = oldsize - (oldsfp->count + 1) * XFS_INO64_DIFF; xfs_idata_realloc(dp, -oldsize, XFS_DATA_FORK); xfs_idata_realloc(dp, newsize, XFS_DATA_FORK); /* @@ -1051,7 +1037,7 @@ xfs_dir2_sf_toino4( i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep), oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep)) { sfep->namelen = oldsfep->namelen; - sfep->offset = oldsfep->offset; + memcpy(sfep->offset, oldsfep->offset, sizeof(sfep->offset)); memcpy(sfep->name, oldsfep->name, sfep->namelen); dp->d_ops->sf_put_ino(sfp, sfep, dp->d_ops->sf_get_ino(oldsfp, oldsfep)); @@ -1101,10 +1087,7 @@ xfs_dir2_sf_toino8( /* * Compute the new inode size (nb: entry count + 1 for parent) */ - newsize = - oldsize + - (oldsfp->count + 1) * - ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t)); + newsize = oldsize + (oldsfp->count + 1) * XFS_INO64_DIFF; xfs_idata_realloc(dp, -oldsize, XFS_DATA_FORK); xfs_idata_realloc(dp, newsize, XFS_DATA_FORK); /* @@ -1127,7 +1110,7 @@ xfs_dir2_sf_toino8( i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep), oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep)) { sfep->namelen = oldsfep->namelen; - sfep->offset = oldsfep->offset; + memcpy(sfep->offset, oldsfep->offset, sizeof(sfep->offset)); memcpy(sfep->name, oldsfep->name, sfep->namelen); dp->d_ops->sf_put_ino(sfp, sfep, dp->d_ops->sf_get_ino(oldsfp, oldsfep)); diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index dc97eb21af07..adb204d40f22 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -1435,41 +1435,57 @@ typedef __be64 xfs_bmbt_ptr_t, xfs_bmdr_ptr_t; * with the crc feature bit, and all accesses to them must be conditional on * that flag. 
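*
* A minimal sketch of such a guarded access (assuming the standard
* feature-bit helper):
*
*	if (xfs_sb_version_hascrc(&mp->m_sb))
*		owner = be32_to_cpu(block->bb_u.s.bb_owner);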
*/ +/* short form block header */ +struct xfs_btree_block_shdr { + __be32 bb_leftsib; + __be32 bb_rightsib; + + __be64 bb_blkno; + __be64 bb_lsn; + uuid_t bb_uuid; + __be32 bb_owner; + __le32 bb_crc; +}; + +/* long form block header */ +struct xfs_btree_block_lhdr { + __be64 bb_leftsib; + __be64 bb_rightsib; + + __be64 bb_blkno; + __be64 bb_lsn; + uuid_t bb_uuid; + __be64 bb_owner; + __le32 bb_crc; + __be32 bb_pad; /* padding for alignment */ +}; + struct xfs_btree_block { __be32 bb_magic; /* magic number for block type */ __be16 bb_level; /* 0 is a leaf */ __be16 bb_numrecs; /* current # of data records */ union { - struct { - __be32 bb_leftsib; - __be32 bb_rightsib; - - __be64 bb_blkno; - __be64 bb_lsn; - uuid_t bb_uuid; - __be32 bb_owner; - __le32 bb_crc; - } s; /* short form pointers */ - struct { - __be64 bb_leftsib; - __be64 bb_rightsib; - - __be64 bb_blkno; - __be64 bb_lsn; - uuid_t bb_uuid; - __be64 bb_owner; - __le32 bb_crc; - __be32 bb_pad; /* padding for alignment */ - } l; /* long form pointers */ + struct xfs_btree_block_shdr s; + struct xfs_btree_block_lhdr l; } bb_u; /* rest */ }; -#define XFS_BTREE_SBLOCK_LEN 16 /* size of a short form block */ -#define XFS_BTREE_LBLOCK_LEN 24 /* size of a long form block */ +/* size of a short form block */ +#define XFS_BTREE_SBLOCK_LEN \ + (offsetof(struct xfs_btree_block, bb_u) + \ + offsetof(struct xfs_btree_block_shdr, bb_blkno)) +/* size of a long form block */ +#define XFS_BTREE_LBLOCK_LEN \ + (offsetof(struct xfs_btree_block, bb_u) + \ + offsetof(struct xfs_btree_block_lhdr, bb_blkno)) /* sizes of CRC enabled btree blocks */ -#define XFS_BTREE_SBLOCK_CRC_LEN (XFS_BTREE_SBLOCK_LEN + 40) -#define XFS_BTREE_LBLOCK_CRC_LEN (XFS_BTREE_LBLOCK_LEN + 48) +#define XFS_BTREE_SBLOCK_CRC_LEN \ + (offsetof(struct xfs_btree_block, bb_u) + \ + sizeof(struct xfs_btree_block_shdr)) +#define XFS_BTREE_LBLOCK_CRC_LEN \ + (offsetof(struct xfs_btree_block, bb_u) + \ + sizeof(struct xfs_btree_block_lhdr)) #define XFS_BTREE_SBLOCK_CRC_OFF \ offsetof(struct xfs_btree_block, bb_u.s.bb_crc) diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index fffe3d01bd9f..f5ec9c5ccae6 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -521,12 +521,8 @@ typedef struct xfs_swapext #define XFS_IOC_ERROR_CLEARALL _IOW ('X', 117, struct xfs_error_injection) /* XFS_IOC_ATTRCTL_BY_HANDLE -- deprecated 118 */ -/* XFS_IOC_FREEZE -- FIFREEZE 119 */ -/* XFS_IOC_THAW -- FITHAW 120 */ -#ifndef FIFREEZE -#define XFS_IOC_FREEZE _IOWR('X', 119, int) -#define XFS_IOC_THAW _IOWR('X', 120, int) -#endif +#define XFS_IOC_FREEZE _IOWR('X', 119, int) /* aka FIFREEZE */ +#define XFS_IOC_THAW _IOWR('X', 120, int) /* aka FITHAW */ #define XFS_IOC_FSSETDM_BY_HANDLE _IOW ('X', 121, struct xfs_fsop_setdm_handlereq) #define XFS_IOC_ATTRLIST_BY_HANDLE _IOW ('X', 122, struct xfs_fsop_attrlist_handlereq) diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 22297f9b0fd5..4b1e408169a8 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -1828,9 +1828,8 @@ xfs_difree_inode_chunk( if (!xfs_inobt_issparse(rec->ir_holemask)) { /* not sparse, calculate extent info directly */ - xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno, - XFS_AGINO_TO_AGBNO(mp, rec->ir_startino)), - mp->m_ialloc_blks, flist, mp); + xfs_bmap_add_free(mp, flist, XFS_AGB_TO_FSB(mp, agno, sagbno), + mp->m_ialloc_blks); return; } @@ -1873,8 +1872,8 @@ xfs_difree_inode_chunk( ASSERT(agbno % mp->m_sb.sb_spino_align == 0); ASSERT(contigblk % mp->m_sb.sb_spino_align == 0); - 
xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno, agbno), contigblk, - flist, mp); + xfs_bmap_add_free(mp, flist, XFS_AGB_TO_FSB(mp, agno, agbno), + contigblk); /* reset range to current bit and carry on... */ startidx = endidx = nextbit; @@ -2395,20 +2394,11 @@ void xfs_ialloc_compute_maxlevels( xfs_mount_t *mp) /* file system mount structure */ { - int level; - uint maxblocks; - uint maxleafents; - int minleafrecs; - int minnoderecs; - - maxleafents = (1LL << XFS_INO_AGINO_BITS(mp)) >> - XFS_INODES_PER_CHUNK_LOG; - minleafrecs = mp->m_inobt_mnr[0]; - minnoderecs = mp->m_inobt_mnr[1]; - maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs; - for (level = 1; maxblocks > 1; level++) - maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs; - mp->m_in_maxlevels = level; + uint inodes; + + inodes = (1LL << XFS_INO_AGINO_BITS(mp)) >> XFS_INODES_PER_CHUNK_LOG; + mp->m_in_maxlevels = xfs_btree_compute_maxlevels(mp, mp->m_inobt_mnr, + inodes); } /* diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 11faf7df14c8..bbcc8c7a44b3 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -231,6 +231,48 @@ xfs_iformat_fork( return error; } +void +xfs_init_local_fork( + struct xfs_inode *ip, + int whichfork, + const void *data, + int size) +{ + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + int mem_size = size, real_size = 0; + bool zero_terminate; + + /* + * If we are using the local fork to store a symlink body we need to + * zero-terminate it so that we can pass it back to the VFS directly. + * Overallocate the in-memory fork by one for that and add a zero + * to terminate it below. + */ + zero_terminate = S_ISLNK(VFS_I(ip)->i_mode); + if (zero_terminate) + mem_size++; + + if (size == 0) + ifp->if_u1.if_data = NULL; + else if (mem_size <= sizeof(ifp->if_u2.if_inline_data)) + ifp->if_u1.if_data = ifp->if_u2.if_inline_data; + else { + real_size = roundup(mem_size, 4); + ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP | KM_NOFS); + } + + if (size) { + memcpy(ifp->if_u1.if_data, data, size); + if (zero_terminate) + ifp->if_u1.if_data[size] = '\0'; + } + + ifp->if_bytes = size; + ifp->if_real_bytes = real_size; + ifp->if_flags &= ~(XFS_IFEXTENTS | XFS_IFBROOT); + ifp->if_flags |= XFS_IFINLINE; +} + /* * The file is in-lined in the on-disk inode. 
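* (For a symlink body, xfs_init_local_fork() above also over-allocates
* by one byte and NUL-terminates the copy, e.g. a 40-byte target ends
* up as if_bytes == 40 in a 41-byte in-memory buffer.)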
* If it fits into if_inline_data, then copy @@ -248,8 +290,6 @@ xfs_iformat_local( int whichfork, int size) { - xfs_ifork_t *ifp; - int real_size; /* * If the size is unreasonable, then something @@ -265,22 +305,8 @@ xfs_iformat_local( ip->i_mount, dip); return -EFSCORRUPTED; } - ifp = XFS_IFORK_PTR(ip, whichfork); - real_size = 0; - if (size == 0) - ifp->if_u1.if_data = NULL; - else if (size <= sizeof(ifp->if_u2.if_inline_data)) - ifp->if_u1.if_data = ifp->if_u2.if_inline_data; - else { - real_size = roundup(size, 4); - ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP | KM_NOFS); - } - ifp->if_bytes = size; - ifp->if_real_bytes = real_size; - if (size) - memcpy(ifp->if_u1.if_data, XFS_DFORK_PTR(dip, whichfork), size); - ifp->if_flags &= ~XFS_IFEXTENTS; - ifp->if_flags |= XFS_IFINLINE; + + xfs_init_local_fork(ip, whichfork, XFS_DFORK_PTR(dip, whichfork), size); return 0; } @@ -516,7 +542,6 @@ xfs_iroot_realloc( new_max = cur_max + rec_diff; new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max); ifp->if_broot = kmem_realloc(ifp->if_broot, new_size, - XFS_BMAP_BROOT_SPACE_CALC(mp, cur_max), KM_SLEEP | KM_NOFS); op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, ifp->if_broot_bytes); @@ -660,7 +685,6 @@ xfs_idata_realloc( ifp->if_u1.if_data = kmem_realloc(ifp->if_u1.if_data, real_size, - ifp->if_real_bytes, KM_SLEEP | KM_NOFS); } } else { @@ -1376,8 +1400,7 @@ xfs_iext_realloc_direct( if (rnew_size != ifp->if_real_bytes) { ifp->if_u1.if_extents = kmem_realloc(ifp->if_u1.if_extents, - rnew_size, - ifp->if_real_bytes, KM_NOFS); + rnew_size, KM_NOFS); } if (rnew_size > ifp->if_real_bytes) { memset(&ifp->if_u1.if_extents[ifp->if_bytes / @@ -1461,9 +1484,8 @@ xfs_iext_realloc_indirect( if (new_size == 0) { xfs_iext_destroy(ifp); } else { - ifp->if_u1.if_ext_irec = (xfs_ext_irec_t *) - kmem_realloc(ifp->if_u1.if_ext_irec, - new_size, size, KM_NOFS); + ifp->if_u1.if_ext_irec = + kmem_realloc(ifp->if_u1.if_ext_irec, new_size, KM_NOFS); } } @@ -1497,6 +1519,24 @@ xfs_iext_indirect_to_direct( } /* + * Remove all records from the indirection array. + */ +STATIC void +xfs_iext_irec_remove_all( + struct xfs_ifork *ifp) +{ + int nlists; + int i; + + ASSERT(ifp->if_flags & XFS_IFEXTIREC); + nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; + for (i = 0; i < nlists; i++) + kmem_free(ifp->if_u1.if_ext_irec[i].er_extbuf); + kmem_free(ifp->if_u1.if_ext_irec); + ifp->if_flags &= ~XFS_IFEXTIREC; +} + +/* * Free incore file extents. 
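* With the indirection array in use, xfs_iext_irec_remove_all() frees
* each er_extbuf and then the array itself in one pass, rather than
* paying for the per-list compaction that xfs_iext_irec_remove() used
* to do here.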
*/ void @@ -1504,14 +1544,7 @@ xfs_iext_destroy( xfs_ifork_t *ifp) /* inode fork pointer */ { if (ifp->if_flags & XFS_IFEXTIREC) { - int erp_idx; - int nlists; - - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - for (erp_idx = nlists - 1; erp_idx >= 0 ; erp_idx--) { - xfs_iext_irec_remove(ifp, erp_idx); - } - ifp->if_flags &= ~XFS_IFEXTIREC; + xfs_iext_irec_remove_all(ifp); } else if (ifp->if_real_bytes) { kmem_free(ifp->if_u1.if_extents); } else if (ifp->if_bytes) { diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index 7d3b1ed6dcbe..f95e072ae646 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -134,6 +134,7 @@ void xfs_iroot_realloc(struct xfs_inode *, int, int); int xfs_iread_extents(struct xfs_trans *, struct xfs_inode *, int); int xfs_iextents_copy(struct xfs_inode *, struct xfs_bmbt_rec *, int); +void xfs_init_local_fork(struct xfs_inode *, int, const void *, int); struct xfs_bmbt_rec_host * xfs_iext_get_ext(struct xfs_ifork *, xfs_extnum_t); diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index d54a8018b079..e8f49c029ff0 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -212,6 +212,11 @@ typedef struct xfs_trans_header { #define XFS_TRANS_HEADER_MAGIC 0x5452414e /* TRAN */ /* + * The only type valid for th_type in CIL-enabled file system logs: + */ +#define XFS_TRANS_CHECKPOINT 40 + +/* * Log item types. */ #define XFS_LI_EFI 0x1236 diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index 951c044e24e4..e2e1106c9fad 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -70,7 +70,7 @@ const struct xfs_buf_ops xfs_rtbuf_ops = { * Get a buffer for the bitmap or summary file block specified. * The buffer is returned read and locked. */ -int +static int xfs_rtbuf_get( xfs_mount_t *mp, /* file system mount structure */ xfs_trans_t *tp, /* transaction pointer */ diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 8a53eaa349f4..12ca86778e02 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -838,12 +838,10 @@ xfs_sync_sb( struct xfs_trans *tp; int error; - tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_CHANGE, KM_SLEEP); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_sb, 0, 0); - if (error) { - xfs_trans_cancel(tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_sb, 0, 0, + XFS_TRANS_NO_WRITECOUNT, &tp); + if (error) return error; - } xfs_log_sb(tp); if (wait) diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h index 81ac870834da..16002b5ec4eb 100644 --- a/fs/xfs/libxfs/xfs_shared.h +++ b/fs/xfs/libxfs/xfs_shared.h @@ -56,103 +56,6 @@ extern const struct xfs_buf_ops xfs_symlink_buf_ops; extern const struct xfs_buf_ops xfs_rtbuf_ops; /* - * Transaction types. Used to distinguish types of buffers. These never reach - * the log. 
- */ -#define XFS_TRANS_SETATTR_NOT_SIZE 1 -#define XFS_TRANS_SETATTR_SIZE 2 -#define XFS_TRANS_INACTIVE 3 -#define XFS_TRANS_CREATE 4 -#define XFS_TRANS_CREATE_TRUNC 5 -#define XFS_TRANS_TRUNCATE_FILE 6 -#define XFS_TRANS_REMOVE 7 -#define XFS_TRANS_LINK 8 -#define XFS_TRANS_RENAME 9 -#define XFS_TRANS_MKDIR 10 -#define XFS_TRANS_RMDIR 11 -#define XFS_TRANS_SYMLINK 12 -#define XFS_TRANS_SET_DMATTRS 13 -#define XFS_TRANS_GROWFS 14 -#define XFS_TRANS_STRAT_WRITE 15 -#define XFS_TRANS_DIOSTRAT 16 -/* 17 was XFS_TRANS_WRITE_SYNC */ -#define XFS_TRANS_WRITEID 18 -#define XFS_TRANS_ADDAFORK 19 -#define XFS_TRANS_ATTRINVAL 20 -#define XFS_TRANS_ATRUNCATE 21 -#define XFS_TRANS_ATTR_SET 22 -#define XFS_TRANS_ATTR_RM 23 -#define XFS_TRANS_ATTR_FLAG 24 -#define XFS_TRANS_CLEAR_AGI_BUCKET 25 -#define XFS_TRANS_SB_CHANGE 26 -/* - * Dummy entries since we use the transaction type to index into the - * trans_type[] in xlog_recover_print_trans_head() - */ -#define XFS_TRANS_DUMMY1 27 -#define XFS_TRANS_DUMMY2 28 -#define XFS_TRANS_QM_QUOTAOFF 29 -#define XFS_TRANS_QM_DQALLOC 30 -#define XFS_TRANS_QM_SETQLIM 31 -#define XFS_TRANS_QM_DQCLUSTER 32 -#define XFS_TRANS_QM_QINOCREATE 33 -#define XFS_TRANS_QM_QUOTAOFF_END 34 -#define XFS_TRANS_FSYNC_TS 35 -#define XFS_TRANS_GROWFSRT_ALLOC 36 -#define XFS_TRANS_GROWFSRT_ZERO 37 -#define XFS_TRANS_GROWFSRT_FREE 38 -#define XFS_TRANS_SWAPEXT 39 -#define XFS_TRANS_CHECKPOINT 40 -#define XFS_TRANS_ICREATE 41 -#define XFS_TRANS_CREATE_TMPFILE 42 -#define XFS_TRANS_TYPE_MAX 43 -/* new transaction types need to be reflected in xfs_logprint(8) */ - -#define XFS_TRANS_TYPES \ - { XFS_TRANS_SETATTR_NOT_SIZE, "SETATTR_NOT_SIZE" }, \ - { XFS_TRANS_SETATTR_SIZE, "SETATTR_SIZE" }, \ - { XFS_TRANS_INACTIVE, "INACTIVE" }, \ - { XFS_TRANS_CREATE, "CREATE" }, \ - { XFS_TRANS_CREATE_TRUNC, "CREATE_TRUNC" }, \ - { XFS_TRANS_TRUNCATE_FILE, "TRUNCATE_FILE" }, \ - { XFS_TRANS_REMOVE, "REMOVE" }, \ - { XFS_TRANS_LINK, "LINK" }, \ - { XFS_TRANS_RENAME, "RENAME" }, \ - { XFS_TRANS_MKDIR, "MKDIR" }, \ - { XFS_TRANS_RMDIR, "RMDIR" }, \ - { XFS_TRANS_SYMLINK, "SYMLINK" }, \ - { XFS_TRANS_SET_DMATTRS, "SET_DMATTRS" }, \ - { XFS_TRANS_GROWFS, "GROWFS" }, \ - { XFS_TRANS_STRAT_WRITE, "STRAT_WRITE" }, \ - { XFS_TRANS_DIOSTRAT, "DIOSTRAT" }, \ - { XFS_TRANS_WRITEID, "WRITEID" }, \ - { XFS_TRANS_ADDAFORK, "ADDAFORK" }, \ - { XFS_TRANS_ATTRINVAL, "ATTRINVAL" }, \ - { XFS_TRANS_ATRUNCATE, "ATRUNCATE" }, \ - { XFS_TRANS_ATTR_SET, "ATTR_SET" }, \ - { XFS_TRANS_ATTR_RM, "ATTR_RM" }, \ - { XFS_TRANS_ATTR_FLAG, "ATTR_FLAG" }, \ - { XFS_TRANS_CLEAR_AGI_BUCKET, "CLEAR_AGI_BUCKET" }, \ - { XFS_TRANS_SB_CHANGE, "SBCHANGE" }, \ - { XFS_TRANS_DUMMY1, "DUMMY1" }, \ - { XFS_TRANS_DUMMY2, "DUMMY2" }, \ - { XFS_TRANS_QM_QUOTAOFF, "QM_QUOTAOFF" }, \ - { XFS_TRANS_QM_DQALLOC, "QM_DQALLOC" }, \ - { XFS_TRANS_QM_SETQLIM, "QM_SETQLIM" }, \ - { XFS_TRANS_QM_DQCLUSTER, "QM_DQCLUSTER" }, \ - { XFS_TRANS_QM_QINOCREATE, "QM_QINOCREATE" }, \ - { XFS_TRANS_QM_QUOTAOFF_END, "QM_QOFF_END" }, \ - { XFS_TRANS_FSYNC_TS, "FSYNC_TS" }, \ - { XFS_TRANS_GROWFSRT_ALLOC, "GROWFSRT_ALLOC" }, \ - { XFS_TRANS_GROWFSRT_ZERO, "GROWFSRT_ZERO" }, \ - { XFS_TRANS_GROWFSRT_FREE, "GROWFSRT_FREE" }, \ - { XFS_TRANS_SWAPEXT, "SWAPEXT" }, \ - { XFS_TRANS_CHECKPOINT, "CHECKPOINT" }, \ - { XFS_TRANS_ICREATE, "ICREATE" }, \ - { XFS_TRANS_CREATE_TMPFILE, "CREATE_TMPFILE" }, \ - { XLOG_UNMOUNT_REC_TYPE, "UNMOUNT" } - -/* * This structure is used to track log items associated with * a transaction. 
It points to the log item and keeps some * flags to track the state of the log item. It also tracks @@ -181,8 +84,9 @@ int xfs_log_calc_minimum_size(struct xfs_mount *); #define XFS_TRANS_SYNC 0x08 /* make commit synchronous */ #define XFS_TRANS_DQ_DIRTY 0x10 /* at least one dquot in trx dirty */ #define XFS_TRANS_RESERVE 0x20 /* OK to use reserved data blocks */ -#define XFS_TRANS_FREEZE_PROT 0x40 /* Transaction has elevated writer - count in superblock */ +#define XFS_TRANS_NO_WRITECOUNT 0x40 /* do not elevate SB writecount */ +#define XFS_TRANS_NOFS 0x80 /* pass KM_NOFS to kmem_alloc */ + /* * Field values for xfs_trans_mod_sb. */ diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index c535887c60a8..7575cfc3ad15 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -84,23 +84,80 @@ xfs_find_bdev_for_inode( } /* - * We're now finished for good with this ioend structure. - * Update the page state via the associated buffer_heads, - * release holds on the inode and bio, and finally free - * up memory. Do not use the ioend after this. + * We're now finished for good with this page. Update the page state via the + * associated buffer_heads, paying attention to the start and end offsets that + * we need to process on the page. + * + * Landmine Warning: bh->b_end_io() will call end_page_writeback() on the last + * buffer in the IO. Once it does this, it is unsafe to access the bufferhead or + * the page at all, as we may be racing with memory reclaim and it can free both + * the bufferhead chain and the page as it will see the page as clean and + * unused. + */ +static void +xfs_finish_page_writeback( + struct inode *inode, + struct bio_vec *bvec, + int error) +{ + unsigned int end = bvec->bv_offset + bvec->bv_len - 1; + struct buffer_head *head, *bh, *next; + unsigned int off = 0; + unsigned int bsize; + + ASSERT(bvec->bv_offset < PAGE_SIZE); + ASSERT((bvec->bv_offset & ((1 << inode->i_blkbits) - 1)) == 0); + ASSERT(end < PAGE_SIZE); + ASSERT((bvec->bv_len & ((1 << inode->i_blkbits) - 1)) == 0); + + bh = head = page_buffers(bvec->bv_page); + + bsize = bh->b_size; + do { + next = bh->b_this_page; + if (off < bvec->bv_offset) + goto next_bh; + if (off > end) + break; + bh->b_end_io(bh, !error); +next_bh: + off += bsize; + } while ((bh = next) != head); +} + +/* + * We're now finished for good with this ioend structure. Update the page + * state, release holds on bios, and finally free up memory. Do not use the + * ioend after this. */ STATIC void xfs_destroy_ioend( - xfs_ioend_t *ioend) + struct xfs_ioend *ioend, + int error) { - struct buffer_head *bh, *next; + struct inode *inode = ioend->io_inode; + struct bio *last = ioend->io_bio; + struct bio *bio, *next; - for (bh = ioend->io_buffer_head; bh; bh = next) { - next = bh->b_private; - bh->b_end_io(bh, !ioend->io_error); - } + for (bio = &ioend->io_inline_bio; bio; bio = next) { + struct bio_vec *bvec; + int i; - mempool_free(ioend, xfs_ioend_pool); + /* + * For the last bio, bi_private points to the ioend, so we + * need to explicitly end the iteration here. 
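+ *
+ * A sketch of the walk, assuming two chained bios: start at
+ * &ioend->io_inline_bio, follow bi_private to the second bio, and
+ * stop once bio == last (the ioend's io_bio), whose bi_private is
+ * the ioend itself rather than another bio.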
+ */ + if (bio == last) + next = NULL; + else + next = bio->bi_private; + + /* walk each page on bio, ending page IO on them */ + bio_for_each_segment_all(bvec, bio, i) + xfs_finish_page_writeback(inode, bvec, error); + + bio_put(bio); + } } /* @@ -120,13 +177,9 @@ xfs_setfilesize_trans_alloc( struct xfs_trans *tp; int error; - tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); - - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0); - if (error) { - xfs_trans_cancel(tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp); + if (error) return error; - } ioend->io_append_trans = tp; @@ -174,7 +227,8 @@ xfs_setfilesize( STATIC int xfs_setfilesize_ioend( - struct xfs_ioend *ioend) + struct xfs_ioend *ioend, + int error) { struct xfs_inode *ip = XFS_I(ioend->io_inode); struct xfs_trans *tp = ioend->io_append_trans; @@ -188,53 +242,32 @@ xfs_setfilesize_ioend( __sb_writers_acquired(VFS_I(ip)->i_sb, SB_FREEZE_FS); /* we abort the update if there was an IO error */ - if (ioend->io_error) { + if (error) { xfs_trans_cancel(tp); - return ioend->io_error; + return error; } return xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size); } /* - * Schedule IO completion handling on the final put of an ioend. - * - * If there is no work to do we might as well call it a day and free the - * ioend right now. - */ -STATIC void -xfs_finish_ioend( - struct xfs_ioend *ioend) -{ - if (atomic_dec_and_test(&ioend->io_remaining)) { - struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount; - - if (ioend->io_type == XFS_IO_UNWRITTEN) - queue_work(mp->m_unwritten_workqueue, &ioend->io_work); - else if (ioend->io_append_trans) - queue_work(mp->m_data_workqueue, &ioend->io_work); - else - xfs_destroy_ioend(ioend); - } -} - -/* * IO write completion. */ STATIC void xfs_end_io( struct work_struct *work) { - xfs_ioend_t *ioend = container_of(work, xfs_ioend_t, io_work); - struct xfs_inode *ip = XFS_I(ioend->io_inode); - int error = 0; + struct xfs_ioend *ioend = + container_of(work, struct xfs_ioend, io_work); + struct xfs_inode *ip = XFS_I(ioend->io_inode); + int error = ioend->io_bio->bi_error; /* * Set an error if the mount has shut down and proceed with end I/O * processing so it can perform whatever cleanups are necessary. */ if (XFS_FORCED_SHUTDOWN(ip->i_mount)) - ioend->io_error = -EIO; + error = -EIO; /* * For unwritten extents we need to issue transactions to convert a @@ -244,55 +277,33 @@ xfs_end_io( * on error. */ if (ioend->io_type == XFS_IO_UNWRITTEN) { - if (ioend->io_error) + if (error) goto done; error = xfs_iomap_write_unwritten(ip, ioend->io_offset, ioend->io_size); } else if (ioend->io_append_trans) { - error = xfs_setfilesize_ioend(ioend); + error = xfs_setfilesize_ioend(ioend, error); } else { ASSERT(!xfs_ioend_is_append(ioend)); } done: - if (error) - ioend->io_error = error; - xfs_destroy_ioend(ioend); + xfs_destroy_ioend(ioend, error); } -/* - * Allocate and initialise an IO completion structure. - * We need to track unwritten extent write completion here initially. - * We'll need to extend this for updating the ondisk inode size later - * (vs. incore size). - */ -STATIC xfs_ioend_t * -xfs_alloc_ioend( - struct inode *inode, - unsigned int type) +STATIC void +xfs_end_bio( + struct bio *bio) { - xfs_ioend_t *ioend; - - ioend = mempool_alloc(xfs_ioend_pool, GFP_NOFS); - - /* - * Set the count to 1 initially, which will prevent an I/O - * completion callback from happening before we have started - * all the I/O from calling the completion routine too early. 
- */ - atomic_set(&ioend->io_remaining, 1); - ioend->io_error = 0; - INIT_LIST_HEAD(&ioend->io_list); - ioend->io_type = type; - ioend->io_inode = inode; - ioend->io_buffer_head = NULL; - ioend->io_buffer_tail = NULL; - ioend->io_offset = 0; - ioend->io_size = 0; - ioend->io_append_trans = NULL; + struct xfs_ioend *ioend = bio->bi_private; + struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount; - INIT_WORK(&ioend->io_work, xfs_end_io); - return ioend; + if (ioend->io_type == XFS_IO_UNWRITTEN) + queue_work(mp->m_unwritten_workqueue, &ioend->io_work); + else if (ioend->io_append_trans) + queue_work(mp->m_data_workqueue, &ioend->io_work); + else + xfs_destroy_ioend(ioend, bio->bi_error); } STATIC int @@ -364,50 +375,6 @@ xfs_imap_valid( offset < imap->br_startoff + imap->br_blockcount; } -/* - * BIO completion handler for buffered IO. - */ -STATIC void -xfs_end_bio( - struct bio *bio) -{ - xfs_ioend_t *ioend = bio->bi_private; - - if (!ioend->io_error) - ioend->io_error = bio->bi_error; - - /* Toss bio and pass work off to an xfsdatad thread */ - bio->bi_private = NULL; - bio->bi_end_io = NULL; - bio_put(bio); - - xfs_finish_ioend(ioend); -} - -STATIC void -xfs_submit_ioend_bio( - struct writeback_control *wbc, - xfs_ioend_t *ioend, - struct bio *bio) -{ - atomic_inc(&ioend->io_remaining); - bio->bi_private = ioend; - bio->bi_end_io = xfs_end_bio; - submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE, bio); -} - -STATIC struct bio * -xfs_alloc_ioend_bio( - struct buffer_head *bh) -{ - struct bio *bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES); - - ASSERT(bio->bi_private == NULL); - bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); - bio->bi_bdev = bh->b_bdev; - return bio; -} - STATIC void xfs_start_buffer_writeback( struct buffer_head *bh) @@ -452,28 +419,36 @@ static inline int xfs_bio_add_buffer(struct bio *bio, struct buffer_head *bh) } /* - * Submit all of the bios for an ioend. We are only passed a single ioend at a - * time; the caller is responsible for chaining prior to submission. + * Submit the bio for an ioend. We are passed an ioend with a bio attached to + * it, and we submit that bio. The ioend may be used for multiple bio + * submissions, so we only want to allocate an append transaction for the ioend + * once. In the case of multiple bio submission, each bio will take an IO + * reference to the ioend to ensure that the ioend completion is only done once + * all bios have been submitted and the ioend is really done. * * If @fail is non-zero, it means that we have a situation where some part of * the submission process has failed after we have marked paged for writeback - * and unlocked them. In this situation, we need to fail the ioend chain rather - * than submit it to IO. This typically only happens on a filesystem shutdown. + * and unlocked them. In this situation, we need to fail the bio and ioend + * rather than submit it to IO. This typically only happens on a filesystem + * shutdown. */ STATIC int xfs_submit_ioend( struct writeback_control *wbc, - xfs_ioend_t *ioend, + struct xfs_ioend *ioend, int status) { - struct buffer_head *bh; - struct bio *bio; - sector_t lastblock = 0; - /* Reserve log space if we might write beyond the on-disk inode size. 
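* Note the !ioend->io_append_trans test below: an ioend can be
* submitted once per chained bio, and only the first submission may
* allocate the append transaction.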
*/ if (!status && - ioend->io_type != XFS_IO_UNWRITTEN && xfs_ioend_is_append(ioend)) + ioend->io_type != XFS_IO_UNWRITTEN && + xfs_ioend_is_append(ioend) && + !ioend->io_append_trans) status = xfs_setfilesize_trans_alloc(ioend); + + ioend->io_bio->bi_private = ioend; + ioend->io_bio->bi_end_io = xfs_end_bio; + bio_set_op_attrs(ioend->io_bio, REQ_OP_WRITE, + (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : 0); /* * If we are failing the IO now, just mark the ioend with an * error and finish it. This will run IO completion immediately @@ -481,33 +456,73 @@ xfs_submit_ioend( * time. */ if (status) { - ioend->io_error = status; - xfs_finish_ioend(ioend); + ioend->io_bio->bi_error = status; + bio_endio(ioend->io_bio); return status; } - bio = NULL; - for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) { + submit_bio(ioend->io_bio); + return 0; +} - if (!bio) { -retry: - bio = xfs_alloc_ioend_bio(bh); - } else if (bh->b_blocknr != lastblock + 1) { - xfs_submit_ioend_bio(wbc, ioend, bio); - goto retry; - } +static void +xfs_init_bio_from_bh( + struct bio *bio, + struct buffer_head *bh) +{ + bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); + bio->bi_bdev = bh->b_bdev; +} - if (xfs_bio_add_buffer(bio, bh) != bh->b_size) { - xfs_submit_ioend_bio(wbc, ioend, bio); - goto retry; - } +static struct xfs_ioend * +xfs_alloc_ioend( + struct inode *inode, + unsigned int type, + xfs_off_t offset, + struct buffer_head *bh) +{ + struct xfs_ioend *ioend; + struct bio *bio; - lastblock = bh->b_blocknr; - } - if (bio) - xfs_submit_ioend_bio(wbc, ioend, bio); - xfs_finish_ioend(ioend); - return 0; + bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, xfs_ioend_bioset); + xfs_init_bio_from_bh(bio, bh); + + ioend = container_of(bio, struct xfs_ioend, io_inline_bio); + INIT_LIST_HEAD(&ioend->io_list); + ioend->io_type = type; + ioend->io_inode = inode; + ioend->io_size = 0; + ioend->io_offset = offset; + INIT_WORK(&ioend->io_work, xfs_end_io); + ioend->io_append_trans = NULL; + ioend->io_bio = bio; + return ioend; +} + +/* + * Allocate a new bio, and chain the old bio to the new one. + * + * Note that we have to do perform the chaining in this unintuitive order + * so that the bi_private linkage is set up in the right direction for the + * traversal in xfs_destroy_ioend(). + */ +static void +xfs_chain_bio( + struct xfs_ioend *ioend, + struct writeback_control *wbc, + struct buffer_head *bh) +{ + struct bio *new; + + new = bio_alloc(GFP_NOFS, BIO_MAX_PAGES); + xfs_init_bio_from_bh(new, bh); + + bio_chain(ioend->io_bio, new); + bio_get(ioend->io_bio); /* for xfs_destroy_ioend */ + bio_set_op_attrs(ioend->io_bio, REQ_OP_WRITE, + (wbc->sync_mode == WB_SYNC_ALL) ? 
WRITE_SYNC : 0); + submit_bio(ioend->io_bio); + ioend->io_bio = new; } /* @@ -523,27 +538,24 @@ xfs_add_to_ioend( struct buffer_head *bh, xfs_off_t offset, struct xfs_writepage_ctx *wpc, + struct writeback_control *wbc, struct list_head *iolist) { if (!wpc->ioend || wpc->io_type != wpc->ioend->io_type || bh->b_blocknr != wpc->last_block + 1 || offset != wpc->ioend->io_offset + wpc->ioend->io_size) { - struct xfs_ioend *new; - if (wpc->ioend) list_add(&wpc->ioend->io_list, iolist); - - new = xfs_alloc_ioend(inode, wpc->io_type); - new->io_offset = offset; - new->io_buffer_head = bh; - new->io_buffer_tail = bh; - wpc->ioend = new; - } else { - wpc->ioend->io_buffer_tail->b_private = bh; - wpc->ioend->io_buffer_tail = bh; + wpc->ioend = xfs_alloc_ioend(inode, wpc->io_type, offset, bh); } - bh->b_private = NULL; + /* + * If the buffer doesn't fit into the bio we need to allocate a new + * one. This shouldn't happen more than once for a given buffer. + */ + while (xfs_bio_add_buffer(wpc->ioend->io_bio, bh) != bh->b_size) + xfs_chain_bio(wpc->ioend, wbc, bh); + wpc->ioend->io_size += bh->b_size; wpc->last_block = bh->b_blocknr; xfs_start_buffer_writeback(bh); @@ -803,7 +815,7 @@ xfs_writepage_map( lock_buffer(bh); if (wpc->io_type != XFS_IO_OVERWRITE) xfs_map_at_offset(inode, bh, &wpc->imap, offset); - xfs_add_to_ioend(inode, bh, offset, wpc, &submit_list); + xfs_add_to_ioend(inode, bh, offset, wpc, wbc, &submit_list); count++; } @@ -1038,6 +1050,20 @@ xfs_vm_releasepage( trace_xfs_releasepage(page->mapping->host, page, 0, 0); + /* + * mm accommodates an old ext3 case where clean pages might not have had + * the dirty bit cleared. Thus, it can send actual dirty pages to + * ->releasepage() via shrink_active_list(). Conversely, + * block_invalidatepage() can send pages that are still marked dirty + * but otherwise have invalidated buffers. + * + * We've historically freed buffers on the latter. Instead, quietly + * filter out all dirty pages to avoid spurious buffer state warnings. + * This can likely be removed once shrink_active_list() is fixed. + */ + if (PageDirty(page)) + return 0; + xfs_count_page_state(page, &delalloc, &unwritten); if (WARN_ON_ONCE(delalloc)) @@ -1141,6 +1167,8 @@ __xfs_get_blocks( ssize_t size; int new = 0; + BUG_ON(create && !direct); + if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; @@ -1148,22 +1176,14 @@ __xfs_get_blocks( ASSERT(bh_result->b_size >= (1 << inode->i_blkbits)); size = bh_result->b_size; - if (!create && direct && offset >= i_size_read(inode)) + if (!create && offset >= i_size_read(inode)) return 0; /* * Direct I/O is usually done on preallocated files, so try getting - * a block mapping without an exclusive lock first. For buffered - * writes we already have the exclusive iolock anyway, so avoiding - * a lock roundtrip here by taking the ilock exclusive from the - * beginning is a useful micro optimization. + * a block mapping without an exclusive lock first. */ - if (create && !direct) { - lockmode = XFS_ILOCK_EXCL; - xfs_ilock(ip, lockmode); - } else { - lockmode = xfs_ilock_data_map_shared(ip); - } + lockmode = xfs_ilock_data_map_shared(ip); ASSERT(offset <= mp->m_super->s_maxbytes); if (offset + size > mp->m_super->s_maxbytes) @@ -1182,37 +1202,19 @@ __xfs_get_blocks( (imap.br_startblock == HOLESTARTBLOCK || imap.br_startblock == DELAYSTARTBLOCK) || (IS_DAX(inode) && ISUNWRITTEN(&imap)))) { - if (direct || xfs_get_extsz_hint(ip)) { - /* - * xfs_iomap_write_direct() expects the shared lock. It - * is unlocked on return. 
- */ - if (lockmode == XFS_ILOCK_EXCL) - xfs_ilock_demote(ip, lockmode); - - error = xfs_iomap_write_direct(ip, offset, size, - &imap, nimaps); - if (error) - return error; - new = 1; + /* + * xfs_iomap_write_direct() expects the shared lock. It + * is unlocked on return. + */ + if (lockmode == XFS_ILOCK_EXCL) + xfs_ilock_demote(ip, lockmode); - } else { - /* - * Delalloc reservations do not require a transaction, - * we can go on without dropping the lock here. If we - * are allocating a new delalloc block, make sure that - * we set the new flag so that we mark the buffer new so - * that we know that it is newly allocated if the write - * fails. - */ - if (nimaps && imap.br_startblock == HOLESTARTBLOCK) - new = 1; - error = xfs_iomap_write_delay(ip, offset, size, &imap); - if (error) - goto out_unlock; + error = xfs_iomap_write_direct(ip, offset, size, + &imap, nimaps); + if (error) + return error; + new = 1; - xfs_iunlock(ip, lockmode); - } trace_xfs_get_blocks_alloc(ip, offset, size, ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN : XFS_IO_DELALLOC, &imap); @@ -1233,9 +1235,7 @@ __xfs_get_blocks( } /* trim mapping down to size requested */ - if (direct || size > (1 << inode->i_blkbits)) - xfs_map_trim_size(inode, iblock, bh_result, - &imap, offset, size); + xfs_map_trim_size(inode, iblock, bh_result, &imap, offset, size); /* * For unwritten extents do not report a disk address in the buffered @@ -1248,7 +1248,7 @@ __xfs_get_blocks( if (ISUNWRITTEN(&imap)) set_buffer_unwritten(bh_result); /* direct IO needs special help */ - if (create && direct) { + if (create) { if (dax_fault) ASSERT(!ISUNWRITTEN(&imap)); else @@ -1277,14 +1277,7 @@ __xfs_get_blocks( (new || ISUNWRITTEN(&imap)))) set_buffer_new(bh_result); - if (imap.br_startblock == DELAYSTARTBLOCK) { - BUG_ON(direct); - if (create) { - set_buffer_uptodate(bh_result); - set_buffer_mapped(bh_result); - set_buffer_delay(bh_result); - } - } + BUG_ON(direct && imap.br_startblock == DELAYSTARTBLOCK); return 0; @@ -1334,7 +1327,7 @@ xfs_get_blocks_dax_fault( * whereas if we have flags set we will always be called in task context * (i.e. from a workqueue). */ -STATIC int +int xfs_end_io_direct_write( struct kiocb *iocb, loff_t offset, @@ -1391,13 +1384,10 @@ xfs_end_io_direct_write( trace_xfs_end_io_direct_write_append(ip, offset, size); - tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0); - if (error) { - xfs_trans_cancel(tp); - return error; - } - error = xfs_setfilesize(ip, tp, offset, size); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, + &tp); + if (!error) + error = xfs_setfilesize(ip, tp, offset, size); } return error; @@ -1408,234 +1398,10 @@ xfs_vm_direct_IO( struct kiocb *iocb, struct iov_iter *iter) { - struct inode *inode = iocb->ki_filp->f_mapping->host; - dio_iodone_t *endio = NULL; - int flags = 0; - struct block_device *bdev; - - if (iov_iter_rw(iter) == WRITE) { - endio = xfs_end_io_direct_write; - flags = DIO_ASYNC_EXTEND; - } - - if (IS_DAX(inode)) { - return dax_do_io(iocb, inode, iter, - xfs_get_blocks_direct, endio, 0); - } - - bdev = xfs_find_bdev_for_inode(inode); - return __blockdev_direct_IO(iocb, inode, bdev, iter, - xfs_get_blocks_direct, endio, NULL, flags); -} - -/* - * Punch out the delalloc blocks we have already allocated. - * - * Don't bother with xfs_setattr given that nothing can have made it to disk yet - * as the page is still locked at this point. 
- */ -STATIC void -xfs_vm_kill_delalloc_range( - struct inode *inode, - loff_t start, - loff_t end) -{ - struct xfs_inode *ip = XFS_I(inode); - xfs_fileoff_t start_fsb; - xfs_fileoff_t end_fsb; - int error; - - start_fsb = XFS_B_TO_FSB(ip->i_mount, start); - end_fsb = XFS_B_TO_FSB(ip->i_mount, end); - if (end_fsb <= start_fsb) - return; - - xfs_ilock(ip, XFS_ILOCK_EXCL); - error = xfs_bmap_punch_delalloc_range(ip, start_fsb, - end_fsb - start_fsb); - if (error) { - /* something screwed, just bail */ - if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { - xfs_alert(ip->i_mount, - "xfs_vm_write_failed: unable to clean up ino %lld", - ip->i_ino); - } - } - xfs_iunlock(ip, XFS_ILOCK_EXCL); -} - -STATIC void -xfs_vm_write_failed( - struct inode *inode, - struct page *page, - loff_t pos, - unsigned len) -{ - loff_t block_offset; - loff_t block_start; - loff_t block_end; - loff_t from = pos & (PAGE_SIZE - 1); - loff_t to = from + len; - struct buffer_head *bh, *head; - struct xfs_mount *mp = XFS_I(inode)->i_mount; - /* - * The request pos offset might be 32 or 64 bit, this is all fine - * on 64-bit platform. However, for 64-bit pos request on 32-bit - * platform, the high 32-bit will be masked off if we evaluate the - * block_offset via (pos & PAGE_MASK) because the PAGE_MASK is - * 0xfffff000 as an unsigned long, hence the result is incorrect - * which could cause the following ASSERT failed in most cases. - * In order to avoid this, we can evaluate the block_offset of the - * start of the page by using shifts rather than masks the mismatch - * problem. + * We just need the method present so that open/fcntl allow direct I/O. */ - block_offset = (pos >> PAGE_SHIFT) << PAGE_SHIFT; - - ASSERT(block_offset + from == pos); - - head = page_buffers(page); - block_start = 0; - for (bh = head; bh != head || !block_start; - bh = bh->b_this_page, block_start = block_end, - block_offset += bh->b_size) { - block_end = block_start + bh->b_size; - - /* skip buffers before the write */ - if (block_end <= from) - continue; - - /* if the buffer is after the write, we're done */ - if (block_start >= to) - break; - - /* - * Process delalloc and unwritten buffers beyond EOF. We can - * encounter unwritten buffers in the event that a file has - * post-EOF unwritten extents and an extending write happens to - * fail (e.g., an unaligned write that also involves a delalloc - * to the same page). - */ - if (!buffer_delay(bh) && !buffer_unwritten(bh)) - continue; - - if (!xfs_mp_fail_writes(mp) && !buffer_new(bh) && - block_offset < i_size_read(inode)) - continue; - - if (buffer_delay(bh)) - xfs_vm_kill_delalloc_range(inode, block_offset, - block_offset + bh->b_size); - - /* - * This buffer does not contain data anymore. make sure anyone - * who finds it knows that for certain. - */ - clear_buffer_delay(bh); - clear_buffer_uptodate(bh); - clear_buffer_mapped(bh); - clear_buffer_new(bh); - clear_buffer_dirty(bh); - clear_buffer_unwritten(bh); - } - -} - -/* - * This used to call block_write_begin(), but it unlocks and releases the page - * on error, and we need that page to be able to punch stale delalloc blocks out - * on failure. hence we copy-n-waste it here and call xfs_vm_write_failed() at - * the appropriate point. 
- */ -STATIC int -xfs_vm_write_begin( - struct file *file, - struct address_space *mapping, - loff_t pos, - unsigned len, - unsigned flags, - struct page **pagep, - void **fsdata) -{ - pgoff_t index = pos >> PAGE_SHIFT; - struct page *page; - int status; - struct xfs_mount *mp = XFS_I(mapping->host)->i_mount; - - ASSERT(len <= PAGE_SIZE); - - page = grab_cache_page_write_begin(mapping, index, flags); - if (!page) - return -ENOMEM; - - status = __block_write_begin(page, pos, len, xfs_get_blocks); - if (xfs_mp_fail_writes(mp)) - status = -EIO; - if (unlikely(status)) { - struct inode *inode = mapping->host; - size_t isize = i_size_read(inode); - - xfs_vm_write_failed(inode, page, pos, len); - unlock_page(page); - - /* - * If the write is beyond EOF, we only want to kill blocks - * allocated in this write, not blocks that were previously - * written successfully. - */ - if (xfs_mp_fail_writes(mp)) - isize = 0; - if (pos + len > isize) { - ssize_t start = max_t(ssize_t, pos, isize); - - truncate_pagecache_range(inode, start, pos + len); - } - - put_page(page); - page = NULL; - } - - *pagep = page; - return status; -} - -/* - * On failure, we only need to kill delalloc blocks beyond EOF in the range of - * this specific write because they will never be written. Previous writes - * beyond EOF where block allocation succeeded do not need to be trashed, so - * only new blocks from this write should be trashed. For blocks within - * EOF, generic_write_end() zeros them so they are safe to leave alone and be - * written with all the other valid data. - */ -STATIC int -xfs_vm_write_end( - struct file *file, - struct address_space *mapping, - loff_t pos, - unsigned len, - unsigned copied, - struct page *page, - void *fsdata) -{ - int ret; - - ASSERT(len <= PAGE_SIZE); - - ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); - if (unlikely(ret < len)) { - struct inode *inode = mapping->host; - size_t isize = i_size_read(inode); - loff_t to = pos + len; - - if (to > isize) { - /* only kill blocks in this write beyond EOF */ - if (pos > isize) - isize = pos; - xfs_vm_kill_delalloc_range(inode, isize, to); - truncate_pagecache_range(inode, isize, to); - } - } - return ret; + return -EINVAL; } STATIC sector_t @@ -1748,8 +1514,6 @@ const struct address_space_operations xfs_address_space_operations = { .set_page_dirty = xfs_vm_set_page_dirty, .releasepage = xfs_vm_releasepage, .invalidatepage = xfs_vm_invalidatepage, - .write_begin = xfs_vm_write_begin, - .write_end = xfs_vm_write_end, .bmap = xfs_vm_bmap, .direct_IO = xfs_vm_direct_IO, .migratepage = buffer_migrate_page, diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h index b4421177b68d..bf2d9a141a73 100644 --- a/fs/xfs/xfs_aops.h +++ b/fs/xfs/xfs_aops.h @@ -18,7 +18,7 @@ #ifndef __XFS_AOPS_H__ #define __XFS_AOPS_H__ -extern mempool_t *xfs_ioend_pool; +extern struct bio_set *xfs_ioend_bioset; /* * Types of I/O for bmap clustering and I/O completion tracking. @@ -37,22 +37,19 @@ enum { { XFS_IO_OVERWRITE, "overwrite" } /* - * xfs_ioend struct manages large extent writes for XFS. - * It can manage several multi-page bio's at once. + * Structure for buffered I/O completions. 
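+ *
+ * io_inline_bio is allocated together with the ioend from a bioset, so
+ * completion code can get from bio back to ioend with, e.g.:
+ *
+ *	ioend = container_of(bio, struct xfs_ioend, io_inline_bio);
+ *
+ * It must stay last because the bio's inline bio_vec array lives
+ * directly behind it.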
*/ -typedef struct xfs_ioend { +struct xfs_ioend { struct list_head io_list; /* next ioend in chain */ unsigned int io_type; /* delalloc / unwritten */ - int io_error; /* I/O error code */ - atomic_t io_remaining; /* hold count */ struct inode *io_inode; /* file being written to */ - struct buffer_head *io_buffer_head;/* buffer linked list head */ - struct buffer_head *io_buffer_tail;/* buffer linked list tail */ size_t io_size; /* size of the extent */ xfs_off_t io_offset; /* offset in the file */ struct work_struct io_work; /* xfsdatad work queue */ struct xfs_trans *io_append_trans;/* xact. for size update */ -} xfs_ioend_t; + struct bio *io_bio; /* bio being built */ + struct bio io_inline_bio; /* MUST BE LAST! */ +}; extern const struct address_space_operations xfs_address_space_operations; @@ -63,6 +60,9 @@ int xfs_get_blocks_direct(struct inode *inode, sector_t offset, int xfs_get_blocks_dax_fault(struct inode *inode, sector_t offset, struct buffer_head *map_bh, int create); +int xfs_end_io_direct_write(struct kiocb *iocb, loff_t offset, + ssize_t size, void *private); + extern void xfs_count_page_state(struct page *, int *, int *); extern struct block_device *xfs_find_bdev_for_inode(struct inode *); diff --git a/fs/xfs/xfs_attr.h b/fs/xfs/xfs_attr.h index dd4824589470..e3da5d448bcf 100644 --- a/fs/xfs/xfs_attr.h +++ b/fs/xfs/xfs_attr.h @@ -112,8 +112,9 @@ typedef struct attrlist_cursor_kern { *========================================================================*/ +/* Return 0 on success, or -errno; other state communicated via *context */ typedef int (*put_listent_func_t)(struct xfs_attr_list_context *, int, - unsigned char *, int, int, unsigned char *); + unsigned char *, int, int); typedef struct xfs_attr_list_context { struct xfs_inode *dp; /* inode */ @@ -126,7 +127,6 @@ typedef struct xfs_attr_list_context { int firstu; /* first used byte in buffer */ int flags; /* from VOP call */ int resynch; /* T/F: resynch with cursor */ - int put_value; /* T/F: need value for listent */ put_listent_func_t put_listent; /* list output fmt function */ int index; /* index into output buffer */ } xfs_attr_list_context_t; diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c index 2bb959ada45b..be0b79d8900f 100644 --- a/fs/xfs/xfs_attr_inactive.c +++ b/fs/xfs/xfs_attr_inactive.c @@ -322,7 +322,7 @@ xfs_attr3_node_inactive( * Recurse (gasp!) through the attribute nodes until we find leaves. * We're doing a depth-first traversal in order to invalidate everything. */ -int +static int xfs_attr3_root_inactive( struct xfs_trans **trans, struct xfs_inode *dp) @@ -405,21 +405,11 @@ xfs_attr_inactive( goto out_destroy_fork; xfs_iunlock(dp, lock_mode); - /* - * Start our first transaction of the day. - * - * All future transactions during this code must be "chained" off - * this one via the trans_dup() call. All transactions will contain - * the inode, and the inode will always be marked with trans_ihold(). - * Since the inode will be locked in all transactions, we must log - * the inode in every transaction to let it float upward through - * the log. 
- */ lock_mode = 0; - trans = xfs_trans_alloc(mp, XFS_TRANS_ATTRINVAL); - error = xfs_trans_reserve(trans, &M_RES(mp)->tr_attrinval, 0, 0); + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_attrinval, 0, 0, 0, &trans); if (error) - goto out_cancel; + goto out_destroy_fork; lock_mode = XFS_ILOCK_EXCL; xfs_ilock(dp, lock_mode); diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c index 4fa14820e2e2..25e76cd6c053 100644 --- a/fs/xfs/xfs_attr_list.c +++ b/fs/xfs/xfs_attr_list.c @@ -65,7 +65,7 @@ xfs_attr_shortform_compare(const void *a, const void *b) * we have to calculate each entries' hashvalue and sort them before * we can begin returning them to the user. */ -int +static int xfs_attr_shortform_list(xfs_attr_list_context_t *context) { attrlist_cursor_kern_t *cursor; @@ -106,18 +106,15 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) sfe->flags, sfe->nameval, (int)sfe->namelen, - (int)sfe->valuelen, - &sfe->nameval[sfe->namelen]); - + (int)sfe->valuelen); + if (error) + return error; /* * Either search callback finished early or * didn't fit it all in the buffer after all. */ if (context->seen_enough) break; - - if (error) - return error; sfe = XFS_ATTR_SF_NEXTENTRY(sfe); } trace_xfs_attr_list_sf_all(context); @@ -200,8 +197,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) sbp->flags, sbp->name, sbp->namelen, - sbp->valuelen, - &sbp->name[sbp->namelen]); + sbp->valuelen); if (error) { kmem_free(sbuf); return error; @@ -416,6 +412,9 @@ xfs_attr3_leaf_list_int( */ retval = 0; for (; i < ichdr.count; entry++, i++) { + char *name; + int namelen, valuelen; + if (be32_to_cpu(entry->hashval) != cursor->hashval) { cursor->hashval = be32_to_cpu(entry->hashval); cursor->offset = 0; @@ -425,56 +424,25 @@ xfs_attr3_leaf_list_int( continue; /* skip incomplete entries */ if (entry->flags & XFS_ATTR_LOCAL) { - xfs_attr_leaf_name_local_t *name_loc = - xfs_attr3_leaf_name_local(leaf, i); - - retval = context->put_listent(context, - entry->flags, - name_loc->nameval, - (int)name_loc->namelen, - be16_to_cpu(name_loc->valuelen), - &name_loc->nameval[name_loc->namelen]); - if (retval) - return retval; + xfs_attr_leaf_name_local_t *name_loc; + + name_loc = xfs_attr3_leaf_name_local(leaf, i); + name = name_loc->nameval; + namelen = name_loc->namelen; + valuelen = be16_to_cpu(name_loc->valuelen); } else { - xfs_attr_leaf_name_remote_t *name_rmt = - xfs_attr3_leaf_name_remote(leaf, i); - - int valuelen = be32_to_cpu(name_rmt->valuelen); - - if (context->put_value) { - xfs_da_args_t args; - - memset((char *)&args, 0, sizeof(args)); - args.geo = context->dp->i_mount->m_attr_geo; - args.dp = context->dp; - args.whichfork = XFS_ATTR_FORK; - args.valuelen = valuelen; - args.rmtvaluelen = valuelen; - args.value = kmem_alloc(valuelen, KM_SLEEP | KM_NOFS); - args.rmtblkno = be32_to_cpu(name_rmt->valueblk); - args.rmtblkcnt = xfs_attr3_rmt_blocks( - args.dp->i_mount, valuelen); - retval = xfs_attr_rmtval_get(&args); - if (!retval) - retval = context->put_listent(context, - entry->flags, - name_rmt->name, - (int)name_rmt->namelen, - valuelen, - args.value); - kmem_free(args.value); - } else { - retval = context->put_listent(context, - entry->flags, - name_rmt->name, - (int)name_rmt->namelen, - valuelen, - NULL); - } - if (retval) - return retval; + xfs_attr_leaf_name_remote_t *name_rmt; + + name_rmt = xfs_attr3_leaf_name_remote(leaf, i); + name = name_rmt->name; + namelen = name_rmt->namelen; + valuelen = be32_to_cpu(name_rmt->valuelen); } + + retval = context->put_listent(context, 
entry->flags, + name, namelen, valuelen); + if (retval) + break; if (context->seen_enough) break; cursor->offset++; @@ -551,8 +519,7 @@ xfs_attr_put_listent( int flags, unsigned char *name, int namelen, - int valuelen, - unsigned char *value) + int valuelen) { struct attrlist *alist = (struct attrlist *)context->alist; attrlist_ent_t *aep; @@ -581,7 +548,7 @@ xfs_attr_put_listent( trace_xfs_attr_list_full(context); alist->al_more = 1; context->seen_enough = 1; - return 1; + return 0; } aep = (attrlist_ent_t *)&context->alist[context->firstu]; diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 3b6309865c65..cd4a850564f2 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -72,18 +72,28 @@ xfs_zero_extent( struct xfs_mount *mp = ip->i_mount; xfs_daddr_t sector = xfs_fsb_to_db(ip, start_fsb); sector_t block = XFS_BB_TO_FSBT(mp, sector); - ssize_t size = XFS_FSB_TO_B(mp, count_fsb); - if (IS_DAX(VFS_I(ip))) - return dax_clear_sectors(xfs_find_bdev_for_inode(VFS_I(ip)), - sector, size); - - /* - * let the block layer decide on the fastest method of - * implementing the zeroing. - */ - return sb_issue_zeroout(mp->m_super, block, count_fsb, GFP_NOFS); + return blkdev_issue_zeroout(xfs_find_bdev_for_inode(VFS_I(ip)), + block << (mp->m_super->s_blocksize_bits - 9), + count_fsb << (mp->m_super->s_blocksize_bits - 9), + GFP_NOFS, true); +} +/* Sort bmap items by AG. */ +static int +xfs_bmap_free_list_cmp( + void *priv, + struct list_head *a, + struct list_head *b) +{ + struct xfs_mount *mp = priv; + struct xfs_bmap_free_item *ra; + struct xfs_bmap_free_item *rb; + + ra = container_of(a, struct xfs_bmap_free_item, xbfi_list); + rb = container_of(b, struct xfs_bmap_free_item, xbfi_list); + return XFS_FSB_TO_AGNO(mp, ra->xbfi_startblock) - + XFS_FSB_TO_AGNO(mp, rb->xbfi_startblock); } /* @@ -106,14 +116,15 @@ xfs_bmap_finish( int error; /* error return value */ int committed;/* xact committed or not */ struct xfs_bmap_free_item *free; /* free extent item */ - struct xfs_bmap_free_item *next; /* next item on free list */ ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES); if (flist->xbf_count == 0) return 0; + list_sort((*tp)->t_mountp, &flist->xbf_flist, xfs_bmap_free_list_cmp); + efi = xfs_trans_get_efi(*tp, flist->xbf_count); - for (free = flist->xbf_first; free; free = free->xbfi_next) + list_for_each_entry(free, &flist->xbf_flist, xbfi_list) xfs_trans_log_efi_extent(*tp, efi, free->xbfi_startblock, free->xbfi_blockcount); @@ -132,9 +143,7 @@ xfs_bmap_finish( if (committed) { xfs_efi_release(efi); xfs_force_shutdown((*tp)->t_mountp, - (error == -EFSCORRUPTED) ? - SHUTDOWN_CORRUPT_INCORE : - SHUTDOWN_META_IO_ERROR); + SHUTDOWN_META_IO_ERROR); } return error; } @@ -145,15 +154,15 @@ xfs_bmap_finish( * on error. */ efd = xfs_trans_get_efd(*tp, efi, flist->xbf_count); - for (free = flist->xbf_first; free != NULL; free = next) { - next = free->xbfi_next; - + while (!list_empty(&flist->xbf_flist)) { + free = list_first_entry(&flist->xbf_flist, + struct xfs_bmap_free_item, xbfi_list); error = xfs_trans_free_extent(*tp, efd, free->xbfi_startblock, free->xbfi_blockcount); if (error) return error; - xfs_bmap_del_free(flist, NULL, free); + xfs_bmap_del_free(flist, free); } return 0; @@ -416,7 +425,7 @@ xfs_bmap_count_tree( /* * Count fsblocks of the given fork. 
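* For example, a data fork holding extents of 4, 8 and 4 blocks comes
* back with *count == 16.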
*/ -int /* error */ +static int /* error */ xfs_bmap_count_blocks( xfs_trans_t *tp, /* transaction pointer */ xfs_inode_t *ip, /* incore inode */ @@ -806,7 +815,7 @@ xfs_bmap_punch_delalloc_range( if (error) break; - ASSERT(!flist.xbf_count && !flist.xbf_first); + ASSERT(!flist.xbf_count && list_empty(&flist.xbf_flist)); next_block: start_fsb++; remaining--; @@ -900,19 +909,15 @@ xfs_free_eofblocks( * Free them up now by truncating the file to * its current size. */ - tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE); - if (need_iolock) { - if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { - xfs_trans_cancel(tp); + if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) return -EAGAIN; - } } - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, + &tp); if (error) { ASSERT(XFS_FORCED_SHUTDOWN(mp)); - xfs_trans_cancel(tp); if (need_iolock) xfs_iunlock(ip, XFS_IOLOCK_EXCL); return error; @@ -1037,9 +1042,9 @@ xfs_alloc_file_space( /* * Allocate and setup the transaction. */ - tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, - resblks, resrtextents); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, + resrtextents, 0, &tp); + /* * Check for running out of space */ @@ -1048,7 +1053,6 @@ xfs_alloc_file_space( * Free the transaction structure. */ ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp)); - xfs_trans_cancel(tp); break; } xfs_ilock(ip, XFS_ILOCK_EXCL); @@ -1101,99 +1105,120 @@ error1: /* Just cancel transaction */ return error; } -/* - * Zero file bytes between startoff and endoff inclusive. - * The iolock is held exclusive and no blocks are buffered. - * - * This function is used by xfs_free_file_space() to zero - * partial blocks when the range to free is not block aligned. - * When unreserving space with boundaries that are not block - * aligned we round up the start and round down the end - * boundaries and then use this function to zero the parts of - * the blocks that got dropped during the rounding. - */ -STATIC int -xfs_zero_remaining_bytes( - xfs_inode_t *ip, - xfs_off_t startoff, - xfs_off_t endoff) +static int +xfs_unmap_extent( + struct xfs_inode *ip, + xfs_fileoff_t startoffset_fsb, + xfs_filblks_t len_fsb, + int *done) { - xfs_bmbt_irec_t imap; - xfs_fileoff_t offset_fsb; - xfs_off_t lastoffset; - xfs_off_t offset; - xfs_buf_t *bp; - xfs_mount_t *mp = ip->i_mount; - int nimap; - int error = 0; + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + struct xfs_bmap_free free_list; + xfs_fsblock_t firstfsb; + uint resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); + int error; - /* - * Avoid doing I/O beyond eof - it's not necessary - * since nothing can read beyond eof. The space will - * be zeroed when the file is extended anyway. 
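/*
 * The conversions in the hunks above (and throughout this series) collapse
 * the old xfs_trans_alloc() + xfs_trans_reserve() + xfs_trans_cancel()-on-
 * failure dance into a single call that only hands back a transaction on
 * success. A generic userspace sketch of that constructor shape, with a
 * fake "reservation" standing in for the log/space reserve; names are
 * illustrative, not the kernel API:
 */
#include <errno.h>
#include <stdlib.h>

struct trans {
    unsigned int blocks;        /* reserved block count */
};

static int reserve_blocks(unsigned int blocks)
{
    /* stand-in for the real reservation; fail absurd requests */
    return blocks > (1u << 20) ? -ENOSPC : 0;
}

/* one-shot constructor: no half-built transaction ever escapes */
static int trans_alloc(unsigned int blocks, struct trans **tpp)
{
    struct trans *tp;
    int error;

    *tpp = NULL;
    tp = malloc(sizeof(*tp));
    if (!tp)
        return -ENOMEM;

    error = reserve_blocks(blocks);
    if (error) {
        free(tp);       /* the "cancel" now lives in exactly one place */
        return error;
    }

    tp->blocks = blocks;
    *tpp = tp;
    return 0;
}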
- */ - if (startoff >= XFS_ISIZE(ip)) - return 0; + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp); + if (error) { + ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp)); + return error; + } - if (endoff > XFS_ISIZE(ip)) - endoff = XFS_ISIZE(ip); + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_trans_reserve_quota(tp, mp, ip->i_udquot, ip->i_gdquot, + ip->i_pdquot, resblks, 0, XFS_QMOPT_RES_REGBLKS); + if (error) + goto out_trans_cancel; - for (offset = startoff; offset <= endoff; offset = lastoffset + 1) { - uint lock_mode; + xfs_trans_ijoin(tp, ip, 0); - offset_fsb = XFS_B_TO_FSBT(mp, offset); - nimap = 1; + xfs_bmap_init(&free_list, &firstfsb); + error = xfs_bunmapi(tp, ip, startoffset_fsb, len_fsb, 0, 2, &firstfsb, + &free_list, done); + if (error) + goto out_bmap_cancel; - lock_mode = xfs_ilock_data_map_shared(ip); - error = xfs_bmapi_read(ip, offset_fsb, 1, &imap, &nimap, 0); - xfs_iunlock(ip, lock_mode); + error = xfs_bmap_finish(&tp, &free_list, NULL); + if (error) + goto out_bmap_cancel; - if (error || nimap < 1) - break; - ASSERT(imap.br_blockcount >= 1); - ASSERT(imap.br_startoff == offset_fsb); - ASSERT(imap.br_startblock != DELAYSTARTBLOCK); + error = xfs_trans_commit(tp); +out_unlock: + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; - if (imap.br_startblock == HOLESTARTBLOCK || - imap.br_state == XFS_EXT_UNWRITTEN) { - /* skip the entire extent */ - lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + - imap.br_blockcount) - 1; - continue; - } +out_bmap_cancel: + xfs_bmap_cancel(&free_list); +out_trans_cancel: + xfs_trans_cancel(tp); + goto out_unlock; +} - lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1; - if (lastoffset > endoff) - lastoffset = endoff; +static int +xfs_adjust_extent_unmap_boundaries( + struct xfs_inode *ip, + xfs_fileoff_t *startoffset_fsb, + xfs_fileoff_t *endoffset_fsb) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_bmbt_irec imap; + int nimap, error; + xfs_extlen_t mod = 0; - /* DAX can just zero the backing device directly */ - if (IS_DAX(VFS_I(ip))) { - error = dax_zero_page_range(VFS_I(ip), offset, - lastoffset - offset + 1, - xfs_get_blocks_direct); - if (error) - return error; - continue; - } + nimap = 1; + error = xfs_bmapi_read(ip, *startoffset_fsb, 1, &imap, &nimap, 0); + if (error) + return error; - error = xfs_buf_read_uncached(XFS_IS_REALTIME_INODE(ip) ? 
- mp->m_rtdev_targp : mp->m_ddev_targp, - xfs_fsb_to_db(ip, imap.br_startblock), - BTOBB(mp->m_sb.sb_blocksize), - 0, &bp, NULL); - if (error) - return error; + if (nimap && imap.br_startblock != HOLESTARTBLOCK) { + xfs_daddr_t block; - memset(bp->b_addr + - (offset - XFS_FSB_TO_B(mp, imap.br_startoff)), - 0, lastoffset - offset + 1); + ASSERT(imap.br_startblock != DELAYSTARTBLOCK); + block = imap.br_startblock; + mod = do_div(block, mp->m_sb.sb_rextsize); + if (mod) + *startoffset_fsb += mp->m_sb.sb_rextsize - mod; + } - error = xfs_bwrite(bp); - xfs_buf_relse(bp); - if (error) - return error; + nimap = 1; + error = xfs_bmapi_read(ip, *endoffset_fsb - 1, 1, &imap, &nimap, 0); + if (error) + return error; + + if (nimap && imap.br_startblock != HOLESTARTBLOCK) { + ASSERT(imap.br_startblock != DELAYSTARTBLOCK); + mod++; + if (mod && mod != mp->m_sb.sb_rextsize) + *endoffset_fsb -= mod; } - return error; + + return 0; +} + +static int +xfs_flush_unmap_range( + struct xfs_inode *ip, + xfs_off_t offset, + xfs_off_t len) +{ + struct xfs_mount *mp = ip->i_mount; + struct inode *inode = VFS_I(ip); + xfs_off_t rounding, start, end; + int error; + + /* wait for the completion of any pending DIOs */ + inode_dio_wait(inode); + + rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_SIZE); + start = round_down(offset, rounding); + end = round_up(offset + len, rounding) - 1; + + error = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (error) + return error; + truncate_pagecache_range(inode, start, end); + return 0; } int @@ -1202,24 +1227,10 @@ xfs_free_file_space( xfs_off_t offset, xfs_off_t len) { - int done; - xfs_fileoff_t endoffset_fsb; - int error; - xfs_fsblock_t firstfsb; - xfs_bmap_free_t free_list; - xfs_bmbt_irec_t imap; - xfs_off_t ioffset; - xfs_off_t iendoffset; - xfs_extlen_t mod=0; - xfs_mount_t *mp; - int nimap; - uint resblks; - xfs_off_t rounding; - int rt; + struct xfs_mount *mp = ip->i_mount; xfs_fileoff_t startoffset_fsb; - xfs_trans_t *tp; - - mp = ip->i_mount; + xfs_fileoff_t endoffset_fsb; + int done = 0, error; trace_xfs_free_file_space(ip); @@ -1227,143 +1238,45 @@ xfs_free_file_space( if (error) return error; - error = 0; if (len <= 0) /* if nothing being freed */ - return error; - rt = XFS_IS_REALTIME_INODE(ip); - startoffset_fsb = XFS_B_TO_FSB(mp, offset); - endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len); - - /* wait for the completion of any pending DIOs */ - inode_dio_wait(VFS_I(ip)); + return 0; - rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_SIZE); - ioffset = round_down(offset, rounding); - iendoffset = round_up(offset + len, rounding) - 1; - error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, ioffset, - iendoffset); + error = xfs_flush_unmap_range(ip, offset, len); if (error) - goto out; - truncate_pagecache_range(VFS_I(ip), ioffset, iendoffset); + return error; + + startoffset_fsb = XFS_B_TO_FSB(mp, offset); + endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len); /* - * Need to zero the stuff we're not freeing, on disk. - * If it's a realtime file & can't use unwritten extents then we - * actually need to zero the extent edges. Otherwise xfs_bunmapi - * will take care of it for us. + * Need to zero the stuff we're not freeing, on disk. If it's a RT file + * and we can't use unwritten extents then we actually need to zero the + * whole extent, otherwise we just need to take care of block + * boundaries, and xfs_bunmapi will handle the rest.
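/*
 * The xfs_flush_unmap_range() helper above rounds the byte range out to
 * the larger of the filesystem block size and the page size before writing
 * back and dropping the page cache, so no partially covered page or block
 * survives with stale data. A standalone sketch of just that rounding,
 * assuming a 4k page and an illustrative 1k block size:
 */
#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE 4096u

static uint64_t round_down_p2(uint64_t x, uint64_t align)
{
    return x & ~(align - 1);            /* align must be a power of two */
}

static uint64_t round_up_p2(uint64_t x, uint64_t align)
{
    return (x + align - 1) & ~(align - 1);
}

int main(void)
{
    uint64_t blocksize = 1024;          /* 1 << sb_blocklog */
    uint64_t offset = 5000, len = 200;
    uint64_t rounding, start, end;

    rounding = blocksize > PAGE_SIZE ? blocksize : PAGE_SIZE;
    start = round_down_p2(offset, rounding);
    end = round_up_p2(offset + len, rounding) - 1;  /* inclusive end */

    /* prints 4096..8191 for the 5000..5199 byte range */
    printf("flush+invalidate %llu..%llu\n",
           (unsigned long long)start, (unsigned long long)end);
    return 0;
}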
*/ - if (rt && !xfs_sb_version_hasextflgbit(&mp->m_sb)) { - nimap = 1; - error = xfs_bmapi_read(ip, startoffset_fsb, 1, - &imap, &nimap, 0); - if (error) - goto out; - ASSERT(nimap == 0 || nimap == 1); - if (nimap && imap.br_startblock != HOLESTARTBLOCK) { - xfs_daddr_t block; - - ASSERT(imap.br_startblock != DELAYSTARTBLOCK); - block = imap.br_startblock; - mod = do_div(block, mp->m_sb.sb_rextsize); - if (mod) - startoffset_fsb += mp->m_sb.sb_rextsize - mod; - } - nimap = 1; - error = xfs_bmapi_read(ip, endoffset_fsb - 1, 1, - &imap, &nimap, 0); + if (XFS_IS_REALTIME_INODE(ip) && + !xfs_sb_version_hasextflgbit(&mp->m_sb)) { + error = xfs_adjust_extent_unmap_boundaries(ip, &startoffset_fsb, + &endoffset_fsb); if (error) - goto out; - ASSERT(nimap == 0 || nimap == 1); - if (nimap && imap.br_startblock != HOLESTARTBLOCK) { - ASSERT(imap.br_startblock != DELAYSTARTBLOCK); - mod++; - if (mod && (mod != mp->m_sb.sb_rextsize)) - endoffset_fsb -= mod; - } - } - if ((done = (endoffset_fsb <= startoffset_fsb))) - /* - * One contiguous piece to clear - */ - error = xfs_zero_remaining_bytes(ip, offset, offset + len - 1); - else { - /* - * Some full blocks, possibly two pieces to clear - */ - if (offset < XFS_FSB_TO_B(mp, startoffset_fsb)) - error = xfs_zero_remaining_bytes(ip, offset, - XFS_FSB_TO_B(mp, startoffset_fsb) - 1); - if (!error && - XFS_FSB_TO_B(mp, endoffset_fsb) < offset + len) - error = xfs_zero_remaining_bytes(ip, - XFS_FSB_TO_B(mp, endoffset_fsb), - offset + len - 1); + return error; } - /* - * free file space until done or until there is an error - */ - resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); - while (!error && !done) { - - /* - * allocate and setup the transaction. Allow this - * transaction to dip into the reserve blocks to ensure - * the freeing of the space succeeds at ENOSPC. - */ - tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, resblks, 0); - - /* - * check for running out of space - */ - if (error) { - /* - * Free the transaction structure. - */ - ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp)); - xfs_trans_cancel(tp); - break; + if (endoffset_fsb > startoffset_fsb) { + while (!done) { + error = xfs_unmap_extent(ip, startoffset_fsb, + endoffset_fsb - startoffset_fsb, &done); + if (error) + return error; } - xfs_ilock(ip, XFS_ILOCK_EXCL); - error = xfs_trans_reserve_quota(tp, mp, - ip->i_udquot, ip->i_gdquot, ip->i_pdquot, - resblks, 0, XFS_QMOPT_RES_REGBLKS); - if (error) - goto error1; - - xfs_trans_ijoin(tp, ip, 0); - - /* - * issue the bunmapi() call to free the blocks - */ - xfs_bmap_init(&free_list, &firstfsb); - error = xfs_bunmapi(tp, ip, startoffset_fsb, - endoffset_fsb - startoffset_fsb, - 0, 2, &firstfsb, &free_list, &done); - if (error) - goto error0; - - /* - * complete the transaction - */ - error = xfs_bmap_finish(&tp, &free_list, NULL); - if (error) - goto error0; - - error = xfs_trans_commit(tp); - xfs_iunlock(ip, XFS_ILOCK_EXCL); } - out: - return error; - - error0: - xfs_bmap_cancel(&free_list); - error1: - xfs_trans_cancel(tp); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - goto out; + /* + * Now that we've unmapped all full blocks we'll have to zero out any + * partial block at the beginning and/or end. xfs_zero_range is + * smart enough to skip any holes, including those we just created.
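/*
 * Sketch of the byte-to-block split the rewritten xfs_free_file_space()
 * relies on: XFS_B_TO_FSB() rounds the start offset up and XFS_B_TO_FSBT()
 * truncates the end offset down, so [start_fsb, end_fsb) covers exactly
 * the whole blocks inside the byte range. Those get unmapped; the partial
 * edges are zeroed instead. Assumes a power-of-two block size.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t b_to_fsb(uint64_t bytes, unsigned int blocklog)
{
    return (bytes + (1ull << blocklog) - 1) >> blocklog; /* round up */
}

static uint64_t b_to_fsbt(uint64_t bytes, unsigned int blocklog)
{
    return bytes >> blocklog;                            /* truncate */
}

int main(void)
{
    unsigned int blocklog = 12;         /* 4096-byte blocks */
    uint64_t offset = 5000, len = 20000;
    uint64_t start_fsb = b_to_fsb(offset, blocklog);
    uint64_t end_fsb = b_to_fsbt(offset + len, blocklog);

    if (end_fsb > start_fsb)            /* blocks 2..5 get unmapped */
        printf("unmap blocks [%llu, %llu)\n",
               (unsigned long long)start_fsb,
               (unsigned long long)end_fsb);
    /* bytes 5000..8191 and 24576..24999 are zeroed, not unmapped */
    return 0;
}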
+ */ + return xfs_zero_range(ip, offset, len, NULL); } /* @@ -1482,19 +1395,16 @@ xfs_shift_file_space( } while (!error && !done) { - tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); /* * We would need to reserve permanent block for transaction. * This will come into picture when after shifting extent into * hole we found that adjacent extents can be merged which * may lead to freeing of a block during record update. */ - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, - XFS_DIOSTRAT_SPACE_RES(mp, 0), 0); - if (error) { - xfs_trans_cancel(tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, + XFS_DIOSTRAT_SPACE_RES(mp, 0), 0, 0, &tp); + if (error) break; - } xfs_ilock(ip, XFS_ILOCK_EXCL); error = xfs_trans_reserve_quota(tp, mp, ip->i_udquot, @@ -1747,12 +1657,9 @@ xfs_swap_extents( if (error) goto out_unlock; - tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0); - if (error) { - xfs_trans_cancel(tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp); + if (error) goto out_unlock; - } /* * Lock and join the inodes to the transaction so that transaction commit diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h index af97d9a1dfb4..f20071432ca6 100644 --- a/fs/xfs/xfs_bmap_util.h +++ b/fs/xfs/xfs_bmap_util.h @@ -31,8 +31,6 @@ struct xfs_bmalloca; int xfs_bmap_rtalloc(struct xfs_bmalloca *ap); int xfs_bmap_eof(struct xfs_inode *ip, xfs_fileoff_t endoff, int whichfork, int *eof); -int xfs_bmap_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip, - int whichfork, int *count); int xfs_bmap_punch_delalloc_range(struct xfs_inode *ip, xfs_fileoff_t start_fsb, xfs_fileoff_t length); @@ -43,7 +41,6 @@ int xfs_getbmap(struct xfs_inode *ip, struct getbmapx *bmv, /* functions in xfs_bmap.c that are only needed by xfs_bmap_util.c */ void xfs_bmap_del_free(struct xfs_bmap_free *flist, - struct xfs_bmap_free_item *prev, struct xfs_bmap_free_item *free); int xfs_bmap_extsize_align(struct xfs_mount *mp, struct xfs_bmbt_irec *gotp, struct xfs_bmbt_irec *prevp, xfs_extlen_t extsz, diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 9a2191b91137..47a318ce82e0 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -80,6 +80,47 @@ xfs_buf_vmap_len( } /* + * Bump the I/O in flight count on the buftarg if we haven't yet done so for + * this buffer. The count is incremented once per buffer (per hold cycle) + * because the corresponding decrement is deferred to buffer release. Buffers + * can undergo I/O multiple times in a hold-release cycle and per buffer I/O + * tracking adds unnecessary overhead. This is used for synchronization purposes + * with unmount (see xfs_wait_buftarg()), so all we really need is a count of + * in-flight buffers. + * + * Buffers that are never released (e.g., superblock, iclog buffers) must set + * the XBF_NO_IOACCT flag before I/O submission. Otherwise, the buftarg count + * never reaches zero and unmount hangs indefinitely. + */ +static inline void +xfs_buf_ioacct_inc( + struct xfs_buf *bp) +{ + if (bp->b_flags & (XBF_NO_IOACCT|_XBF_IN_FLIGHT)) + return; + + ASSERT(bp->b_flags & XBF_ASYNC); + bp->b_flags |= _XBF_IN_FLIGHT; + percpu_counter_inc(&bp->b_target->bt_io_count); +} + +/* + * Clear the in-flight state on a buffer about to be released to the LRU or + * freed and unaccount from the buftarg.
+ */ +static inline void +xfs_buf_ioacct_dec( + struct xfs_buf *bp) +{ + if (!(bp->b_flags & _XBF_IN_FLIGHT)) + return; + + ASSERT(bp->b_flags & XBF_ASYNC); + bp->b_flags &= ~_XBF_IN_FLIGHT; + percpu_counter_dec(&bp->b_target->bt_io_count); +} + +/* * When we mark a buffer stale, we remove the buffer from the LRU and clear the * b_lru_ref count so that the buffer is freed immediately when the buffer * reference count falls to zero. If the buffer is already on the LRU, we need @@ -102,6 +143,14 @@ xfs_buf_stale( */ bp->b_flags &= ~_XBF_DELWRI_Q; + /* + * Once the buffer is marked stale and unlocked, a subsequent lookup + * could reset b_flags. There is no guarantee that the buffer is + * unaccounted (released to LRU) before that occurs. Drop in-flight + * status now to preserve accounting consistency. + */ + xfs_buf_ioacct_dec(bp); + spin_lock(&bp->b_lock); atomic_set(&bp->b_lru_ref, 0); if (!(bp->b_state & XFS_BSTATE_DISPOSE) && @@ -815,7 +864,8 @@ xfs_buf_get_uncached( struct xfs_buf *bp; DEFINE_SINGLE_BUF_MAP(map, XFS_BUF_DADDR_NULL, numblks); - bp = _xfs_buf_alloc(target, &map, 1, 0); + /* flags might contain irrelevant bits, pass only what we care about */ + bp = _xfs_buf_alloc(target, &map, 1, flags & XBF_NO_IOACCT); if (unlikely(bp == NULL)) goto fail; @@ -866,63 +916,85 @@ xfs_buf_hold( } /* - * Releases a hold on the specified buffer. If the - * the hold count is 1, calls xfs_buf_free. + * Release a hold on the specified buffer. If the hold count is 1, the buffer is + * placed on LRU or freed (depending on b_lru_ref). */ void xfs_buf_rele( xfs_buf_t *bp) { struct xfs_perag *pag = bp->b_pag; + bool release; + bool freebuf = false; trace_xfs_buf_rele(bp, _RET_IP_); if (!pag) { ASSERT(list_empty(&bp->b_lru)); ASSERT(RB_EMPTY_NODE(&bp->b_rbnode)); - if (atomic_dec_and_test(&bp->b_hold)) + if (atomic_dec_and_test(&bp->b_hold)) { + xfs_buf_ioacct_dec(bp); xfs_buf_free(bp); + } return; } ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode)); ASSERT(atomic_read(&bp->b_hold) > 0); - if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) { - spin_lock(&bp->b_lock); - if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) { - /* - * If the buffer is added to the LRU take a new - * reference to the buffer for the LRU and clear the - * (now stale) dispose list state flag - */ - if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) { - bp->b_state &= ~XFS_BSTATE_DISPOSE; - atomic_inc(&bp->b_hold); - } - spin_unlock(&bp->b_lock); - spin_unlock(&pag->pag_buf_lock); - } else { - /* - * most of the time buffers will already be removed from - * the LRU, so optimise that case by checking for the - * XFS_BSTATE_DISPOSE flag indicating the last list the - * buffer was on was the disposal list - */ - if (!(bp->b_state & XFS_BSTATE_DISPOSE)) { - list_lru_del(&bp->b_target->bt_lru, &bp->b_lru); - } else { - ASSERT(list_empty(&bp->b_lru)); - } - spin_unlock(&bp->b_lock); - ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); - rb_erase(&bp->b_rbnode, &pag->pag_buf_tree); - spin_unlock(&pag->pag_buf_lock); - xfs_perag_put(pag); - xfs_buf_free(bp); + release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock); + spin_lock(&bp->b_lock); + if (!release) { + /* + * Drop the in-flight state if the buffer is already on the LRU + * and it holds the only reference. This is racy because we + * haven't acquired the pag lock, but the use of _XBF_IN_FLIGHT + * ensures the decrement occurs only once per-buf. 
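/*
 * Userspace sketch of the in-flight accounting introduced above: a shared
 * counter approximates the buftarg percpu counter, and a per-buffer
 * IN_FLIGHT flag guarantees each buffer contributes at most one count per
 * hold-release cycle no matter how many I/Os it runs. Types and flag names
 * are illustrative stand-ins, not the kernel's.
 */
#include <stdatomic.h>
#include <stdbool.h>

enum { BUF_ASYNC = 1 << 0, BUF_NO_IOACCT = 1 << 1, BUF_IN_FLIGHT = 1 << 2 };

struct buf { unsigned int flags; };

static atomic_long io_count;    /* percpu_counter stand-in */

static void buf_ioacct_inc(struct buf *bp)
{
    if (bp->flags & (BUF_NO_IOACCT | BUF_IN_FLIGHT))
        return;                 /* opted out, or already counted */
    bp->flags |= BUF_IN_FLIGHT;
    atomic_fetch_add(&io_count, 1);
}

static void buf_ioacct_dec(struct buf *bp)
{
    if (!(bp->flags & BUF_IN_FLIGHT))
        return;
    bp->flags &= ~BUF_IN_FLIGHT;
    atomic_fetch_sub(&io_count, 1);
}

/* unmount-style quiesce check: all in-flight buffers have been released */
static bool io_quiesced(void)
{
    return atomic_load(&io_count) == 0;
}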
+ */ + if ((atomic_read(&bp->b_hold) == 1) && !list_empty(&bp->b_lru)) + xfs_buf_ioacct_dec(bp); + goto out_unlock; + } + + /* the last reference has been dropped ... */ + xfs_buf_ioacct_dec(bp); + if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) { + /* + * If the buffer is added to the LRU take a new reference to the + * buffer for the LRU and clear the (now stale) dispose list + * state flag + */ + if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) { + bp->b_state &= ~XFS_BSTATE_DISPOSE; + atomic_inc(&bp->b_hold); } + spin_unlock(&pag->pag_buf_lock); + } else { + /* + * most of the time buffers will already be removed from the + * LRU, so optimise that case by checking for the + * XFS_BSTATE_DISPOSE flag indicating the last list the buffer + * was on was the disposal list + */ + if (!(bp->b_state & XFS_BSTATE_DISPOSE)) { + list_lru_del(&bp->b_target->bt_lru, &bp->b_lru); + } else { + ASSERT(list_empty(&bp->b_lru)); + } + + ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); + rb_erase(&bp->b_rbnode, &pag->pag_buf_tree); + spin_unlock(&pag->pag_buf_lock); + xfs_perag_put(pag); + freebuf = true; } + +out_unlock: + spin_unlock(&bp->b_lock); + + if (freebuf) + xfs_buf_free(bp); } @@ -944,10 +1016,12 @@ xfs_buf_trylock( int locked; locked = down_trylock(&bp->b_sema) == 0; - if (locked) + if (locked) { XB_SET_OWNER(bp); - - trace_xfs_buf_trylock(bp, _RET_IP_); + trace_xfs_buf_trylock(bp, _RET_IP_); + } else { + trace_xfs_buf_trylock_fail(bp, _RET_IP_); + } return locked; } @@ -1100,22 +1174,18 @@ xfs_bwrite( return error; } -STATIC void +static void xfs_buf_bio_end_io( struct bio *bio) { - xfs_buf_t *bp = (xfs_buf_t *)bio->bi_private; + struct xfs_buf *bp = (struct xfs_buf *)bio->bi_private; /* * don't overwrite existing errors - otherwise we can lose errors on * buffers that require multiple bios to complete. 
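/*
 * The comment above describes keeping only the first bio error for a
 * buffer that needs multiple bios; the conversion just below replaces the
 * open-coded spinlock with a single compare-and-swap from zero. A C11
 * rendering of that first-writer-wins rule, with a file-scope error slot
 * standing in for b_io_error:
 */
#include <stdatomic.h>

static _Atomic int io_error;    /* 0 = no error recorded yet */

static void record_first_error(int error)
{
    int expected = 0;

    if (!error)
        return;
    /* only succeeds if no error has been stored; later errors lose */
    atomic_compare_exchange_strong(&io_error, &expected, error);
}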
*/ - if (bio->bi_error) { - spin_lock(&bp->b_lock); - if (!bp->b_io_error) - bp->b_io_error = bio->bi_error; - spin_unlock(&bp->b_lock); - } + if (bio->bi_error) + cmpxchg(&bp->b_io_error, 0, bio->bi_error); if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ)) invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp)); @@ -1131,7 +1201,8 @@ xfs_buf_ioapply_map( int map, int *buf_offset, int *count, - int rw) + int op, + int op_flags) { int page_index; int total_nr_pages = bp->b_page_count; @@ -1161,16 +1232,14 @@ xfs_buf_ioapply_map( next_chunk: atomic_inc(&bp->b_io_remaining); - nr_pages = BIO_MAX_SECTORS >> (PAGE_SHIFT - BBSHIFT); - if (nr_pages > total_nr_pages) - nr_pages = total_nr_pages; + nr_pages = min(total_nr_pages, BIO_MAX_PAGES); bio = bio_alloc(GFP_NOIO, nr_pages); bio->bi_bdev = bp->b_target->bt_bdev; bio->bi_iter.bi_sector = sector; bio->bi_end_io = xfs_buf_bio_end_io; bio->bi_private = bp; - + bio_set_op_attrs(bio, op, op_flags); for (; size && nr_pages; nr_pages--, page_index++) { int rbytes, nbytes = PAGE_SIZE - offset; @@ -1194,7 +1263,7 @@ next_chunk: flush_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp)); } - submit_bio(rw, bio); + submit_bio(bio); if (size) goto next_chunk; } else { @@ -1214,7 +1283,8 @@ _xfs_buf_ioapply( struct xfs_buf *bp) { struct blk_plug plug; - int rw; + int op; + int op_flags = 0; int offset; int size; int i; @@ -1233,14 +1303,13 @@ _xfs_buf_ioapply( bp->b_ioend_wq = bp->b_target->bt_mount->m_buf_workqueue; if (bp->b_flags & XBF_WRITE) { + op = REQ_OP_WRITE; if (bp->b_flags & XBF_SYNCIO) - rw = WRITE_SYNC; - else - rw = WRITE; + op_flags = WRITE_SYNC; if (bp->b_flags & XBF_FUA) - rw |= REQ_FUA; + op_flags |= REQ_FUA; if (bp->b_flags & XBF_FLUSH) - rw |= REQ_FLUSH; + op_flags |= REQ_PREFLUSH; /* * Run the write verifier callback function if it exists. If @@ -1270,13 +1339,14 @@ _xfs_buf_ioapply( } } } else if (bp->b_flags & XBF_READ_AHEAD) { - rw = READA; + op = REQ_OP_READ; + op_flags = REQ_RAHEAD; } else { - rw = READ; + op = REQ_OP_READ; } /* we only use the buffer cache for meta-data */ - rw |= REQ_META; + op_flags |= REQ_META; /* * Walk all the vectors issuing IO on them. Set up the initial offset @@ -1288,7 +1358,7 @@ _xfs_buf_ioapply( size = BBTOB(bp->b_io_length); blk_start_plug(&plug); for (i = 0; i < bp->b_map_count; i++) { - xfs_buf_ioapply_map(bp, i, &offset, &size, rw); + xfs_buf_ioapply_map(bp, i, &offset, &size, op, op_flags); if (bp->b_error) break; if (size <= 0) @@ -1343,6 +1413,7 @@ xfs_buf_submit( * xfs_buf_ioend too early. */ atomic_set(&bp->b_io_remaining, 1); + xfs_buf_ioacct_inc(bp); _xfs_buf_ioapply(bp); /* @@ -1528,13 +1599,19 @@ xfs_wait_buftarg( int loop = 0; /* - * We need to flush the buffer workqueue to ensure that all IO - * completion processing is 100% done. Just waiting on buffer locks is - * not sufficient for async IO as the reference count held over IO is - * not released until after the buffer lock is dropped. Hence we need to - * ensure here that all reference counts have been dropped before we - * start walking the LRU list. + * First wait on the buftarg I/O count for all in-flight buffers to be + * released. This is critical as new buffers do not make the LRU until + * they are released. + * + * Next, flush the buffer workqueue to ensure all completion processing + * has finished. Just waiting on buffer locks is not sufficient for + * async IO as the reference count held over IO is not released until + * after the buffer lock is dropped. 
Hence we need to ensure here that + * all reference counts have been dropped before we start walking the + * LRU list. */ + while (percpu_counter_sum(&btp->bt_io_count)) + delay(100); drain_workqueue(btp->bt_mount->m_buf_workqueue); /* loop until there is nothing left on the lru list. */ @@ -1631,6 +1708,8 @@ xfs_free_buftarg( struct xfs_buftarg *btp) { unregister_shrinker(&btp->bt_shrinker); + ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0); + percpu_counter_destroy(&btp->bt_io_count); list_lru_destroy(&btp->bt_lru); if (mp->m_flags & XFS_MOUNT_BARRIER) @@ -1695,6 +1774,9 @@ xfs_alloc_buftarg( if (list_lru_init(&btp->bt_lru)) goto error; + if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL)) + goto error; + btp->bt_shrinker.count_objects = xfs_buftarg_shrink_count; btp->bt_shrinker.scan_objects = xfs_buftarg_shrink_scan; btp->bt_shrinker.seeks = DEFAULT_SEEKS; @@ -1778,18 +1860,33 @@ xfs_buf_cmp( return 0; } +/* + * submit buffers for write. + * + * When we have a large buffer list, we do not want to hold all the buffers + * locked while we block on the request queue waiting for IO dispatch. To avoid + * this problem, we lock and submit buffers in groups of 50, thereby minimising + * the lock hold times for lists which may contain thousands of objects. + * + * To do this, we sort the buffer list before we walk the list to lock and + * submit buffers, and we plug and unplug around each group of buffers we + * submit. + */ static int -__xfs_buf_delwri_submit( +xfs_buf_delwri_submit_buffers( struct list_head *buffer_list, - struct list_head *io_list, - bool wait) + struct list_head *wait_list) { - struct blk_plug plug; struct xfs_buf *bp, *n; + LIST_HEAD (submit_list); int pinned = 0; + struct blk_plug plug; + list_sort(NULL, buffer_list, xfs_buf_cmp); + + blk_start_plug(&plug); list_for_each_entry_safe(bp, n, buffer_list, b_list) { - if (!wait) { + if (!wait_list) { if (xfs_buf_ispinned(bp)) { pinned++; continue; @@ -1812,25 +1909,21 @@ __xfs_buf_delwri_submit( continue; } - list_move_tail(&bp->b_list, io_list); trace_xfs_buf_delwri_split(bp, _RET_IP_); - } - - list_sort(NULL, io_list, xfs_buf_cmp); - - blk_start_plug(&plug); - list_for_each_entry_safe(bp, n, io_list, b_list) { - bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC | XBF_WRITE_FAIL); - bp->b_flags |= XBF_WRITE | XBF_ASYNC; /* - * we do all Io submission async. This means if we need to wait - * for IO completion we need to take an extra reference so the - * buffer is still valid on the other side. + * We do all IO submission async. This means if we need + * to wait for IO completion we need to take an extra + * reference so the buffer is still valid on the other + * side. We need to move the buffer onto the io_list + * at this point so the caller can still access it. 
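/*
 * The submission loop around this point follows a
 * submit-with-optional-wait-list shape: every buffer goes out async; when
 * the caller passes a wait list the buffer takes an extra reference and is
 * moved there so it stays valid for the later wait loop, otherwise it is
 * dropped from the local list immediately. A miniature model with
 * hypothetical types (array in place of the kernel list):
 */
#include <stdbool.h>
#include <stddef.h>

struct mbuf {
    int  refcount;
    bool queued;
};

static void submit_async(struct mbuf *bp) { (void)bp; /* fire and forget */ }

static void submit_buffers(struct mbuf **bufs, size_t n,
                           struct mbuf **wait_list, size_t *nwait)
{
    for (size_t i = 0; i < n; i++) {
        struct mbuf *bp = bufs[i];

        bp->queued = false;             /* leaving the delwri queue */
        if (wait_list) {
            bp->refcount++;             /* keep bp alive for the waiter */
            wait_list[(*nwait)++] = bp;
        }
        submit_async(bp);               /* may complete at any time */
    }
}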
*/ - if (wait) + bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_WRITE_FAIL); + bp->b_flags |= XBF_WRITE | XBF_ASYNC; + if (wait_list) { xfs_buf_hold(bp); - else + list_move_tail(&bp->b_list, wait_list); + } else list_del_init(&bp->b_list); xfs_buf_submit(bp); @@ -1853,8 +1946,7 @@ int xfs_buf_delwri_submit_nowait( struct list_head *buffer_list) { - LIST_HEAD (io_list); - return __xfs_buf_delwri_submit(buffer_list, &io_list, false); + return xfs_buf_delwri_submit_buffers(buffer_list, NULL); } /* @@ -1869,15 +1961,15 @@ int xfs_buf_delwri_submit( struct list_head *buffer_list) { - LIST_HEAD (io_list); + LIST_HEAD (wait_list); int error = 0, error2; struct xfs_buf *bp; - __xfs_buf_delwri_submit(buffer_list, &io_list, true); + xfs_buf_delwri_submit_buffers(buffer_list, &wait_list); /* Wait for IO to complete. */ - while (!list_empty(&io_list)) { - bp = list_first_entry(&io_list, struct xfs_buf, b_list); + while (!list_empty(&wait_list)) { + bp = list_first_entry(&wait_list, struct xfs_buf, b_list); list_del_init(&bp->b_list); diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 4eb89bd4ee73..1c2e52b2d926 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -43,6 +43,7 @@ typedef enum { #define XBF_READ (1 << 0) /* buffer intended for reading from device */ #define XBF_WRITE (1 << 1) /* buffer intended for writing to device */ #define XBF_READ_AHEAD (1 << 2) /* asynchronous read-ahead */ +#define XBF_NO_IOACCT (1 << 3) /* bypass I/O accounting (non-LRU bufs) */ #define XBF_ASYNC (1 << 4) /* initiator will not wait for completion */ #define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */ #define XBF_STALE (1 << 6) /* buffer has been staled, do not find it */ @@ -62,6 +63,7 @@ typedef enum { #define _XBF_KMEM (1 << 21)/* backed by heap memory */ #define _XBF_DELWRI_Q (1 << 22)/* buffer on a delwri queue */ #define _XBF_COMPOUND (1 << 23)/* compound buffer */ +#define _XBF_IN_FLIGHT (1 << 25) /* I/O in flight, for accounting purposes */ typedef unsigned int xfs_buf_flags_t; @@ -81,7 +83,8 @@ typedef unsigned int xfs_buf_flags_t; { _XBF_PAGES, "PAGES" }, \ { _XBF_KMEM, "KMEM" }, \ { _XBF_DELWRI_Q, "DELWRI_Q" }, \ - { _XBF_COMPOUND, "COMPOUND" } + { _XBF_COMPOUND, "COMPOUND" }, \ + { _XBF_IN_FLIGHT, "IN_FLIGHT" } /* @@ -115,6 +118,8 @@ typedef struct xfs_buftarg { /* LRU control structures */ struct shrinker bt_shrinker; struct list_lru bt_lru; + + struct percpu_counter bt_io_count; } xfs_buftarg_t; struct xfs_buf; @@ -183,6 +188,26 @@ typedef struct xfs_buf { unsigned int b_page_count; /* size of page array */ unsigned int b_offset; /* page offset in first page */ int b_error; /* error code on I/O */ + + /* + * async write failure retry count. Initialised to zero on the first + * failure, then when it exceeds the maximum configured without a + * success the write is considered to be failed permanently and the + * iodone handler will take appropriate action. + * + * For retry timeouts, we record the jiffie of the first failure. This + * means that we can change the retry timeout for buffers already under + * I/O and thus avoid getting stuck in a retry loop with a long timeout. + * + * last_error is used to ensure that we are getting repeated errors, not + * different errors. e.g. a block device might change ENOSPC to EIO when + * a failure timeout occurs, so we want to re-initialise the error + * retry behaviour appropriately when that happens. 
+ */ + int b_retries; + unsigned long b_first_retry_time; /* in jiffies */ + int b_last_error; + const struct xfs_buf_ops *b_ops; #ifdef XFS_BUF_LOCK_TRACKING diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 99e91a0e554e..e455f9098d49 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -359,7 +359,7 @@ xfs_buf_item_format( for (i = 0; i < bip->bli_format_count; i++) { xfs_buf_item_format_segment(bip, lv, &vecp, offset, &bip->bli_formats[i]); - offset += bp->b_maps[i].bm_len; + offset += BBTOB(bp->b_maps[i].bm_len); } /* @@ -915,20 +915,28 @@ xfs_buf_item_log( for (i = 0; i < bip->bli_format_count; i++) { if (start > last) break; - end = start + BBTOB(bp->b_maps[i].bm_len); + end = start + BBTOB(bp->b_maps[i].bm_len) - 1; + + /* skip to the map that includes the first byte to log */ if (first > end) { start += BBTOB(bp->b_maps[i].bm_len); continue; } + + /* + * Trim the range to this segment and mark it in the bitmap. + * Note that we must convert buffer offsets to segment relative + * offsets (e.g., the first byte of each segment is byte 0 of + * that segment). + */ if (first < start) first = start; if (end > last) end = last; - - xfs_buf_item_log_segment(first, end, + xfs_buf_item_log_segment(first - start, end - start, &bip->bli_formats[i].blf_data_map[0]); - start += bp->b_maps[i].bm_len; + start += BBTOB(bp->b_maps[i].bm_len); } } @@ -949,6 +957,7 @@ xfs_buf_item_free( xfs_buf_log_item_t *bip) { xfs_buf_item_free_format(bip); + kmem_free(bip->bli_item.li_lv_shadow); kmem_zone_free(xfs_buf_item_zone, bip); } @@ -1042,35 +1051,22 @@ xfs_buf_do_callbacks( } } -/* - * This is the iodone() function for buffers which have had callbacks - * attached to them by xfs_buf_attach_iodone(). It should remove each - * log item from the buffer's list and call the callback of each in turn. - * When done, the buffer's fsprivate field is set to NULL and the buffer - * is unlocked with a call to iodone(). - */ -void -xfs_buf_iodone_callbacks( +static bool +xfs_buf_iodone_callback_error( struct xfs_buf *bp) { struct xfs_log_item *lip = bp->b_fspriv; struct xfs_mount *mp = lip->li_mountp; static ulong lasttime; static xfs_buftarg_t *lasttarg; - - if (likely(!bp->b_error)) - goto do_callbacks; + struct xfs_error_cfg *cfg; /* * If we've already decided to shutdown the filesystem because of * I/O errors, there's no point in giving this a retry. */ - if (XFS_FORCED_SHUTDOWN(mp)) { - xfs_buf_stale(bp); - bp->b_flags |= XBF_DONE; - trace_xfs_buf_item_iodone(bp, _RET_IP_); - goto do_callbacks; - } + if (XFS_FORCED_SHUTDOWN(mp)) + goto out_stale; if (bp->b_target != lasttarg || time_after(jiffies, (lasttime + 5*HZ))) { @@ -1079,45 +1075,93 @@ xfs_buf_iodone_callbacks( } lasttarg = bp->b_target; + /* synchronous writes will have callers process the error */ + if (!(bp->b_flags & XBF_ASYNC)) + goto out_stale; + + trace_xfs_buf_item_iodone_async(bp, _RET_IP_); + ASSERT(bp->b_iodone != NULL); + + cfg = xfs_error_get_cfg(mp, XFS_ERR_METADATA, bp->b_error); + /* * If the write was asynchronous then no one will be looking for the - * error. Clear the error state and write the buffer out again. - * - * XXX: This helps against transient write errors, but we need to find - * a way to shut the filesystem down if the writes keep failing. - * - * In practice we'll shut the filesystem down soon as non-transient - * errors tend to affect the whole device and a failing log write - * will make us give up. But we really ought to do better here. + * error. 
If this is the first failure of this type, clear the error + * state and write the buffer out again. This means we always retry an + * async write failure at least once, but we also need to set the buffer + * up to behave correctly now for repeated failures. */ - if (bp->b_flags & XBF_ASYNC) { - ASSERT(bp->b_iodone != NULL); + if (!(bp->b_flags & (XBF_STALE | XBF_WRITE_FAIL)) || + bp->b_last_error != bp->b_error) { + bp->b_flags |= (XBF_WRITE | XBF_DONE | XBF_WRITE_FAIL); + bp->b_last_error = bp->b_error; + if (cfg->retry_timeout && !bp->b_first_retry_time) + bp->b_first_retry_time = jiffies; + + xfs_buf_ioerror(bp, 0); + xfs_buf_submit(bp); + return true; + } - trace_xfs_buf_item_iodone_async(bp, _RET_IP_); + /* + * Repeated failure on an async write. Take action according to the + * error configuration we have been set up to use. + */ - xfs_buf_ioerror(bp, 0); /* errno of 0 unsets the flag */ + if (cfg->max_retries != XFS_ERR_RETRY_FOREVER && + ++bp->b_retries > cfg->max_retries) + goto permanent_error; + if (cfg->retry_timeout && + time_after(jiffies, cfg->retry_timeout + bp->b_first_retry_time)) + goto permanent_error; - if (!(bp->b_flags & (XBF_STALE|XBF_WRITE_FAIL))) { - bp->b_flags |= XBF_WRITE | XBF_ASYNC | - XBF_DONE | XBF_WRITE_FAIL; - xfs_buf_submit(bp); - } else { - xfs_buf_relse(bp); - } + /* At unmount we may treat errors differently */ + if ((mp->m_flags & XFS_MOUNT_UNMOUNTING) && mp->m_fail_unmount) + goto permanent_error; - return; - } + /* still a transient error, higher layers will retry */ + xfs_buf_ioerror(bp, 0); + xfs_buf_relse(bp); + return true; /* - * If the write of the buffer was synchronous, we want to make - * sure to return the error to the caller of xfs_bwrite(). + * Permanent error - we need to trigger a shutdown if we haven't already + * to indicate that inconsistency will result from this action. */ +permanent_error: + xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); +out_stale: xfs_buf_stale(bp); bp->b_flags |= XBF_DONE; - trace_xfs_buf_error_relse(bp, _RET_IP_); + return false; +} + +/* + * This is the iodone() function for buffers which have had callbacks attached + * to them by xfs_buf_attach_iodone(). We need to iterate the items on the + * callback list, mark the buffer as having no more callbacks and then push the + * buffer through IO completion processing. + */ +void +xfs_buf_iodone_callbacks( + struct xfs_buf *bp) +{ + /* + * If there is an error, process it. Some errors require us + * to run callbacks after failure processing is done so we + * detect that and take appropriate action. + */ + if (bp->b_error && xfs_buf_iodone_callback_error(bp)) + return; + + /* + * Successful IO or permanent error. Either way, we can clear the + * retry state here in preparation for the next error that may occur. 
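/*
 * A condensed userspace model of the async-write retry policy implemented
 * above, assuming a config of the same shape (max_retries, retry_timeout)
 * with -1 modelling XFS_ERR_RETRY_FOREVER. A new errno always earns one
 * fresh retry; after that the count and the first-failure timestamp decide
 * when the error becomes permanent. The kernel version additionally
 * consults buffer flags and unmount state that this sketch folds away.
 */
#include <stdbool.h>
#include <time.h>

struct err_cfg {
    int    max_retries;         /* -1: retry forever */
    time_t retry_timeout;       /* 0: no time limit */
};

struct buf_err_state {
    int    retries;
    int    last_error;
    time_t first_retry_time;
};

/* true: resubmit the write; false: treat the failure as permanent */
static bool should_retry(struct buf_err_state *st, int error,
                         const struct err_cfg *cfg, time_t now)
{
    if (error != st->last_error) {
        /* new failure type: reset state, always retry at least once */
        st->last_error = error;
        st->retries = 0;
        st->first_retry_time = cfg->retry_timeout ? now : 0;
        return true;
    }
    if (cfg->max_retries != -1 && ++st->retries > cfg->max_retries)
        return false;
    if (cfg->retry_timeout &&
        now > st->first_retry_time + cfg->retry_timeout)
        return false;
    return true;
}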
+ */ + bp->b_last_error = 0; + bp->b_retries = 0; -do_callbacks: xfs_buf_do_callbacks(bp); bp->b_fspriv = NULL; bp->b_iodone = NULL; diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 316b2a1bdba5..ccb0811963b2 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -74,6 +74,7 @@ xfs_qm_dqdestroy( { ASSERT(list_empty(&dqp->q_lru)); + kmem_free(dqp->q_logitem.qli_item.li_lv_shadow); mutex_destroy(&dqp->q_qlock); XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot); @@ -614,11 +615,10 @@ xfs_qm_dqread( trace_xfs_dqread(dqp); if (flags & XFS_QMOPT_DQALLOC) { - tp = xfs_trans_alloc(mp, XFS_TRANS_QM_DQALLOC); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_dqalloc, - XFS_QM_DQALLOC_SPACE_RES(mp), 0); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_dqalloc, + XFS_QM_DQALLOC_SPACE_RES(mp), 0, 0, &tp); if (error) - goto error1; + goto error0; } /* @@ -692,7 +692,7 @@ error0: * end of the chunk, skip ahead to first id in next allocated chunk * using the SEEK_DATA interface. */ -int +static int xfs_dq_get_next_id( xfs_mount_t *mp, uint type, diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c index 814cff94e78f..2c7a1629e064 100644 --- a/fs/xfs/xfs_dquot_item.c +++ b/fs/xfs/xfs_dquot_item.c @@ -370,6 +370,8 @@ xfs_qm_qoffend_logitem_committed( spin_lock(&ailp->xa_lock); xfs_trans_ail_delete(ailp, &qfs->qql_item, SHUTDOWN_LOG_IO_ERROR); + kmem_free(qfs->qql_item.li_lv_shadow); + kmem_free(lip->li_lv_shadow); kmem_free(qfs); kmem_free(qfe); return (xfs_lsn_t)-1; diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index 88693a98fac5..ed7ee4e8af73 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -55,12 +55,15 @@ xfs_error_test(int error_tag, int *fsidp, char *expression, } int -xfs_errortag_add(int error_tag, xfs_mount_t *mp) +xfs_errortag_add(unsigned int error_tag, xfs_mount_t *mp) { int i; int len; int64_t fsid; + if (error_tag >= XFS_ERRTAG_MAX) + return -EINVAL; + memcpy(&fsid, mp->m_fixedfsid, sizeof(xfs_fsid_t)); for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) { diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index 4ed3042a0f16..2e4f67f68856 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -128,7 +128,7 @@ extern int xfs_error_test(int, int *, char *, int, char *, unsigned long); xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \ (rf)))) -extern int xfs_errortag_add(int error_tag, struct xfs_mount *mp); +extern int xfs_errortag_add(unsigned int error_tag, struct xfs_mount *mp); extern int xfs_errortag_clearall(struct xfs_mount *mp, int loud); #else #define XFS_TEST_ERROR(expr, mp, tag, rf) (expr) diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 4aa0153214f9..ab779460ecbf 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -40,6 +40,7 @@ void xfs_efi_item_free( struct xfs_efi_log_item *efip) { + kmem_free(efip->efi_item.li_lv_shadow); if (efip->efi_format.efi_nextents > XFS_EFI_MAX_FAST_EXTENTS) kmem_free(efip); else @@ -300,6 +301,7 @@ static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip) STATIC void xfs_efd_item_free(struct xfs_efd_log_item *efdp) { + kmem_free(efdp->efd_item.li_lv_shadow); if (efdp->efd_format.efd_nextents > XFS_EFD_MAX_FAST_EXTENTS) kmem_free(efdp); else diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 85ce3032f815..ed95e5bb04e6 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -37,6 +37,7 @@ #include "xfs_log.h" #include "xfs_icache.h" #include "xfs_pnfs.h" +#include "xfs_iomap.h" #include <linux/dcache.h> #include <linux/falloc.h> @@ 
-80,61 +81,17 @@ xfs_rw_ilock_demote( } /* - * xfs_iozero clears the specified range supplied via the page cache (except in - * the DAX case). Writes through the page cache will allocate blocks over holes, - * though the callers usually map the holes first and avoid them. If a block is - * not completely zeroed, then it will be read from disk before being partially - * zeroed. - * - * In the DAX case, we can just directly write to the underlying pages. This - * will not allocate blocks, but will avoid holes and unwritten extents and so - * not do unnecessary work. + * Clear the specified ranges to zero through either the pagecache or DAX. + * Holes and unwritten extents will be left as-is as they already are zeroed. */ int -xfs_iozero( - struct xfs_inode *ip, /* inode */ - loff_t pos, /* offset in file */ - size_t count) /* size of data to zero */ +xfs_zero_range( + struct xfs_inode *ip, + xfs_off_t pos, + xfs_off_t count, + bool *did_zero) { - struct page *page; - struct address_space *mapping; - int status = 0; - - - mapping = VFS_I(ip)->i_mapping; - do { - unsigned offset, bytes; - void *fsdata; - - offset = (pos & (PAGE_SIZE -1)); /* Within page */ - bytes = PAGE_SIZE - offset; - if (bytes > count) - bytes = count; - - if (IS_DAX(VFS_I(ip))) { - status = dax_zero_page_range(VFS_I(ip), pos, bytes, - xfs_get_blocks_direct); - if (status) - break; - } else { - status = pagecache_write_begin(NULL, mapping, pos, bytes, - AOP_FLAG_UNINTERRUPTIBLE, - &page, &fsdata); - if (status) - break; - - zero_user(page, offset, bytes); - - status = pagecache_write_end(NULL, mapping, pos, bytes, - bytes, page, fsdata); - WARN_ON(status <= 0); /* can't return less than zero! */ - status = 0; - } - pos += bytes; - count -= bytes; - } while (count); - - return status; + return iomap_zero_range(VFS_I(ip), pos, count, NULL, &xfs_iomap_ops); } int @@ -145,12 +102,10 @@ xfs_update_prealloc_flags( struct xfs_trans *tp; int error; - tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_WRITEID); - error = xfs_trans_reserve(tp, &M_RES(ip->i_mount)->tr_writeid, 0, 0); - if (error) { - xfs_trans_cancel(tp); + error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_writeid, + 0, 0, 0, &tp); + if (error) return error; - } xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); @@ -284,48 +239,35 @@ xfs_file_fsync( } STATIC ssize_t -xfs_file_read_iter( +xfs_file_dio_aio_read( struct kiocb *iocb, struct iov_iter *to) { - struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; + struct address_space *mapping = iocb->ki_filp->f_mapping; + struct inode *inode = mapping->host; struct xfs_inode *ip = XFS_I(inode); - struct xfs_mount *mp = ip->i_mount; - size_t size = iov_iter_count(to); + loff_t isize = i_size_read(inode); + size_t count = iov_iter_count(to); + struct iov_iter data; + struct xfs_buftarg *target; ssize_t ret = 0; - int ioflags = 0; - xfs_fsize_t n; - loff_t pos = iocb->ki_pos; - XFS_STATS_INC(mp, xs_read_calls); + trace_xfs_file_direct_read(ip, count, iocb->ki_pos); - if (unlikely(iocb->ki_flags & IOCB_DIRECT)) - ioflags |= XFS_IO_ISDIRECT; - if (file->f_mode & FMODE_NOCMTIME) - ioflags |= XFS_IO_INVIS; - - if ((ioflags & XFS_IO_ISDIRECT) && !IS_DAX(inode)) { - xfs_buftarg_t *target = - XFS_IS_REALTIME_INODE(ip) ? 
- mp->m_rtdev_targp : mp->m_ddev_targp; - /* DIO must be aligned to device logical sector size */ - if ((pos | size) & target->bt_logical_sectormask) { - if (pos == i_size_read(inode)) - return 0; - return -EINVAL; - } - } + if (!count) + return 0; /* skip atime */ - n = mp->m_super->s_maxbytes - pos; - if (n <= 0 || size == 0) - return 0; - - if (n < size) - size = n; + if (XFS_IS_REALTIME_INODE(ip)) + target = ip->i_mount->m_rtdev_targp; + else + target = ip->i_mount->m_ddev_targp; - if (XFS_FORCED_SHUTDOWN(mp)) - return -EIO; + /* DIO must be aligned to device logical sector size */ + if ((iocb->ki_pos | count) & target->bt_logical_sectormask) { + if (iocb->ki_pos == isize) + return 0; + return -EINVAL; + } /* * Locking is a bit tricky here. If we take an exclusive lock for direct @@ -338,7 +280,7 @@ xfs_file_read_iter( * serialisation. */ xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); - if ((ioflags & XFS_IO_ISDIRECT) && inode->i_mapping->nrpages) { + if (mapping->nrpages) { xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); xfs_rw_ilock(ip, XFS_IOLOCK_EXCL); @@ -353,8 +295,8 @@ xfs_file_read_iter( * flush and reduce the chances of repeated iolock cycles going * forward. */ - if (inode->i_mapping->nrpages) { - ret = filemap_write_and_wait(VFS_I(ip)->i_mapping); + if (mapping->nrpages) { + ret = filemap_write_and_wait(mapping); if (ret) { xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL); return ret; @@ -365,20 +307,95 @@ xfs_file_read_iter( * we fail to invalidate a page, but this should never * happen on XFS. Warn if it does fail. */ - ret = invalidate_inode_pages2(VFS_I(ip)->i_mapping); + ret = invalidate_inode_pages2(mapping); WARN_ON_ONCE(ret); ret = 0; } xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); } - trace_xfs_file_read(ip, size, pos, ioflags); + data = *to; + ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data, + xfs_get_blocks_direct, NULL, NULL, 0); + if (ret > 0) { + iocb->ki_pos += ret; + iov_iter_advance(to, ret); + } + xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); + + file_accessed(iocb->ki_filp); + return ret; +} + +static noinline ssize_t +xfs_file_dax_read( + struct kiocb *iocb, + struct iov_iter *to) +{ + struct address_space *mapping = iocb->ki_filp->f_mapping; + struct inode *inode = mapping->host; + struct xfs_inode *ip = XFS_I(inode); + struct iov_iter data = *to; + size_t count = iov_iter_count(to); + ssize_t ret = 0; + + trace_xfs_file_dax_read(ip, count, iocb->ki_pos); + + if (!count) + return 0; /* skip atime */ + + xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); + ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct, NULL, 0); + if (ret > 0) { + iocb->ki_pos += ret; + iov_iter_advance(to, ret); + } + xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); + + file_accessed(iocb->ki_filp); + return ret; +} + +STATIC ssize_t +xfs_file_buffered_aio_read( + struct kiocb *iocb, + struct iov_iter *to) +{ + struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp)); + ssize_t ret; + + trace_xfs_file_buffered_read(ip, iov_iter_count(to), iocb->ki_pos); + xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); ret = generic_file_read_iter(iocb, to); + xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); + + return ret; +} + +STATIC ssize_t +xfs_file_read_iter( + struct kiocb *iocb, + struct iov_iter *to) +{ + struct inode *inode = file_inode(iocb->ki_filp); + struct xfs_mount *mp = XFS_I(inode)->i_mount; + ssize_t ret = 0; + + XFS_STATS_INC(mp, xs_read_calls); + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + if (IS_DAX(inode)) + ret = xfs_file_dax_read(iocb, to); + else if (iocb->ki_flags & IOCB_DIRECT) + ret = xfs_file_dio_aio_read(iocb, to); + else + ret 
= xfs_file_buffered_aio_read(iocb, to); + if (ret > 0) XFS_STATS_ADD(mp, xs_read_bytes, ret); - - xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); return ret; } @@ -391,18 +408,14 @@ xfs_file_splice_read( unsigned int flags) { struct xfs_inode *ip = XFS_I(infilp->f_mapping->host); - int ioflags = 0; ssize_t ret; XFS_STATS_INC(ip->i_mount, xs_read_calls); - if (infilp->f_mode & FMODE_NOCMTIME) - ioflags |= XFS_IO_INVIS; - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) return -EIO; - trace_xfs_file_splice_read(ip, count, *ppos, ioflags); + trace_xfs_file_splice_read(ip, count, *ppos); /* * DAX inodes cannot use the page cache for splice, so we have to push @@ -426,49 +439,6 @@ out: } /* - * This routine is called to handle zeroing any space in the last block of the - * file that is beyond the EOF. We do this since the size is being increased - * without writing anything to that block and we don't want to read the - * garbage on the disk. - */ -STATIC int /* error (positive) */ -xfs_zero_last_block( - struct xfs_inode *ip, - xfs_fsize_t offset, - xfs_fsize_t isize, - bool *did_zeroing) -{ - struct xfs_mount *mp = ip->i_mount; - xfs_fileoff_t last_fsb = XFS_B_TO_FSBT(mp, isize); - int zero_offset = XFS_B_FSB_OFFSET(mp, isize); - int zero_len; - int nimaps = 1; - int error = 0; - struct xfs_bmbt_irec imap; - - xfs_ilock(ip, XFS_ILOCK_EXCL); - error = xfs_bmapi_read(ip, last_fsb, 1, &imap, &nimaps, 0); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - if (error) - return error; - - ASSERT(nimaps > 0); - - /* - * If the block underlying isize is just a hole, then there - * is nothing to zero. - */ - if (imap.br_startblock == HOLESTARTBLOCK) - return 0; - - zero_len = mp->m_sb.sb_blocksize - zero_offset; - if (isize + zero_len > offset) - zero_len = offset - isize; - *did_zeroing = true; - return xfs_iozero(ip, isize, zero_len); -} - -/* * Zero any on disk space between the current EOF and the new, larger EOF. * * This handles the normal case of zeroing the remainder of the last block in @@ -486,94 +456,11 @@ xfs_zero_eof( xfs_fsize_t isize, /* current inode size */ bool *did_zeroing) { - struct xfs_mount *mp = ip->i_mount; - xfs_fileoff_t start_zero_fsb; - xfs_fileoff_t end_zero_fsb; - xfs_fileoff_t zero_count_fsb; - xfs_fileoff_t last_fsb; - xfs_fileoff_t zero_off; - xfs_fsize_t zero_len; - int nimaps; - int error = 0; - struct xfs_bmbt_irec imap; - ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); ASSERT(offset > isize); trace_xfs_zero_eof(ip, isize, offset - isize); - - /* - * First handle zeroing the block on which isize resides. - * - * We only zero a part of that block so it is handled specially. - */ - if (XFS_B_FSB_OFFSET(mp, isize) != 0) { - error = xfs_zero_last_block(ip, offset, isize, did_zeroing); - if (error) - return error; - } - - /* - * Calculate the range between the new size and the old where blocks - * needing to be zeroed may exist. - * - * To get the block where the last byte in the file currently resides, - * we need to subtract one from the size and truncate back to a block - * boundary. We subtract 1 in case the size is exactly on a block - * boundary. - */ - last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1; - start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize); - end_zero_fsb = XFS_B_TO_FSBT(mp, offset - 1); - ASSERT((xfs_sfiloff_t)last_fsb < (xfs_sfiloff_t)start_zero_fsb); - if (last_fsb == end_zero_fsb) { - /* - * The size was only incremented on its last block. - * We took care of that above, so just return.
- */ - return 0; - } - - ASSERT(start_zero_fsb <= end_zero_fsb); - while (start_zero_fsb <= end_zero_fsb) { - nimaps = 1; - zero_count_fsb = end_zero_fsb - start_zero_fsb + 1; - - xfs_ilock(ip, XFS_ILOCK_EXCL); - error = xfs_bmapi_read(ip, start_zero_fsb, zero_count_fsb, - &imap, &nimaps, 0); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - if (error) - return error; - - ASSERT(nimaps > 0); - - if (imap.br_state == XFS_EXT_UNWRITTEN || - imap.br_startblock == HOLESTARTBLOCK) { - start_zero_fsb = imap.br_startoff + imap.br_blockcount; - ASSERT(start_zero_fsb <= (end_zero_fsb + 1)); - continue; - } - - /* - * There are blocks we need to zero. - */ - zero_off = XFS_FSB_TO_B(mp, start_zero_fsb); - zero_len = XFS_FSB_TO_B(mp, imap.br_blockcount); - - if ((zero_off + zero_len) > offset) - zero_len = offset - zero_off; - - error = xfs_iozero(ip, zero_off, zero_len); - if (error) - return error; - - *did_zeroing = true; - start_zero_fsb = imap.br_startoff + imap.br_blockcount; - ASSERT(start_zero_fsb <= (end_zero_fsb + 1)); - } - - return 0; + return xfs_zero_range(ip, isize, offset - isize, did_zeroing); } /* @@ -724,8 +611,7 @@ xfs_file_dio_aio_write( mp->m_rtdev_targp : mp->m_ddev_targp; /* DIO must be aligned to device logical sector size */ - if (!IS_DAX(inode) && - ((iocb->ki_pos | count) & target->bt_logical_sectormask)) + if ((iocb->ki_pos | count) & target->bt_logical_sectormask) return -EINVAL; /* "unaligned" here means not aligned to a filesystem block */ @@ -764,7 +650,7 @@ xfs_file_dio_aio_write( end = iocb->ki_pos + count - 1; /* - * See xfs_file_read_iter() for why we do a full-file flush here. + * See xfs_file_dio_aio_read() for why we do a full-file flush here. */ if (mapping->nrpages) { ret = filemap_write_and_wait(VFS_I(ip)->i_mapping); @@ -791,10 +677,12 @@ xfs_file_dio_aio_write( iolock = XFS_IOLOCK_SHARED; } - trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0); + trace_xfs_file_direct_write(ip, count, iocb->ki_pos); data = *from; - ret = mapping->a_ops->direct_IO(iocb, &data); + ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data, + xfs_get_blocks_direct, xfs_end_io_direct_write, + NULL, DIO_ASYNC_EXTEND); /* see generic_file_direct_write() for why this is necessary */ if (mapping->nrpages) { @@ -811,10 +699,70 @@ out: xfs_rw_iunlock(ip, iolock); /* - * No fallback to buffered IO on errors for XFS. DAX can result in - * partial writes, but direct IO will either complete fully or fail. + * No fallback to buffered IO on errors for XFS, direct IO will either + * complete fully or fail. 
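/*
 * Both direct I/O paths above reject requests that are not aligned to the
 * device logical sector size with a single mask test: OR-ing offset and
 * length together means one test catches a misalignment in either value.
 * A standalone rendering of that check:
 */
#include <stdbool.h>
#include <stdint.h>

/* mask = logical sector size - 1, e.g. 511 for 512-byte sectors */
static bool dio_aligned(uint64_t pos, uint64_t count, uint64_t sectormask)
{
    return ((pos | count) & sectormask) == 0;
}

/* dio_aligned(1024, 4096, 511) -> true; dio_aligned(1000, 512, 511) -> false */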
*/ - ASSERT(ret < 0 || ret == count || IS_DAX(VFS_I(ip))); + ASSERT(ret < 0 || ret == count); + return ret; +} + +static noinline ssize_t +xfs_file_dax_write( + struct kiocb *iocb, + struct iov_iter *from) +{ + struct address_space *mapping = iocb->ki_filp->f_mapping; + struct inode *inode = mapping->host; + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + ssize_t ret = 0; + int unaligned_io = 0; + int iolock; + struct iov_iter data; + + /* "unaligned" here means not aligned to a filesystem block */ + if ((iocb->ki_pos & mp->m_blockmask) || + ((iocb->ki_pos + iov_iter_count(from)) & mp->m_blockmask)) { + unaligned_io = 1; + iolock = XFS_IOLOCK_EXCL; + } else if (mapping->nrpages) { + iolock = XFS_IOLOCK_EXCL; + } else { + iolock = XFS_IOLOCK_SHARED; + } + xfs_rw_ilock(ip, iolock); + + ret = xfs_file_aio_write_checks(iocb, from, &iolock); + if (ret) + goto out; + + /* + * Yes, even DAX files can have page cache attached to them: A zeroed + * page is inserted into the pagecache when we have to serve a write + * fault on a hole. It should never be dirtied and can simply be + * dropped from the pagecache once we get real data for the page. + */ + if (mapping->nrpages) { + ret = invalidate_inode_pages2(mapping); + WARN_ON_ONCE(ret); + } + + if (iolock == XFS_IOLOCK_EXCL && !unaligned_io) { + xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); + iolock = XFS_IOLOCK_SHARED; + } + + trace_xfs_file_dax_write(ip, iov_iter_count(from), iocb->ki_pos); + + data = *from; + ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct, + xfs_end_io_direct_write, 0); + if (ret > 0) { + iocb->ki_pos += ret; + iov_iter_advance(from, ret); + } +out: + xfs_rw_iunlock(ip, iolock); return ret; } @@ -841,9 +789,8 @@ xfs_file_buffered_aio_write( current->backing_dev_info = inode_to_bdi(inode); write_retry: - trace_xfs_file_buffered_write(ip, iov_iter_count(from), - iocb->ki_pos, 0); - ret = generic_perform_write(file, from, iocb->ki_pos); + trace_xfs_file_buffered_write(ip, iov_iter_count(from), iocb->ki_pos); + ret = iomap_file_buffered_write(iocb, from, &xfs_iomap_ops); if (likely(ret >= 0)) iocb->ki_pos += ret; @@ -897,7 +844,9 @@ xfs_file_write_iter( if (XFS_FORCED_SHUTDOWN(ip->i_mount)) return -EIO; - if ((iocb->ki_flags & IOCB_DIRECT) || IS_DAX(inode)) + if (IS_DAX(inode)) + ret = xfs_file_dax_write(iocb, from); + else if (iocb->ki_flags & IOCB_DIRECT) ret = xfs_file_dio_aio_write(iocb, from); else ret = xfs_file_buffered_aio_write(iocb, from); @@ -1553,9 +1502,9 @@ xfs_filemap_page_mkwrite( xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); if (IS_DAX(inode)) { - ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault, NULL); + ret = dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault); } else { - ret = block_page_mkwrite(vma, vmf, xfs_get_blocks); + ret = iomap_page_mkwrite(vma, vmf, &xfs_iomap_ops); ret = block_page_mkwrite_return(ret); } @@ -1587,7 +1536,7 @@ xfs_filemap_fault( * changes to xfs_get_blocks_direct() to map unwritten extent * ioend for conversion on read-only mappings. 
*/ - ret = __dax_fault(vma, vmf, xfs_get_blocks_dax_fault, NULL); + ret = dax_fault(vma, vmf, xfs_get_blocks_dax_fault); } else ret = filemap_fault(vma, vmf); xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); @@ -1624,8 +1573,7 @@ xfs_filemap_pmd_fault( } xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - ret = __dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_dax_fault, - NULL); + ret = dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_dax_fault); xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); if (flags & FAULT_FLAG_WRITE) diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index ee3aaa0a5317..7191c3878b4a 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -198,14 +198,10 @@ xfs_growfs_data_private( return error; } - tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFS); - tp->t_flags |= XFS_TRANS_RESERVE; - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growdata, - XFS_GROWFS_SPACE_RES(mp), 0); - if (error) { - xfs_trans_cancel(tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growdata, + XFS_GROWFS_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp); + if (error) return error; - } /* * Write new AG headers to disk. Non-transactional, but written @@ -243,8 +239,8 @@ xfs_growfs_data_private( agf->agf_roots[XFS_BTNUM_CNTi] = cpu_to_be32(XFS_CNT_BLOCK(mp)); agf->agf_levels[XFS_BTNUM_BNOi] = cpu_to_be32(1); agf->agf_levels[XFS_BTNUM_CNTi] = cpu_to_be32(1); - agf->agf_flfirst = 0; - agf->agf_fllast = cpu_to_be32(XFS_AGFL_SIZE(mp) - 1); + agf->agf_flfirst = cpu_to_be32(1); + agf->agf_fllast = 0; agf->agf_flcount = 0; tmpsize = agsize - XFS_PREALLOC_BLOCKS(mp); agf->agf_freeblks = cpu_to_be32(tmpsize); @@ -671,8 +667,11 @@ xfs_reserve_blocks( __uint64_t *inval, xfs_fsop_resblks_t *outval) { - __int64_t lcounter, delta, fdblks_delta; + __int64_t lcounter, delta; + __int64_t fdblks_delta = 0; __uint64_t request; + __int64_t free; + int error = 0; /* If inval is null, report current values and return */ if (inval == (__uint64_t *)NULL) { @@ -686,24 +685,23 @@ xfs_reserve_blocks( request = *inval; /* - * With per-cpu counters, this becomes an interesting - * problem. we needto work out if we are freeing or allocation - * blocks first, then we can do the modification as necessary. + * With per-cpu counters, this becomes an interesting problem. We need + * to work out if we are freeing or allocating blocks first, then we can + * do the modification as necessary. * - * We do this under the m_sb_lock so that if we are near - * ENOSPC, we will hold out any changes while we work out - * what to do. This means that the amount of free space can - * change while we do this, so we need to retry if we end up - * trying to reserve more space than is available. + * We do this under the m_sb_lock so that if we are near ENOSPC, we will + * hold out any changes while we work out what to do. This means that + * the amount of free space can change while we do this, so we need to + * retry if we end up trying to reserve more space than is available. */ -retry: spin_lock(&mp->m_sb_lock); /* * If our previous reservation was larger than the current value, - * then move any unused blocks back to the free pool. + * then move any unused blocks back to the free pool. Modify the resblks + * counters directly since we shouldn't have any problems unreserving + * space.
*/ - fdblks_delta = 0; if (mp->m_resblks > request) { lcounter = mp->m_resblks_avail - request; if (lcounter > 0) { /* release unused blocks */ @@ -711,54 +709,67 @@ retry: mp->m_resblks_avail -= lcounter; } mp->m_resblks = request; - } else { - __int64_t free; + if (fdblks_delta) { + spin_unlock(&mp->m_sb_lock); + error = xfs_mod_fdblocks(mp, fdblks_delta, 0); + spin_lock(&mp->m_sb_lock); + } + + goto out; + } + /* + * If the request is larger than the current reservation, reserve the + * blocks before we update the reserve counters. Sample m_fdblocks and + * perform a partial reservation if the request exceeds free space. + */ + error = -ENOSPC; + do { free = percpu_counter_sum(&mp->m_fdblocks) - XFS_ALLOC_SET_ASIDE(mp); if (!free) - goto out; /* ENOSPC and fdblks_delta = 0 */ + break; delta = request - mp->m_resblks; lcounter = free - delta; - if (lcounter < 0) { + if (lcounter < 0) /* We can't satisfy the request, just get what we can */ - mp->m_resblks += free; - mp->m_resblks_avail += free; - fdblks_delta = -free; - } else { - fdblks_delta = -delta; - mp->m_resblks = request; - mp->m_resblks_avail += delta; - } - } -out: - if (outval) { - outval->resblks = mp->m_resblks; - outval->resblks_avail = mp->m_resblks_avail; - } - spin_unlock(&mp->m_sb_lock); + fdblks_delta = free; + else + fdblks_delta = delta; - if (fdblks_delta) { /* - * If we are putting blocks back here, m_resblks_avail is - * already at its max so this will put it in the free pool. - * - * If we need space, we'll either succeed in getting it - * from the free block count or we'll get an enospc. If - * we get a ENOSPC, it means things changed while we were - * calculating fdblks_delta and so we should try again to - * see if there is anything left to reserve. + * We'll either succeed in getting space from the free block + * count or we'll get an ENOSPC. If we get an ENOSPC, it means + * things changed while we were calculating fdblks_delta and so + * we should try again to see if there is anything left to + * reserve. * * Don't set the reserved flag here - we don't want to reserve * the extra reserve blocks from the reserve..... */ - int error; - error = xfs_mod_fdblocks(mp, fdblks_delta, 0); - if (error == -ENOSPC) - goto retry; + spin_unlock(&mp->m_sb_lock); + error = xfs_mod_fdblocks(mp, -fdblks_delta, 0); + spin_lock(&mp->m_sb_lock); + } while (error == -ENOSPC); + + /* + * Update the reserve counters if blocks have been successfully + * allocated. + */ + if (!error && fdblks_delta) { + mp->m_resblks += fdblks_delta; + mp->m_resblks_avail += fdblks_delta; } - return 0; + +out: + if (outval) { + outval->resblks = mp->m_resblks; + outval->resblks_avail = mp->m_resblks_avail; + } + + spin_unlock(&mp->m_sb_lock); + return error; } int diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index bf2d60749278..fb39a66914dd 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -37,9 +37,6 @@ #include <linux/kthread.h> #include <linux/freezer.h> -STATIC void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, - struct xfs_perag *pag, struct xfs_inode *ip); - /* * Allocate and initialise an xfs_inode.
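The rewritten xfs_reserve_blocks() above replaces the backwards goto retry with a forward do/while: sample the free-block count, try to take the computed delta, and loop if the counter moved underneath us. A simplified single-threaded model of that sample-and-retry shape (illustrative names only):

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

static int64_t fdblocks = 1000;         /* stands in for the per-cpu counter */

static int mod_fdblocks(int64_t delta)
{
        if (fdblocks + delta < 0)
                return -ENOSPC;
        fdblocks += delta;
        return 0;
}

int main(void)
{
        int64_t resblks = 0, request = 800, delta = 0;
        int error = -ENOSPC;

        do {
                int64_t avail = fdblocks;       /* racy sample, like percpu_counter_sum() */

                if (!avail)
                        break;
                delta = request - resblks;
                if (delta > avail)
                        delta = avail;          /* partial reservation */
                error = mod_fdblocks(-delta);   /* fails if the sample went stale */
        } while (error == -ENOSPC);

        if (!error)
                resblks += delta;
        printf("reserved %lld of %lld\n", (long long)resblks, (long long)request);
        return 0;
}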
*/ @@ -94,13 +91,6 @@ xfs_inode_free_callback( struct inode *inode = container_of(head, struct inode, i_rcu); struct xfs_inode *ip = XFS_I(inode); - kmem_zone_free(xfs_inode_zone, ip); -} - -void -xfs_inode_free( - struct xfs_inode *ip) -{ switch (VFS_I(ip)->i_mode & S_IFMT) { case S_IFREG: case S_IFDIR: @@ -118,6 +108,25 @@ xfs_inode_free( ip->i_itemp = NULL; } + kmem_zone_free(xfs_inode_zone, ip); +} + +static void +__xfs_inode_free( + struct xfs_inode *ip) +{ + /* asserts to verify all state is correct here */ + ASSERT(atomic_read(&ip->i_pincount) == 0); + ASSERT(!xfs_isiflocked(ip)); + XFS_STATS_DEC(ip->i_mount, vn_active); + + call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback); +} + +void +xfs_inode_free( + struct xfs_inode *ip) +{ /* * Because we use RCU freeing we need to ensure the inode always * appears to be reclaimed with an invalid inode number when in the @@ -129,12 +138,123 @@ xfs_inode_free( ip->i_ino = 0; spin_unlock(&ip->i_flags_lock); - /* asserts to verify all state is correct here */ - ASSERT(atomic_read(&ip->i_pincount) == 0); - ASSERT(!xfs_isiflocked(ip)); - XFS_STATS_DEC(ip->i_mount, vn_active); + __xfs_inode_free(ip); +} - call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback); +/* + * Queue a new inode reclaim pass if there are reclaimable inodes and there + * isn't a reclaim pass already in progress. By default it runs every 5s based + * on the xfs periodic sync default of 30s. Perhaps this should have its own + * tunable, but that can be done if this method proves to be ineffective or too + * aggressive. + */ +static void +xfs_reclaim_work_queue( + struct xfs_mount *mp) +{ + + rcu_read_lock(); + if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) { + queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work, + msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10)); + } + rcu_read_unlock(); +} + +/* + * This is a fast pass over the inode cache to try to get reclaim moving on as + * many inodes as possible in a short period of time. It kicks itself every few + * seconds, as well as being kicked by the inode cache shrinker when memory + * goes low. It scans as quickly as possible avoiding locked inodes or those + * already being flushed, and once done schedules a future pass.
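The hunk above moves fork teardown into xfs_inode_free_callback() so the whole teardown runs after the RCU grace period, while __xfs_inode_free() merely schedules it via call_rcu(). A userspace model of the container_of() walk that callback performs, with a direct call standing in for call_rcu() (toy structures, not the kernel's):

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct rcu_head { void (*func)(struct rcu_head *); };
struct inode { struct rcu_head i_rcu; };
struct xfs_inode { int i_ino; struct inode i_vnode; };

/* The rcu_head lives inside the VFS inode, which lives inside the XFS
 * inode; the callback walks back out to the containing structure and
 * frees everything once no RCU reader can still see it. */
static void inode_free_callback(struct rcu_head *head)
{
        struct inode *inode = container_of(head, struct inode, i_rcu);
        struct xfs_inode *ip = container_of(inode, struct xfs_inode, i_vnode);

        printf("freeing inode %d after grace period\n", ip->i_ino);
        free(ip);
}

int main(void)
{
        struct xfs_inode *ip = malloc(sizeof(*ip));

        if (!ip)
                return 1;
        ip->i_ino = 42;
        /* the kernel would do: call_rcu(&VFS_I(ip)->i_rcu, callback) */
        inode_free_callback(&ip->i_vnode.i_rcu);
        return 0;
}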
+ */ +void +xfs_reclaim_worker( + struct work_struct *work) +{ + struct xfs_mount *mp = container_of(to_delayed_work(work), + struct xfs_mount, m_reclaim_work); + + xfs_reclaim_inodes(mp, SYNC_TRYLOCK); + xfs_reclaim_work_queue(mp); +} + +static void +xfs_perag_set_reclaim_tag( + struct xfs_perag *pag) +{ + struct xfs_mount *mp = pag->pag_mount; + + ASSERT(spin_is_locked(&pag->pag_ici_lock)); + if (pag->pag_ici_reclaimable++) + return; + + /* propagate the reclaim tag up into the perag radix tree */ + spin_lock(&mp->m_perag_lock); + radix_tree_tag_set(&mp->m_perag_tree, pag->pag_agno, + XFS_ICI_RECLAIM_TAG); + spin_unlock(&mp->m_perag_lock); + + /* schedule periodic background inode reclaim */ + xfs_reclaim_work_queue(mp); + + trace_xfs_perag_set_reclaim(mp, pag->pag_agno, -1, _RET_IP_); +} + +static void +xfs_perag_clear_reclaim_tag( + struct xfs_perag *pag) +{ + struct xfs_mount *mp = pag->pag_mount; + + ASSERT(spin_is_locked(&pag->pag_ici_lock)); + if (--pag->pag_ici_reclaimable) + return; + + /* clear the reclaim tag from the perag radix tree */ + spin_lock(&mp->m_perag_lock); + radix_tree_tag_clear(&mp->m_perag_tree, pag->pag_agno, + XFS_ICI_RECLAIM_TAG); + spin_unlock(&mp->m_perag_lock); + trace_xfs_perag_clear_reclaim(mp, pag->pag_agno, -1, _RET_IP_); +} + + +/* + * We set the inode flag atomically with the radix tree tag. + * Once we get tag lookups on the radix tree, this inode flag + * can go away. + */ +void +xfs_inode_set_reclaim_tag( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_perag *pag; + + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); + spin_lock(&pag->pag_ici_lock); + spin_lock(&ip->i_flags_lock); + + radix_tree_tag_set(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ip->i_ino), + XFS_ICI_RECLAIM_TAG); + xfs_perag_set_reclaim_tag(pag); + __xfs_iflags_set(ip, XFS_IRECLAIMABLE); + + spin_unlock(&ip->i_flags_lock); + spin_unlock(&pag->pag_ici_lock); + xfs_perag_put(pag); +} + +STATIC void +xfs_inode_clear_reclaim_tag( + struct xfs_perag *pag, + xfs_ino_t ino) +{ + radix_tree_tag_clear(&pag->pag_ici_root, + XFS_INO_TO_AGINO(pag->pag_mount, ino), + XFS_ICI_RECLAIM_TAG); + xfs_perag_clear_reclaim_tag(pag); } /* @@ -264,7 +384,7 @@ xfs_iget_cache_hit( */ ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS; ip->i_flags |= XFS_INEW; - __xfs_inode_clear_reclaim_tag(mp, pag, ip); + xfs_inode_clear_reclaim_tag(pag, ip->i_ino); inode->i_state = I_NEW; ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock)); @@ -645,7 +765,7 @@ restart: * Background scanning to trim post-EOF preallocated space. This is queued * based on the 'speculative_prealloc_lifetime' tunable (5m by default). */ -STATIC void +void xfs_queue_eofblocks( struct xfs_mount *mp) { @@ -723,121 +843,6 @@ xfs_inode_ag_iterator_tag( } /* - * Queue a new inode reclaim pass if there are reclaimable inodes and there - * isn't a reclaim pass already in progress. By default it runs every 5s based - * on the xfs periodic sync default of 30s. Perhaps this should have it's own - * tunable, but that can be done if this method proves to be ineffective or too - * aggressive. - */ -static void -xfs_reclaim_work_queue( - struct xfs_mount *mp) -{ - - rcu_read_lock(); - if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) { - queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work, - msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10)); - } - rcu_read_unlock(); -} - -/* - * This is a fast pass over the inode cache to try to get reclaim moving on as - * many inodes as possible in a short period of time. 
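xfs_reclaim_worker() above requeues itself through xfs_reclaim_work_queue(), whose delay is derived from the periodic sync tunable: with the default xfs_syncd_centisecs of 3000 (30 s), 3000 / 6 * 10 = 5000 ms, which is the 5 s cadence the comment mentions. Worked out below (the default value is an assumption of this sketch):

#include <stdio.h>

int main(void)
{
        unsigned int xfs_syncd_centisecs = 3000;        /* assumed default: 30 s */
        unsigned int msecs = xfs_syncd_centisecs / 6 * 10;

        /* one sixth of the sync period, converted centiseconds to
         * milliseconds: 3000 / 6 = 500 cs, 500 * 10 = 5000 ms */
        printf("reclaim pass every %u ms\n", msecs);
        return 0;
}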
It kicks itself every few - * seconds, as well as being kicked by the inode cache shrinker when memory - * goes low. It scans as quickly as possible avoiding locked inodes or those - * already being flushed, and once done schedules a future pass. - */ -void -xfs_reclaim_worker( - struct work_struct *work) -{ - struct xfs_mount *mp = container_of(to_delayed_work(work), - struct xfs_mount, m_reclaim_work); - - xfs_reclaim_inodes(mp, SYNC_TRYLOCK); - xfs_reclaim_work_queue(mp); -} - -static void -__xfs_inode_set_reclaim_tag( - struct xfs_perag *pag, - struct xfs_inode *ip) -{ - radix_tree_tag_set(&pag->pag_ici_root, - XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), - XFS_ICI_RECLAIM_TAG); - - if (!pag->pag_ici_reclaimable) { - /* propagate the reclaim tag up into the perag radix tree */ - spin_lock(&ip->i_mount->m_perag_lock); - radix_tree_tag_set(&ip->i_mount->m_perag_tree, - XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), - XFS_ICI_RECLAIM_TAG); - spin_unlock(&ip->i_mount->m_perag_lock); - - /* schedule periodic background inode reclaim */ - xfs_reclaim_work_queue(ip->i_mount); - - trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno, - -1, _RET_IP_); - } - pag->pag_ici_reclaimable++; -} - -/* - * We set the inode flag atomically with the radix tree tag. - * Once we get tag lookups on the radix tree, this inode flag - * can go away. - */ -void -xfs_inode_set_reclaim_tag( - xfs_inode_t *ip) -{ - struct xfs_mount *mp = ip->i_mount; - struct xfs_perag *pag; - - pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); - spin_lock(&pag->pag_ici_lock); - spin_lock(&ip->i_flags_lock); - __xfs_inode_set_reclaim_tag(pag, ip); - __xfs_iflags_set(ip, XFS_IRECLAIMABLE); - spin_unlock(&ip->i_flags_lock); - spin_unlock(&pag->pag_ici_lock); - xfs_perag_put(pag); -} - -STATIC void -__xfs_inode_clear_reclaim( - xfs_perag_t *pag, - xfs_inode_t *ip) -{ - pag->pag_ici_reclaimable--; - if (!pag->pag_ici_reclaimable) { - /* clear the reclaim tag from the perag radix tree */ - spin_lock(&ip->i_mount->m_perag_lock); - radix_tree_tag_clear(&ip->i_mount->m_perag_tree, - XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), - XFS_ICI_RECLAIM_TAG); - spin_unlock(&ip->i_mount->m_perag_lock); - trace_xfs_perag_clear_reclaim(ip->i_mount, pag->pag_agno, - -1, _RET_IP_); - } -} - -STATIC void -__xfs_inode_clear_reclaim_tag( - xfs_mount_t *mp, - xfs_perag_t *pag, - xfs_inode_t *ip) -{ - radix_tree_tag_clear(&pag->pag_ici_root, - XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG); - __xfs_inode_clear_reclaim(pag, ip); -} - -/* * Grab the inode for reclaim exclusively. * Return 0 if we grabbed it, non-zero otherwise. */ @@ -929,6 +934,7 @@ xfs_reclaim_inode( int sync_mode) { struct xfs_buf *bp = NULL; + xfs_ino_t ino = ip->i_ino; /* for radix_tree_delete */ int error; restart: @@ -993,6 +999,22 @@ restart: xfs_iflock(ip); reclaim: + /* + * Because we use RCU freeing we need to ensure the inode always appears + * to be reclaimed with an invalid inode number when in the free state. + * We do this as early as possible under the ILOCK and flush lock so + * that xfs_iflush_cluster() can be guaranteed to detect races with us + * here. By doing this, we guarantee that once xfs_iflush_cluster has + * locked both the XFS_ILOCK and the flush lock that it will see either + * a valid, flushable inode that will serialise correctly against the + * locks below, or it will see a clean (and invalid) inode that it can + * skip. 
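The removed __xfs_inode_set_reclaim_tag() open-coded what the new xfs_perag_set_reclaim_tag() and xfs_perag_clear_reclaim_tag() helpers express with the pre/post increment tests if (pag->pag_ici_reclaimable++) and if (--pag->pag_ici_reclaimable): only the 0-to-1 and 1-to-0 transitions touch the perag radix tree tag. A stand-alone model of that gating idiom (invented names):

#include <stdio.h>

static int reclaimable;         /* models pag_ici_reclaimable */
static int tag_set;             /* models the perag radix tree tag */

static void set_reclaim(void)
{
        if (reclaimable++)      /* already tagged: just count */
                return;
        tag_set = 1;            /* first reclaimable inode: propagate tag */
}

static void clear_reclaim(void)
{
        if (--reclaimable)      /* others remain: just count */
                return;
        tag_set = 0;            /* last one gone: drop the tag */
}

int main(void)
{
        set_reclaim();
        set_reclaim();
        clear_reclaim();
        printf("reclaimable=%d tag=%d\n", reclaimable, tag_set);  /* 1 1 */
        clear_reclaim();
        printf("reclaimable=%d tag=%d\n", reclaimable, tag_set);  /* 0 0 */
        return 0;
}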
+ */ + spin_lock(&ip->i_flags_lock); + ip->i_flags = XFS_IRECLAIM; + ip->i_ino = 0; + spin_unlock(&ip->i_flags_lock); + xfs_ifunlock(ip); xfs_iunlock(ip, XFS_ILOCK_EXCL); @@ -1006,9 +1028,9 @@ reclaim: */ spin_lock(&pag->pag_ici_lock); if (!radix_tree_delete(&pag->pag_ici_root, - XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino))) + XFS_INO_TO_AGINO(ip->i_mount, ino))) ASSERT(0); - __xfs_inode_clear_reclaim(pag, ip); + xfs_perag_clear_reclaim_tag(pag); spin_unlock(&pag->pag_ici_lock); /* @@ -1023,7 +1045,7 @@ reclaim: xfs_qm_dqdetach(ip); xfs_iunlock(ip, XFS_ILOCK_EXCL); - xfs_inode_free(ip); + __xfs_inode_free(ip); return error; out_ifunlock: diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index 62f1f91c32cb..05bac99bef75 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -68,6 +68,7 @@ void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip); int xfs_icache_free_eofblocks(struct xfs_mount *, struct xfs_eofblocks *); int xfs_inode_free_quota_eofblocks(struct xfs_inode *ip); void xfs_eofblocks_worker(struct work_struct *); +void xfs_queue_eofblocks(struct xfs_mount *); int xfs_inode_ag_iterator(struct xfs_mount *mp, int (*execute)(struct xfs_inode *ip, int flags, void *args), diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 96f606deee31..8825bcfd314c 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -431,7 +431,7 @@ xfs_lock_inumorder(int lock_mode, int subclass) * lock more than one at a time, lockdep will report false positives saying we * have violated locking orders. */ -void +static void xfs_lock_inodes( xfs_inode_t **ips, int inodes, @@ -667,14 +667,6 @@ xfs_ip2xflags( return _xfs_dic2xflags(dic->di_flags, dic->di_flags2, XFS_IFORK_Q(ip)); } -uint -xfs_dic2xflags( - struct xfs_dinode *dip) -{ - return _xfs_dic2xflags(be16_to_cpu(dip->di_flags), - be64_to_cpu(dip->di_flags2), XFS_DFORK_Q(dip)); -} - /* * Lookups up an inode from "name". If ci_name is not NULL, then a CI match * is allowed, otherwise it has to be an exact match. If a CI match is found, @@ -748,7 +740,7 @@ out_unlock: * are not linked into the directory structure - they are attached * directly to the superblock - and so have no parent. */ -int +static int xfs_ialloc( xfs_trans_t *tp, xfs_inode_t *pip, @@ -1030,7 +1022,7 @@ xfs_dir_ialloc( tp->t_flags &= ~(XFS_TRANS_DQ_DIRTY); } - code = xfs_trans_roll(&tp, 0); + code = xfs_trans_roll(&tp, NULL); if (committed != NULL) *committed = 1; @@ -1085,7 +1077,7 @@ xfs_dir_ialloc( * link count to go to zero, move the inode to AGI unlinked list so that it can * be freed when the last active reference goes away via xfs_inactive(). */ -int /* error */ +static int /* error */ xfs_droplink( xfs_trans_t *tp, xfs_inode_t *ip) @@ -1104,7 +1096,7 @@ xfs_droplink( /* * Increment the link count on an inode & log the change. */ -int +static int xfs_bumplink( xfs_trans_t *tp, xfs_inode_t *ip) @@ -1161,11 +1153,9 @@ xfs_create( rdev = 0; resblks = XFS_MKDIR_SPACE_RES(mp, name->len); tres = &M_RES(mp)->tr_mkdir; - tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR); } else { resblks = XFS_CREATE_SPACE_RES(mp, name->len); tres = &M_RES(mp)->tr_create; - tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE); } /* @@ -1174,20 +1164,19 @@ xfs_create( * the case we'll drop the one we have and get a more * appropriate transaction later. 
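Reclaim above zeroes ip->i_ino under i_flags_lock before dropping the locks, so any racing walker must re-check validity after taking its own locks. A simplified single-threaded stand-in for that double check (in the kernel the test runs under i_flags_lock against a concurrent reclaim):

#include <stdio.h>

struct toy_inode {
        unsigned long i_ino;    /* 0 means "reclaimed, skip me" */
};

static int still_valid(const struct toy_inode *ip, unsigned long ino)
{
        return ip->i_ino != 0 && ip->i_ino == ino;
}

int main(void)
{
        struct toy_inode ip = { .i_ino = 129 };

        printf("valid: %d\n", still_valid(&ip, 129));   /* 1: safe to flush */
        ip.i_ino = 0;                                   /* reclaim ran meanwhile */
        printf("valid: %d\n", still_valid(&ip, 129));   /* 0: clean, skip it */
        return 0;
}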
*/ - error = xfs_trans_reserve(tp, tres, resblks, 0); + error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp); if (error == -ENOSPC) { /* flush outstanding delalloc blocks and retry */ xfs_flush_inodes(mp); - error = xfs_trans_reserve(tp, tres, resblks, 0); + error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp); } if (error == -ENOSPC) { /* No space at all so try a "no-allocation" reservation */ resblks = 0; - error = xfs_trans_reserve(tp, tres, 0, 0); + error = xfs_trans_alloc(mp, tres, 0, 0, 0, &tp); } if (error) - goto out_trans_cancel; - + goto out_release_inode; xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL | XFS_IOLOCK_PARENT | XFS_ILOCK_PARENT); @@ -1337,17 +1326,16 @@ xfs_create_tmpfile( return error; resblks = XFS_IALLOC_SPACE_RES(mp); - tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE_TMPFILE); - tres = &M_RES(mp)->tr_create_tmpfile; - error = xfs_trans_reserve(tp, tres, resblks, 0); + + error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp); if (error == -ENOSPC) { /* No space at all so try a "no-allocation" reservation */ resblks = 0; - error = xfs_trans_reserve(tp, tres, 0, 0); + error = xfs_trans_alloc(mp, tres, 0, 0, 0, &tp); } if (error) - goto out_trans_cancel; + goto out_release_inode; error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, pdqp, resblks, 1, 0); @@ -1432,15 +1420,14 @@ xfs_link( if (error) goto std_return; - tp = xfs_trans_alloc(mp, XFS_TRANS_LINK); resblks = XFS_LINK_SPACE_RES(mp, target_name->len); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, resblks, 0); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, resblks, 0, 0, &tp); if (error == -ENOSPC) { resblks = 0; - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, 0, 0); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, 0, 0, 0, &tp); } if (error) - goto error_return; + goto std_return; xfs_ilock(tdp, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT); xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL); @@ -1710,11 +1697,9 @@ xfs_inactive_truncate( struct xfs_trans *tp; int error; - tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); if (error) { ASSERT(XFS_FORCED_SHUTDOWN(mp)); - xfs_trans_cancel(tp); return error; } @@ -1764,8 +1749,6 @@ xfs_inactive_ifree( struct xfs_trans *tp; int error; - tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE); - /* * The ifree transaction might need to allocate blocks for record * insertion to the finobt. We don't want to fail here at ENOSPC, so @@ -1781,9 +1764,8 @@ xfs_inactive_ifree( * now remains allocated and sits on the unlinked list until the fs is * repaired. */ - tp->t_flags |= XFS_TRANS_RESERVE; - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ifree, - XFS_IFREE_SPACE_RES(mp), 0); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree, + XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp); if (error) { if (error == -ENOSPC) { xfs_warn_ratelimited(mp, @@ -1792,7 +1774,6 @@ xfs_inactive_ifree( } else { ASSERT(XFS_FORCED_SHUTDOWN(mp)); } - xfs_trans_cancel(tp); return error; } @@ -2525,11 +2506,6 @@ xfs_remove( if (error) goto std_return; - if (is_dir) - tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR); - else - tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE); - /* * We try to get the real space reservation first, * allowing for directory btree deletion(s) implying @@ -2540,14 +2516,15 @@ xfs_remove( * block from the directory. 
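xfs_create(), xfs_create_tmpfile() and xfs_link() above all take the same shape after the conversion: try the full reservation, and on ENOSPC retry with a zero-block "no-allocation" reservation before giving up. A sketch of that fallback ladder (hypothetical helper, not the kernel call):

#include <errno.h>
#include <stdio.h>

static long avail = 10;         /* pretend the filesystem is nearly full */

static int reserve(long resblks)
{
        if (resblks > avail)
                return -ENOSPC;
        avail -= resblks;
        return 0;
}

int main(void)
{
        long resblks = 50;
        int error = reserve(resblks);

        if (error == -ENOSPC) {
                /* "no-allocation" reservation: can still succeed when the
                 * operation turns out not to need any new blocks */
                resblks = 0;
                error = reserve(resblks);
        }
        printf("error=%d resblks=%ld\n", error, resblks);  /* error=0 resblks=0 */
        return 0;
}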
*/ resblks = XFS_REMOVE_SPACE_RES(mp); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_remove, resblks, 0); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_remove, resblks, 0, 0, &tp); if (error == -ENOSPC) { resblks = 0; - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_remove, 0, 0); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_remove, 0, 0, 0, + &tp); } if (error) { ASSERT(error != -ENOSPC); - goto out_trans_cancel; + goto std_return; } xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT); @@ -2855,6 +2832,7 @@ xfs_rename_alloc_whiteout( * and flag it as linkable. */ drop_nlink(VFS_I(tmpfile)); + xfs_setup_iops(tmpfile); xfs_finish_inode_setup(tmpfile); VFS_I(tmpfile)->i_state |= I_LINKABLE; @@ -2910,15 +2888,15 @@ xfs_rename( xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, wip, inodes, &num_inodes); - tp = xfs_trans_alloc(mp, XFS_TRANS_RENAME); spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, spaceres, 0); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_rename, spaceres, 0, 0, &tp); if (error == -ENOSPC) { spaceres = 0; - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, 0, 0); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_rename, 0, 0, 0, + &tp); } if (error) - goto out_trans_cancel; + goto out_release_wip; /* * Attach the dquots to the inodes @@ -3155,6 +3133,7 @@ out_bmap_cancel: xfs_bmap_cancel(&free_list); out_trans_cancel: xfs_trans_cancel(tp); +out_release_wip: if (wip) IRELE(wip); return error; @@ -3162,16 +3141,16 @@ out_trans_cancel: STATIC int xfs_iflush_cluster( - xfs_inode_t *ip, - xfs_buf_t *bp) + struct xfs_inode *ip, + struct xfs_buf *bp) { - xfs_mount_t *mp = ip->i_mount; + struct xfs_mount *mp = ip->i_mount; struct xfs_perag *pag; unsigned long first_index, mask; unsigned long inodes_per_cluster; - int ilist_size; - xfs_inode_t **ilist; - xfs_inode_t *iq; + int cilist_size; + struct xfs_inode **cilist; + struct xfs_inode *cip; int nr_found; int clcount = 0; int bufwasdelwri; @@ -3180,23 +3159,23 @@ xfs_iflush_cluster( pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); inodes_per_cluster = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog; - ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *); - ilist = kmem_alloc(ilist_size, KM_MAYFAIL|KM_NOFS); - if (!ilist) + cilist_size = inodes_per_cluster * sizeof(xfs_inode_t *); + cilist = kmem_alloc(cilist_size, KM_MAYFAIL|KM_NOFS); + if (!cilist) goto out_put; mask = ~(((mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog)) - 1); first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask; rcu_read_lock(); /* really need a gang lookup range call here */ - nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist, + nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)cilist, first_index, inodes_per_cluster); if (nr_found == 0) goto out_free; for (i = 0; i < nr_found; i++) { - iq = ilist[i]; - if (iq == ip) + cip = cilist[i]; + if (cip == ip) continue; /* @@ -3205,20 +3184,30 @@ xfs_iflush_cluster( * We need to check under the i_flags_lock for a valid inode * here. Skip it if it is not valid or the wrong inode. 
*/ - spin_lock(&ip->i_flags_lock); - if (!ip->i_ino || - (XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) { - spin_unlock(&ip->i_flags_lock); + spin_lock(&cip->i_flags_lock); + if (!cip->i_ino || + __xfs_iflags_test(cip, XFS_ISTALE)) { + spin_unlock(&cip->i_flags_lock); continue; } - spin_unlock(&ip->i_flags_lock); + + /* + * Once we fall off the end of the cluster, no point checking + * any more inodes in the list because they will also all be + * outside the cluster. + */ + if ((XFS_INO_TO_AGINO(mp, cip->i_ino) & mask) != first_index) { + spin_unlock(&cip->i_flags_lock); + break; + } + spin_unlock(&cip->i_flags_lock); /* * Do an un-protected check to see if the inode is dirty and * is a candidate for flushing. These checks will be repeated * later after the appropriate locks are acquired. */ - if (xfs_inode_clean(iq) && xfs_ipincount(iq) == 0) + if (xfs_inode_clean(cip) && xfs_ipincount(cip) == 0) continue; /* @@ -3226,15 +3215,28 @@ xfs_iflush_cluster( * then this inode cannot be flushed and is skipped. */ - if (!xfs_ilock_nowait(iq, XFS_ILOCK_SHARED)) + if (!xfs_ilock_nowait(cip, XFS_ILOCK_SHARED)) continue; - if (!xfs_iflock_nowait(iq)) { - xfs_iunlock(iq, XFS_ILOCK_SHARED); + if (!xfs_iflock_nowait(cip)) { + xfs_iunlock(cip, XFS_ILOCK_SHARED); continue; } - if (xfs_ipincount(iq)) { - xfs_ifunlock(iq); - xfs_iunlock(iq, XFS_ILOCK_SHARED); + if (xfs_ipincount(cip)) { + xfs_ifunlock(cip); + xfs_iunlock(cip, XFS_ILOCK_SHARED); + continue; + } + + + /* + * Check the inode number again, just to be certain we are not + * racing with freeing in xfs_reclaim_inode(). See the comments + * in that function for more information as to why the initial + * check is not sufficient. + */ + if (!cip->i_ino) { + xfs_ifunlock(cip); + xfs_iunlock(cip, XFS_ILOCK_SHARED); continue; } @@ -3242,18 +3244,18 @@ xfs_iflush_cluster( * arriving here means that this inode can be flushed. First * re-check that it's dirty before flushing. */ - if (!xfs_inode_clean(iq)) { + if (!xfs_inode_clean(cip)) { int error; - error = xfs_iflush_int(iq, bp); + error = xfs_iflush_int(cip, bp); if (error) { - xfs_iunlock(iq, XFS_ILOCK_SHARED); + xfs_iunlock(cip, XFS_ILOCK_SHARED); goto cluster_corrupt_out; } clcount++; } else { - xfs_ifunlock(iq); + xfs_ifunlock(cip); } - xfs_iunlock(iq, XFS_ILOCK_SHARED); + xfs_iunlock(cip, XFS_ILOCK_SHARED); } if (clcount) { @@ -3263,7 +3265,7 @@ xfs_iflush_cluster( out_free: rcu_read_unlock(); - kmem_free(ilist); + kmem_free(cilist); out_put: xfs_perag_put(pag); return 0; @@ -3306,8 +3308,8 @@ cluster_corrupt_out: /* * Unlocks the flush lock */ - xfs_iflush_abort(iq, false); - kmem_free(ilist); + xfs_iflush_abort(cip, false); + kmem_free(cilist); xfs_perag_put(pag); return -EFSCORRUPTED; } @@ -3327,7 +3329,7 @@ xfs_iflush( struct xfs_buf **bpp) { struct xfs_mount *mp = ip->i_mount; - struct xfs_buf *bp; + struct xfs_buf *bp = NULL; struct xfs_dinode *dip; int error; @@ -3369,14 +3371,22 @@ xfs_iflush( } /* - * Get the buffer containing the on-disk inode. + * Get the buffer containing the on-disk inode. We are doing a try-lock + * operation here, so we may get an EAGAIN error. In that case, we + * simply want to return with the inode still dirty. + * + * If we get any other error, we effectively have a corruption situation + * and we cannot flush the inode, so we treat it the same as failing + * xfs_iflush_int(). 
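The cluster-walk fix above distinguishes a stale or freed inode (skip it) from falling off the end of the cluster (stop the walk), where the old code kept scanning. Cluster membership is a simple mask test; worked here for an assumed 8-inodes-per-cluster geometry:

#include <stdio.h>

int main(void)
{
        unsigned long inodes_per_cluster = 8;           /* assumed geometry */
        unsigned long mask = ~(inodes_per_cluster - 1);
        unsigned long agino = 29;                       /* inode being flushed */
        unsigned long first_index = agino & mask;       /* 24 */

        for (unsigned long i = first_index; i < first_index + 12; i++) {
                if ((i & mask) != first_index) {
                        /* past the cluster: every later hit is too, so
                         * break where the old code would continue */
                        printf("%lu: past cluster, stop\n", i);
                        break;
                }
                printf("%lu: in cluster\n", i);
        }
        return 0;
}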
*/ error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &bp, XBF_TRYLOCK, 0); - if (error || !bp) { + if (error == -EAGAIN) { xfs_ifunlock(ip); return error; } + if (error) + goto corrupt_out; /* * First flush out the inode that xfs_iflush was called with. @@ -3404,7 +3414,8 @@ xfs_iflush( return 0; corrupt_out: - xfs_buf_relse(bp); + if (bp) + xfs_buf_relse(bp); xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); cluster_corrupt_out: error = -EFSCORRUPTED; diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 43e1d51b15eb..8eb78ec4a6e2 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -395,12 +395,8 @@ void xfs_ilock_demote(xfs_inode_t *, uint); int xfs_isilocked(xfs_inode_t *, uint); uint xfs_ilock_data_map_shared(struct xfs_inode *); uint xfs_ilock_attr_map_shared(struct xfs_inode *); -int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, umode_t, - xfs_nlink_t, xfs_dev_t, prid_t, int, - struct xfs_buf **, xfs_inode_t **); uint xfs_ip2xflags(struct xfs_inode *); -uint xfs_dic2xflags(struct xfs_dinode *); int xfs_ifree(struct xfs_trans *, xfs_inode_t *, struct xfs_bmap_free *); int xfs_itruncate_extents(struct xfs_trans **, struct xfs_inode *, @@ -411,7 +407,6 @@ void xfs_iunpin_wait(xfs_inode_t *); #define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount)) int xfs_iflush(struct xfs_inode *, struct xfs_buf **); -void xfs_lock_inodes(xfs_inode_t **, int, uint); void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint); xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip); @@ -419,8 +414,6 @@ xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip); int xfs_dir_ialloc(struct xfs_trans **, struct xfs_inode *, umode_t, xfs_nlink_t, xfs_dev_t, prid_t, int, struct xfs_inode **, int *); -int xfs_droplink(struct xfs_trans *, struct xfs_inode *); -int xfs_bumplink(struct xfs_trans *, struct xfs_inode *); /* from xfs_file.c */ enum xfs_prealloc_flags { @@ -434,12 +427,16 @@ int xfs_update_prealloc_flags(struct xfs_inode *ip, enum xfs_prealloc_flags flags); int xfs_zero_eof(struct xfs_inode *ip, xfs_off_t offset, xfs_fsize_t isize, bool *did_zeroing); -int xfs_iozero(struct xfs_inode *ip, loff_t pos, size_t count); +int xfs_zero_range(struct xfs_inode *ip, xfs_off_t pos, xfs_off_t count, + bool *did_zero); loff_t __xfs_seek_hole_data(struct inode *inode, loff_t start, loff_t eof, int whence); /* from xfs_iops.c */ +extern void xfs_setup_inode(struct xfs_inode *ip); +extern void xfs_setup_iops(struct xfs_inode *ip); + /* * When setting up a newly allocated inode, we need to call * xfs_finish_inode_setup() once the inode is fully instantiated at @@ -447,7 +444,6 @@ loff_t __xfs_seek_hole_data(struct inode *inode, loff_t start, * before we've completed instantiation. Otherwise we can do it * the moment the inode lookup is complete. 
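xfs_iflush() now separates the trylock failure from real errors, as the comment above says: -EAGAIN just leaves the inode dirty for a later pass, while anything else takes the corruption path. A compact model of that triage (the EFSCORRUPTED value is an assumption of this sketch):

#include <errno.h>
#include <stdio.h>

#define EFSCORRUPTED 117        /* assumed; Linux defines it as EUCLEAN */

static int triage(int error)
{
        if (error == -EAGAIN)
                return error;           /* buffer busy: retry later, inode stays dirty */
        if (error)
                return -EFSCORRUPTED;   /* anything else: treat as corruption */
        return 0;                       /* got the buffer, flush proceeds */
}

int main(void)
{
        printf("%d %d %d\n", triage(0), triage(-EAGAIN), triage(-EIO));
        /* 0 -11 -117 with Linux errno values */
        return 0;
}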
*/ -extern void xfs_setup_inode(struct xfs_inode *ip); static inline void xfs_finish_inode_setup(struct xfs_inode *ip) { xfs_iflags_clear(ip, XFS_INEW); @@ -458,6 +454,7 @@ static inline void xfs_finish_inode_setup(struct xfs_inode *ip) static inline void xfs_setup_existing_inode(struct xfs_inode *ip) { xfs_setup_inode(ip); + xfs_setup_iops(ip); xfs_finish_inode_setup(ip); } @@ -476,14 +473,4 @@ do { \ extern struct kmem_zone *xfs_inode_zone; -/* - * Flags for read/write calls - */ -#define XFS_IO_ISDIRECT 0x00001 /* bypass page cache */ -#define XFS_IO_INVIS 0x00002 /* don't update inode timestamps */ - -#define XFS_IO_FLAGS \ - { XFS_IO_ISDIRECT, "DIRECT" }, \ - { XFS_IO_INVIS, "INVIS"} - #endif /* __XFS_INODE_H__ */ diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index c48b5b18d771..892c2aced207 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -210,7 +210,7 @@ xfs_inode_item_format_data_fork( */ data_bytes = roundup(ip->i_df.if_bytes, 4); ASSERT(ip->i_df.if_real_bytes == 0 || - ip->i_df.if_real_bytes == data_bytes); + ip->i_df.if_real_bytes >= data_bytes); ASSERT(ip->i_df.if_u1.if_data != NULL); ASSERT(ip->i_d.di_size > 0); xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_ILOCAL, @@ -305,7 +305,7 @@ xfs_inode_item_format_attr_fork( */ data_bytes = roundup(ip->i_afp->if_bytes, 4); ASSERT(ip->i_afp->if_real_bytes == 0 || - ip->i_afp->if_real_bytes == data_bytes); + ip->i_afp->if_real_bytes >= data_bytes); ASSERT(ip->i_afp->if_u1.if_data != NULL); xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_LOCAL, ip->i_afp->if_u1.if_data, @@ -479,6 +479,8 @@ STATIC uint xfs_inode_item_push( struct xfs_log_item *lip, struct list_head *buffer_list) + __releases(&lip->li_ailp->xa_lock) + __acquires(&lip->li_ailp->xa_lock) { struct xfs_inode_log_item *iip = INODE_ITEM(lip); struct xfs_inode *ip = iip->ili_inode; @@ -649,6 +651,7 @@ void xfs_inode_item_destroy( xfs_inode_t *ip) { + kmem_free(ip->i_itemp->ili_item.li_lv_shadow); kmem_zone_free(xfs_ili_zone, ip->i_itemp); } diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index bcb6c19ce3ea..9a7c87809d3b 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -277,7 +277,6 @@ xfs_readlink_by_handle( { struct dentry *dentry; __u32 olen; - void *link; int error; if (!capable(CAP_SYS_ADMIN)) @@ -288,7 +287,7 @@ xfs_readlink_by_handle( return PTR_ERR(dentry); /* Restrict this handle operation to symlinks only. 
*/ - if (!d_is_symlink(dentry)) { + if (!d_inode(dentry)->i_op->readlink) { error = -EINVAL; goto out_dput; } @@ -298,21 +297,8 @@ xfs_readlink_by_handle( goto out_dput; } - link = kmalloc(MAXPATHLEN+1, GFP_KERNEL); - if (!link) { - error = -ENOMEM; - goto out_dput; - } - - error = xfs_readlink(XFS_I(d_inode(dentry)), link); - if (error) - goto out_kfree; - error = readlink_copy(hreq->ohandle, olen, link); - if (error) - goto out_kfree; + error = d_inode(dentry)->i_op->readlink(dentry, hreq->ohandle, olen); - out_kfree: - kfree(link); out_dput: dput(dentry); return error; @@ -334,12 +320,10 @@ xfs_set_dmattrs( if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; - tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0); - if (error) { - xfs_trans_cancel(tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp); + if (error) return error; - } + xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); @@ -611,13 +595,12 @@ xfs_attrmulti_by_handle( int xfs_ioc_space( - struct xfs_inode *ip, - struct inode *inode, struct file *filp, - int ioflags, unsigned int cmd, xfs_flock64_t *bf) { + struct inode *inode = file_inode(filp); + struct xfs_inode *ip = XFS_I(inode); struct iattr iattr; enum xfs_prealloc_flags flags = 0; uint iolock = XFS_IOLOCK_EXCL; @@ -642,7 +625,7 @@ xfs_ioc_space( if (filp->f_flags & O_DSYNC) flags |= XFS_PREALLOC_SYNC; - if (ioflags & XFS_IO_INVIS) + if (filp->f_mode & FMODE_NOCMTIME) flags |= XFS_PREALLOC_INVISIBLE; error = mnt_want_write_file(filp); @@ -1141,10 +1124,9 @@ xfs_ioctl_setattr_get_trans( if (XFS_FORCED_SHUTDOWN(mp)) goto out_unlock; - tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp); if (error) - goto out_cancel; + return ERR_PTR(error); xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | join_flags); @@ -1481,8 +1463,7 @@ xfs_getbmap_format(void **ap, struct getbmapx *bmv, int *full) STATIC int xfs_ioc_getbmap( - struct xfs_inode *ip, - int ioflags, + struct file *file, unsigned int cmd, void __user *arg) { @@ -1496,10 +1477,10 @@ xfs_ioc_getbmap( return -EINVAL; bmx.bmv_iflags = (cmd == XFS_IOC_GETBMAPA ? BMV_IF_ATTRFORK : 0); - if (ioflags & XFS_IO_INVIS) + if (file->f_mode & FMODE_NOCMTIME) bmx.bmv_iflags |= BMV_IF_NO_DMAPI_READ; - error = xfs_getbmap(ip, &bmx, xfs_getbmap_format, + error = xfs_getbmap(XFS_I(file_inode(file)), &bmx, xfs_getbmap_format, (__force struct getbmap *)arg+1); if (error) return error; @@ -1592,6 +1573,17 @@ xfs_ioc_swapext( goto out_put_tmp_file; } + /* + * We need to ensure that the fds passed in point to XFS inodes + * before we cast and access them as XFS structures as we have no + * control over what the user passes us here. 
+ */ + if (f.file->f_op != &xfs_file_operations || + tmp.file->f_op != &xfs_file_operations) { + error = -EINVAL; + goto out_put_tmp_file; + } + ip = XFS_I(file_inode(f.file)); tip = XFS_I(file_inode(tmp.file)); @@ -1636,12 +1628,8 @@ xfs_file_ioctl( struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; void __user *arg = (void __user *)p; - int ioflags = 0; int error; - if (filp->f_mode & FMODE_NOCMTIME) - ioflags |= XFS_IO_INVIS; - trace_xfs_file_ioctl(ip); switch (cmd) { @@ -1660,7 +1648,7 @@ xfs_file_ioctl( if (copy_from_user(&bf, arg, sizeof(bf))) return -EFAULT; - return xfs_ioc_space(ip, inode, filp, ioflags, cmd, &bf); + return xfs_ioc_space(filp, cmd, &bf); } case XFS_IOC_DIOINFO: { struct dioattr da; @@ -1719,7 +1707,7 @@ xfs_file_ioctl( case XFS_IOC_GETBMAP: case XFS_IOC_GETBMAPA: - return xfs_ioc_getbmap(ip, ioflags, cmd, arg); + return xfs_ioc_getbmap(filp, cmd, arg); case XFS_IOC_GETBMAPX: return xfs_ioc_getbmapx(ip, arg); diff --git a/fs/xfs/xfs_ioctl.h b/fs/xfs/xfs_ioctl.h index 77c02c7900b6..8b52881bfd90 100644 --- a/fs/xfs/xfs_ioctl.h +++ b/fs/xfs/xfs_ioctl.h @@ -20,10 +20,7 @@ extern int xfs_ioc_space( - struct xfs_inode *ip, - struct inode *inode, struct file *filp, - int ioflags, unsigned int cmd, xfs_flock64_t *bf); diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index 1a05d8ae327d..321f57721b92 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -532,12 +532,8 @@ xfs_file_compat_ioctl( struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; void __user *arg = (void __user *)p; - int ioflags = 0; int error; - if (filp->f_mode & FMODE_NOCMTIME) - ioflags |= XFS_IO_INVIS; - trace_xfs_file_compat_ioctl(ip); switch (cmd) { @@ -589,7 +585,7 @@ xfs_file_compat_ioctl( if (xfs_compat_flock64_copyin(&bf, arg)) return -EFAULT; cmd = _NATIVE_IOC(cmd, struct xfs_flock64); - return xfs_ioc_space(ip, inode, filp, ioflags, cmd, &bf); + return xfs_ioc_space(filp, cmd, &bf); } case XFS_IOC_FSGEOMETRY_V1_32: return xfs_compat_ioc_fsgeometry_v1(mp, arg); diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index d81bdc080370..620fc9120444 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -15,6 +15,7 @@ * along with this program; if not, write the Free Software Foundation, * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ +#include <linux/iomap.h> #include "xfs.h" #include "xfs_fs.h" #include "xfs_shared.h" @@ -132,6 +133,7 @@ xfs_iomap_write_direct( int error; int lockmode; int bmapi_flags = XFS_BMAPI_PREALLOC; + uint tflags = 0; rt = XFS_IS_REALTIME_INODE(ip); extsz = xfs_get_extsz_hint(ip); @@ -192,11 +194,6 @@ xfs_iomap_write_direct( return error; /* - * Allocate and setup the transaction - */ - tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); - - /* * For DAX, we do not allocate unwritten extents, but instead we zero * the block before we commit the transaction. Ideally we'd like to do * this outside the transaction context, but if we commit and then crash @@ -209,23 +206,17 @@ xfs_iomap_write_direct( * the reserve block pool for bmbt block allocation if there is no space * left but we need to do unwritten extent conversion. 
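The swapext check above is the standard way to confirm a user-supplied fd really belongs to this filesystem: compare its file_operations pointer against our own before casting the inode. A userspace model of validating by ops-vector identity (toy structures):

#include <errno.h>
#include <stdio.h>

struct ops { const char *name; };
struct toy_file { const struct ops *f_op; };

static const struct ops xfs_ops = { "xfs" };
static const struct ops other_ops = { "other" };

/* Refuse to treat a file as ours unless its ops vector is ours; only
 * then is casting to the containing inode type safe. */
static int check_pair(const struct toy_file *a, const struct toy_file *b)
{
        if (a->f_op != &xfs_ops || b->f_op != &xfs_ops)
                return -EINVAL;
        return 0;
}

int main(void)
{
        struct toy_file f1 = { &xfs_ops }, f2 = { &other_ops };

        printf("%d %d\n", check_pair(&f1, &f1), check_pair(&f1, &f2));  /* 0 -22 */
        return 0;
}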
*/ - if (IS_DAX(VFS_I(ip))) { bmapi_flags = XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO; if (ISUNWRITTEN(imap)) { - tp->t_flags |= XFS_TRANS_RESERVE; + tflags |= XFS_TRANS_RESERVE; resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1; } } - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, - resblks, resrtextents); - /* - * Check for running out of space, note: need lock to return - */ - if (error) { - xfs_trans_cancel(tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, resrtextents, + tflags, &tp); + if (error) return error; - } lockmode = XFS_ILOCK_EXCL; xfs_ilock(ip, lockmode); @@ -726,15 +717,13 @@ xfs_iomap_write_allocate( nimaps = 0; while (nimaps == 0) { - tp = xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE); - tp->t_flags |= XFS_TRANS_RESERVE; nres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, - nres, 0); - if (error) { - xfs_trans_cancel(tp); + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, nres, + 0, XFS_TRANS_RESERVE, &tp); + if (error) return error; - } + xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); @@ -878,25 +867,18 @@ xfs_iomap_write_unwritten( do { /* - * set up a transaction to convert the range of extents + * Set up a transaction to convert the range of extents * from unwritten to real. Do allocations in a loop until * we have covered the range passed in. * - * Note that we open code the transaction allocation here - * to pass KM_NOFS--we can't risk to recursing back into - * the filesystem here as we might be asked to write out - * the same inode that we complete here and might deadlock - * on the iolock. + * Note that we can't risk recursing back into the filesystem + * here as we might be asked to write out the same inode that we + * complete here and might deadlock on the iolock.
*/ - sb_start_intwrite(mp->m_super); - tp = _xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE, KM_NOFS); - tp->t_flags |= XFS_TRANS_RESERVE | XFS_TRANS_FREEZE_PROT; - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, - resblks, 0); - if (error) { - xfs_trans_cancel(tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, + XFS_TRANS_RESERVE | XFS_TRANS_NOFS, &tp); + if (error) return error; - } xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); @@ -959,3 +941,173 @@ error_on_bmapi_transaction: xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } + +void +xfs_bmbt_to_iomap( + struct xfs_inode *ip, + struct iomap *iomap, + struct xfs_bmbt_irec *imap) +{ + struct xfs_mount *mp = ip->i_mount; + + if (imap->br_startblock == HOLESTARTBLOCK) { + iomap->blkno = IOMAP_NULL_BLOCK; + iomap->type = IOMAP_HOLE; + } else if (imap->br_startblock == DELAYSTARTBLOCK) { + iomap->blkno = IOMAP_NULL_BLOCK; + iomap->type = IOMAP_DELALLOC; + } else { + iomap->blkno = xfs_fsb_to_db(ip, imap->br_startblock); + if (imap->br_state == XFS_EXT_UNWRITTEN) + iomap->type = IOMAP_UNWRITTEN; + else + iomap->type = IOMAP_MAPPED; + } + iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff); + iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount); + iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip)); +} + +static inline bool imap_needs_alloc(struct xfs_bmbt_irec *imap, int nimaps) +{ + return !nimaps || + imap->br_startblock == HOLESTARTBLOCK || + imap->br_startblock == DELAYSTARTBLOCK; +} + +static int +xfs_file_iomap_begin( + struct inode *inode, + loff_t offset, + loff_t length, + unsigned flags, + struct iomap *iomap) +{ + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + struct xfs_bmbt_irec imap; + xfs_fileoff_t offset_fsb, end_fsb; + int nimaps = 1, error = 0; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + + ASSERT(offset <= mp->m_super->s_maxbytes); + if ((xfs_fsize_t)offset + length > mp->m_super->s_maxbytes) + length = mp->m_super->s_maxbytes - offset; + offset_fsb = XFS_B_TO_FSBT(mp, offset); + end_fsb = XFS_B_TO_FSB(mp, offset + length); + + error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap, + &nimaps, XFS_BMAPI_ENTIRE); + if (error) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; + } + + if ((flags & IOMAP_WRITE) && imap_needs_alloc(&imap, nimaps)) { + /* + * We cap the maximum length we map here to MAX_WRITEBACK_PAGES + * pages to keep the chunks of work done here somewhat symmetric + * with the work writeback does. This is a completely arbitrary + * number pulled out of thin air as a best guess for initial + * testing. + * + * Note that the value needs to be less than 32 bits wide until + * the lower level functions are updated. + */ + length = min_t(loff_t, length, 1024 * PAGE_SIZE); + if (xfs_get_extsz_hint(ip)) { + /* + * xfs_iomap_write_direct() expects the shared lock. It + * is unlocked on return.
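xfs_bmbt_to_iomap() above folds the extent states into the generic iomap types by testing the sentinel start-block values. A stand-alone sketch of that classification (sentinel and type values invented for illustration):

#include <stdio.h>

enum toy_iomap_type { T_HOLE, T_DELALLOC, T_UNWRITTEN, T_MAPPED };

#define HOLESTART       (-1L)   /* stand-in for HOLESTARTBLOCK */
#define DELAYSTART      (-2L)   /* stand-in for DELAYSTARTBLOCK */

static enum toy_iomap_type classify(long startblock, int unwritten)
{
        if (startblock == HOLESTART)
                return T_HOLE;
        if (startblock == DELAYSTART)
                return T_DELALLOC;
        return unwritten ? T_UNWRITTEN : T_MAPPED;
}

int main(void)
{
        printf("%d %d %d %d\n",
               classify(HOLESTART, 0), classify(DELAYSTART, 0),
               classify(1234, 1), classify(1234, 0));   /* 0 1 2 3 */
        return 0;
}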
+ */ + xfs_ilock_demote(ip, XFS_ILOCK_EXCL); + error = xfs_iomap_write_direct(ip, offset, length, &imap, + nimaps); + } else { + error = xfs_iomap_write_delay(ip, offset, length, &imap); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + } + + if (error) + return error; + + trace_xfs_iomap_alloc(ip, offset, length, 0, &imap); + xfs_bmbt_to_iomap(ip, iomap, &imap); + } else if (nimaps) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + trace_xfs_iomap_found(ip, offset, length, 0, &imap); + xfs_bmbt_to_iomap(ip, iomap, &imap); + } else { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + trace_xfs_iomap_not_found(ip, offset, length, 0, &imap); + iomap->blkno = IOMAP_NULL_BLOCK; + iomap->type = IOMAP_HOLE; + iomap->offset = offset; + iomap->length = length; + } + + return 0; +} + +static int +xfs_file_iomap_end_delalloc( + struct xfs_inode *ip, + loff_t offset, + loff_t length, + ssize_t written) +{ + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t start_fsb; + xfs_fileoff_t end_fsb; + int error = 0; + + start_fsb = XFS_B_TO_FSB(mp, offset + written); + end_fsb = XFS_B_TO_FSB(mp, offset + length); + + /* + * Trim back delalloc blocks if we didn't manage to write the whole + * range reserved. + * + * We don't need to care about racing delalloc as we hold i_mutex + * across the reserve/allocate/unreserve calls. If there are delalloc + * blocks in the range, they are ours. + */ + if (start_fsb < end_fsb) { + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_bmap_punch_delalloc_range(ip, start_fsb, + end_fsb - start_fsb); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + + if (error && !XFS_FORCED_SHUTDOWN(mp)) { + xfs_alert(mp, "%s: unable to clean up ino %lld", + __func__, ip->i_ino); + return error; + } + } + + return 0; +} + +static int +xfs_file_iomap_end( + struct inode *inode, + loff_t offset, + loff_t length, + ssize_t written, + unsigned flags, + struct iomap *iomap) +{ + if ((flags & IOMAP_WRITE) && iomap->type == IOMAP_DELALLOC) + return xfs_file_iomap_end_delalloc(XFS_I(inode), offset, + length, written); + return 0; +} + +struct iomap_ops xfs_iomap_ops = { + .iomap_begin = xfs_file_iomap_begin, + .iomap_end = xfs_file_iomap_end, +}; diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h index 8688e663d744..e066d045e2ff 100644 --- a/fs/xfs/xfs_iomap.h +++ b/fs/xfs/xfs_iomap.h @@ -18,6 +18,8 @@ #ifndef __XFS_IOMAP_H__ #define __XFS_IOMAP_H__ +#include <linux/iomap.h> + struct xfs_inode; struct xfs_bmbt_irec; @@ -29,4 +31,9 @@ int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, struct xfs_bmbt_irec *); int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t); +void xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *, + struct xfs_bmbt_irec *); + +extern struct iomap_ops xfs_iomap_ops; + #endif /* __XFS_IOMAP_H__*/ diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index fb7dc61f4a29..ab820f84ed50 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -38,12 +38,13 @@ #include "xfs_dir2.h" #include "xfs_trans_space.h" #include "xfs_pnfs.h" +#include "xfs_iomap.h" #include <linux/capability.h> #include <linux/xattr.h> #include <linux/posix_acl.h> #include <linux/security.h> -#include <linux/fiemap.h> +#include <linux/iomap.h> #include <linux/slab.h> /* @@ -181,6 +182,8 @@ xfs_generic_create( } #endif + xfs_setup_iops(ip); + if (tmpfile) d_tmpfile(dentry, inode); else @@ -368,6 +371,8 @@ xfs_vn_symlink( if (unlikely(error)) goto out_cleanup_inode; + xfs_setup_iops(cip); + d_instantiate(dentry, inode); xfs_finish_inode_setup(cip); return 0; @@ -442,6 +447,16 @@ xfs_vn_get_link( return ERR_PTR(error); } +STATIC const 
char * +xfs_vn_get_link_inline( + struct dentry *dentry, + struct inode *inode, + struct delayed_call *done) +{ + ASSERT(XFS_I(inode)->i_df.if_flags & XFS_IFINLINE); + return XFS_I(inode)->i_df.if_u1.if_data; +} + STATIC int xfs_vn_getattr( struct vfsmount *mnt, @@ -599,12 +614,12 @@ xfs_setattr_nonsize( return error; } - tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp); if (error) - goto out_trans_cancel; + goto out_dqrele; xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); /* * Change file ownership. Must be the owner or privileged. @@ -633,12 +648,10 @@ xfs_setattr_nonsize( NULL, capable(CAP_FOWNER) ? XFS_QMOPT_FORCE_RES : 0); if (error) /* out of quota */ - goto out_unlock; + goto out_cancel; } } - xfs_trans_ijoin(tp, ip, 0); - /* * Change file ownership. Must be the owner or privileged. */ @@ -722,10 +735,9 @@ xfs_setattr_nonsize( return 0; -out_unlock: - xfs_iunlock(ip, XFS_ILOCK_EXCL); -out_trans_cancel: +out_cancel: xfs_trans_cancel(tp); +out_dqrele: xfs_qm_dqrele(udqp); xfs_qm_dqrele(gdqp); return error; @@ -790,20 +802,30 @@ xfs_setattr_size( return error; /* + * Wait for all direct I/O to complete. + */ + inode_dio_wait(inode); + + /* * File data changes must be complete before we start the transaction to * modify the inode. This needs to be done before joining the inode to * the transaction because the inode cannot be unlocked once it is a * part of the transaction. * - * Start with zeroing any data block beyond EOF that we may expose on - * file extension. + * Start with zeroing any data beyond EOF that we may expose on file + * extension, or zeroing out the rest of the block on a downward + * truncate. */ if (newsize > oldsize) { error = xfs_zero_eof(ip, newsize, oldsize, &did_zeroing); - if (error) - return error; + } else { + error = iomap_truncate_page(inode, newsize, &did_zeroing, + &xfs_iomap_ops); } + if (error) + return error; + /* * We are going to log the inode size change in this transaction so * any previous writes that are beyond the on disk EOF and the new @@ -812,17 +834,14 @@ xfs_setattr_size( * problem. Note that this includes any block zeroing we did above; * otherwise those blocks may not be zeroed after a crash. */ - if (newsize > ip->i_d.di_size && - (oldsize != ip->i_d.di_size || did_zeroing)) { + if (did_zeroing || + (newsize > ip->i_d.di_size && oldsize != ip->i_d.di_size)) { error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, ip->i_d.di_size, newsize); if (error) return error; } - /* Now wait for all direct I/O to complete. */ - inode_dio_wait(inode); - /* * We've already locked out new page faults, so now we can safely remove * pages from the page cache knowing they won't get refaulted until we @@ -834,25 +853,17 @@ xfs_setattr_size( * We have to do all the page cache truncate work outside the * transaction context as the "lock" order is page lock->log space * reservation as defined by extent allocation in the writeback path. - * Hence a truncate can fail with ENOMEM from xfs_trans_reserve(), but + * Hence a truncate can fail with ENOMEM from xfs_trans_alloc(), but * having already truncated the in-memory version of the file (i.e. made * user visible changes). There's not much we can do about this, except * to hope that the caller sees ENOMEM and retries the truncate * operation. 
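The reordered xfs_setattr_size() predicate above flushes whenever zeroing happened, not only on extension past the on-disk EOF, so a zeroed partial block can't be lost across a crash. A truth-table check of the new condition (names shortened for the sketch):

#include <stdio.h>

static int must_flush(int did_zeroing, long newsize, long oldsize, long disize)
{
        return did_zeroing || (newsize > disize && oldsize != disize);
}

int main(void)
{
        /* truncate down with partial-block zeroing: now flushed */
        printf("%d\n", must_flush(1, 1000, 5000, 5000));        /* 1 */
        /* extend past the on-disk EOF without zeroing */
        printf("%d\n", must_flush(0, 9000, 4096, 4000));        /* 1 */
        /* no zeroing and no size change past the on-disk EOF */
        printf("%d\n", must_flush(0, 3000, 5000, 5000));        /* 0 */
        return 0;
}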
*/ - if (IS_DAX(inode)) - error = dax_truncate_page(inode, newsize, xfs_get_blocks_direct); - else - error = block_truncate_page(inode->i_mapping, newsize, - xfs_get_blocks); - if (error) - return error; truncate_setsize(inode, newsize); - tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); if (error) - goto out_trans_cancel; + return error; lock_flags |= XFS_ILOCK_EXCL; xfs_ilock(ip, XFS_ILOCK_EXCL); @@ -971,12 +982,9 @@ xfs_vn_update_time( trace_xfs_update_time(ip); - tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0); - if (error) { - xfs_trans_cancel(tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp); + if (error) return error; - } xfs_ilock(ip, XFS_ILOCK_EXCL); if (flags & S_CTIME) @@ -991,51 +999,6 @@ xfs_vn_update_time( return xfs_trans_commit(tp); } -#define XFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) - -/* - * Call fiemap helper to fill in user data. - * Returns positive errors to xfs_getbmap. - */ -STATIC int -xfs_fiemap_format( - void **arg, - struct getbmapx *bmv, - int *full) -{ - int error; - struct fiemap_extent_info *fieinfo = *arg; - u32 fiemap_flags = 0; - u64 logical, physical, length; - - /* Do nothing for a hole */ - if (bmv->bmv_block == -1LL) - return 0; - - logical = BBTOB(bmv->bmv_offset); - physical = BBTOB(bmv->bmv_block); - length = BBTOB(bmv->bmv_length); - - if (bmv->bmv_oflags & BMV_OF_PREALLOC) - fiemap_flags |= FIEMAP_EXTENT_UNWRITTEN; - else if (bmv->bmv_oflags & BMV_OF_DELALLOC) { - fiemap_flags |= (FIEMAP_EXTENT_DELALLOC | - FIEMAP_EXTENT_UNKNOWN); - physical = 0; /* no block yet */ - } - if (bmv->bmv_oflags & BMV_OF_LAST) - fiemap_flags |= FIEMAP_EXTENT_LAST; - - error = fiemap_fill_next_extent(fieinfo, logical, physical, - length, fiemap_flags); - if (error > 0) { - error = 0; - *full = 1; /* user array now full */ - } - - return error; -} - STATIC int xfs_vn_fiemap( struct inode *inode, @@ -1043,38 +1006,13 @@ xfs_vn_fiemap( u64 start, u64 length) { - xfs_inode_t *ip = XFS_I(inode); - struct getbmapx bm; int error; - error = fiemap_check_flags(fieinfo, XFS_FIEMAP_FLAGS); - if (error) - return error; - - /* Set up bmap header for xfs internal routine */ - bm.bmv_offset = BTOBBT(start); - /* Special case for whole file */ - if (length == FIEMAP_MAX_OFFSET) - bm.bmv_length = -1LL; - else - bm.bmv_length = BTOBB(start + length) - bm.bmv_offset; - - /* We add one because in getbmap world count includes the header */ - bm.bmv_count = !fieinfo->fi_extents_max ? 
MAXEXTNUM : - fieinfo->fi_extents_max + 1; - bm.bmv_count = min_t(__s32, bm.bmv_count, - (PAGE_SIZE * 16 / sizeof(struct getbmapx))); - bm.bmv_iflags = BMV_IF_PREALLOC | BMV_IF_NO_HOLES; - if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) - bm.bmv_iflags |= BMV_IF_ATTRFORK; - if (!(fieinfo->fi_flags & FIEMAP_FLAG_SYNC)) - bm.bmv_iflags |= BMV_IF_DELALLOC; - - error = xfs_getbmap(ip, &bm, xfs_fiemap_format, fieinfo); - if (error) - return error; + xfs_ilock(XFS_I(inode), XFS_IOLOCK_SHARED); + error = iomap_fiemap(inode, fieinfo, start, length, &xfs_iomap_ops); + xfs_iunlock(XFS_I(inode), XFS_IOLOCK_SHARED); - return 0; + return error; } STATIC int @@ -1167,6 +1105,18 @@ static const struct inode_operations xfs_symlink_inode_operations = { .update_time = xfs_vn_update_time, }; +static const struct inode_operations xfs_inline_symlink_inode_operations = { + .readlink = generic_readlink, + .get_link = xfs_vn_get_link_inline, + .getattr = xfs_vn_getattr, + .setattr = xfs_vn_setattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .removexattr = generic_removexattr, + .listxattr = xfs_vn_listxattr, + .update_time = xfs_vn_update_time, +}; + STATIC void xfs_diflags_to_iflags( struct inode *inode, @@ -1193,7 +1143,7 @@ xfs_diflags_to_iflags( } /* - * Initialize the Linux inode and set up the operation vectors. + * Initialize the Linux inode. * * When reading existing inodes from disk this is called directly from xfs_iget, * when creating a new inode it is called from xfs_ialloc after setting up the @@ -1232,32 +1182,12 @@ xfs_setup_inode( i_size_write(inode, ip->i_d.di_size); xfs_diflags_to_iflags(inode, ip); - ip->d_ops = ip->i_mount->m_nondir_inode_ops; - lockdep_set_class(&ip->i_lock.mr_lock, &xfs_nondir_ilock_class); - switch (inode->i_mode & S_IFMT) { - case S_IFREG: - inode->i_op = &xfs_inode_operations; - inode->i_fop = &xfs_file_operations; - inode->i_mapping->a_ops = &xfs_address_space_operations; - break; - case S_IFDIR: + if (S_ISDIR(inode->i_mode)) { lockdep_set_class(&ip->i_lock.mr_lock, &xfs_dir_ilock_class); - if (xfs_sb_version_hasasciici(&XFS_M(inode->i_sb)->m_sb)) - inode->i_op = &xfs_dir_ci_inode_operations; - else - inode->i_op = &xfs_dir_inode_operations; - inode->i_fop = &xfs_dir_file_operations; ip->d_ops = ip->i_mount->m_dir_inode_ops; - break; - case S_IFLNK: - inode->i_op = &xfs_symlink_inode_operations; - if (!(ip->i_df.if_flags & XFS_IFINLINE)) - inode->i_mapping->a_ops = &xfs_address_space_operations; - break; - default: - inode->i_op = &xfs_inode_operations; - init_special_inode(inode, inode->i_mode, inode->i_rdev); - break; + } else { + ip->d_ops = ip->i_mount->m_nondir_inode_ops; + lockdep_set_class(&ip->i_lock.mr_lock, &xfs_nondir_ilock_class); } /* @@ -1277,3 +1207,35 @@ xfs_setup_inode( cache_no_acl(inode); } } + +void +xfs_setup_iops( + struct xfs_inode *ip) +{ + struct inode *inode = &ip->i_vnode; + + switch (inode->i_mode & S_IFMT) { + case S_IFREG: + inode->i_op = &xfs_inode_operations; + inode->i_fop = &xfs_file_operations; + inode->i_mapping->a_ops = &xfs_address_space_operations; + break; + case S_IFDIR: + if (xfs_sb_version_hasasciici(&XFS_M(inode->i_sb)->m_sb)) + inode->i_op = &xfs_dir_ci_inode_operations; + else + inode->i_op = &xfs_dir_inode_operations; + inode->i_fop = &xfs_dir_file_operations; + break; + case S_IFLNK: + if (ip->i_df.if_flags & XFS_IFINLINE) + inode->i_op = &xfs_inline_symlink_inode_operations; + else + inode->i_op = &xfs_symlink_inode_operations; + break; + default: + inode->i_op = &xfs_inode_operations; + 
init_special_inode(inode, inode->i_mode, inode->i_rdev); + break; + } +} diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index a8192dc797dc..b8d64d520e12 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -328,13 +328,6 @@ static inline __uint64_t howmany_64(__uint64_t x, __uint32_t y) return x; } -/* ARM old ABI has some weird alignment/padding */ -#if defined(__arm__) && !defined(__ARM_EABI__) -#define __arch_pack __attribute__((packed)) -#else -#define __arch_pack -#endif - #define ASSERT_ALWAYS(expr) \ (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__)) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index b49ccf5c1d75..3b74fa011bb1 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -435,8 +435,7 @@ xfs_log_reserve( int cnt, struct xlog_ticket **ticp, __uint8_t client, - bool permanent, - uint t_type) + bool permanent) { struct xlog *log = mp->m_log; struct xlog_ticket *tic; @@ -456,7 +455,6 @@ xfs_log_reserve( if (!tic) return -ENOMEM; - tic->t_trans_type = t_type; *ticp = tic; xlog_grant_push_ail(log, tic->t_cnt ? tic->t_unit_res * tic->t_cnt @@ -790,7 +788,7 @@ xfs_log_mount_cancel( * As far as I know, there weren't any dependencies on the old behaviour. */ -int +static int xfs_log_unmount_write(xfs_mount_t *mp) { struct xlog *log = mp->m_log; @@ -823,8 +821,7 @@ xfs_log_unmount_write(xfs_mount_t *mp) } while (iclog != first_iclog); #endif if (! (XLOG_FORCED_SHUTDOWN(log))) { - error = xfs_log_reserve(mp, 600, 1, &tic, - XFS_LOG, 0, XLOG_UNMOUNT_REC_TYPE); + error = xfs_log_reserve(mp, 600, 1, &tic, XFS_LOG, 0); if (!error) { /* the data section must be 32 bit size aligned */ struct { @@ -1039,7 +1036,7 @@ xfs_log_space_wake( * there's no point in running a dummy transaction at this point because we * can't start trying to idle the log until both the CIL and AIL are empty. */ -int +static int xfs_log_need_covered(xfs_mount_t *mp) { struct xlog *log = mp->m_log; @@ -1180,7 +1177,7 @@ xlog_space_left( * The log manager needs its own routine, in order to control what * happens with the buffer after the write completes. */ -void +static void xlog_iodone(xfs_buf_t *bp) { struct xlog_in_core *iclog = bp->b_fspriv; @@ -1305,7 +1302,7 @@ xfs_log_work_queue( * disk. If there is nothing dirty, then we might need to cover the log to * indicate that the filesystem is idle. 
*/ -void +static void xfs_log_worker( struct work_struct *work) { @@ -1418,7 +1415,7 @@ xlog_alloc_log( */ error = -ENOMEM; bp = xfs_buf_alloc(mp->m_logdev_targp, XFS_BUF_DADDR_NULL, - BTOBB(log->l_iclog_size), 0); + BTOBB(log->l_iclog_size), XBF_NO_IOACCT); if (!bp) goto out_free_log; @@ -1457,7 +1454,8 @@ xlog_alloc_log( prev_iclog = iclog; bp = xfs_buf_get_uncached(mp->m_logdev_targp, - BTOBB(log->l_iclog_size), 0); + BTOBB(log->l_iclog_size), + XBF_NO_IOACCT); if (!bp) goto out_free_iclog; @@ -2032,58 +2030,8 @@ xlog_print_tic_res( REG_TYPE_STR(ICREATE, "inode create") }; #undef REG_TYPE_STR -#define TRANS_TYPE_STR(type) [XFS_TRANS_##type] = #type - static char *trans_type_str[XFS_TRANS_TYPE_MAX] = { - TRANS_TYPE_STR(SETATTR_NOT_SIZE), - TRANS_TYPE_STR(SETATTR_SIZE), - TRANS_TYPE_STR(INACTIVE), - TRANS_TYPE_STR(CREATE), - TRANS_TYPE_STR(CREATE_TRUNC), - TRANS_TYPE_STR(TRUNCATE_FILE), - TRANS_TYPE_STR(REMOVE), - TRANS_TYPE_STR(LINK), - TRANS_TYPE_STR(RENAME), - TRANS_TYPE_STR(MKDIR), - TRANS_TYPE_STR(RMDIR), - TRANS_TYPE_STR(SYMLINK), - TRANS_TYPE_STR(SET_DMATTRS), - TRANS_TYPE_STR(GROWFS), - TRANS_TYPE_STR(STRAT_WRITE), - TRANS_TYPE_STR(DIOSTRAT), - TRANS_TYPE_STR(WRITEID), - TRANS_TYPE_STR(ADDAFORK), - TRANS_TYPE_STR(ATTRINVAL), - TRANS_TYPE_STR(ATRUNCATE), - TRANS_TYPE_STR(ATTR_SET), - TRANS_TYPE_STR(ATTR_RM), - TRANS_TYPE_STR(ATTR_FLAG), - TRANS_TYPE_STR(CLEAR_AGI_BUCKET), - TRANS_TYPE_STR(SB_CHANGE), - TRANS_TYPE_STR(DUMMY1), - TRANS_TYPE_STR(DUMMY2), - TRANS_TYPE_STR(QM_QUOTAOFF), - TRANS_TYPE_STR(QM_DQALLOC), - TRANS_TYPE_STR(QM_SETQLIM), - TRANS_TYPE_STR(QM_DQCLUSTER), - TRANS_TYPE_STR(QM_QINOCREATE), - TRANS_TYPE_STR(QM_QUOTAOFF_END), - TRANS_TYPE_STR(FSYNC_TS), - TRANS_TYPE_STR(GROWFSRT_ALLOC), - TRANS_TYPE_STR(GROWFSRT_ZERO), - TRANS_TYPE_STR(GROWFSRT_FREE), - TRANS_TYPE_STR(SWAPEXT), - TRANS_TYPE_STR(CHECKPOINT), - TRANS_TYPE_STR(ICREATE), - TRANS_TYPE_STR(CREATE_TMPFILE) - }; -#undef TRANS_TYPE_STR xfs_warn(mp, "xlog_write: reservation summary:"); - xfs_warn(mp, " trans type = %s (%u)", - ((ticket->t_trans_type <= 0 || - ticket->t_trans_type > XFS_TRANS_TYPE_MAX) ? 
- "bad-trans-type" : trans_type_str[ticket->t_trans_type]), - ticket->t_trans_type); xfs_warn(mp, " unit res = %d bytes", ticket->t_unit_res); xfs_warn(mp, " current res = %d bytes", @@ -3378,7 +3326,7 @@ xfs_log_force( { int error; - trace_xfs_log_force(mp, 0); + trace_xfs_log_force(mp, 0, _RET_IP_); error = _xfs_log_force(mp, flags, NULL); if (error) xfs_warn(mp, "%s: error %d returned.", __func__, error); @@ -3527,7 +3475,7 @@ xfs_log_force_lsn( { int error; - trace_xfs_log_force(mp, lsn); + trace_xfs_log_force(mp, lsn, _RET_IP_); error = _xfs_log_force_lsn(mp, lsn, flags, NULL); if (error) xfs_warn(mp, "%s: error %d returned.", __func__, error); @@ -3709,7 +3657,6 @@ xlog_ticket_alloc( tic->t_tid = prandom_u32(); tic->t_clientid = client; tic->t_flags = XLOG_TIC_INITED; - tic->t_trans_type = 0; if (permanent) tic->t_flags |= XLOG_TIC_PERM_RESERV; diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index aa533a7d50f2..b5e71072fde5 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -161,15 +161,10 @@ int xfs_log_reserve(struct xfs_mount *mp, int count, struct xlog_ticket **ticket, __uint8_t clientid, - bool permanent, - uint t_type); + bool permanent); int xfs_log_regrant(struct xfs_mount *mp, struct xlog_ticket *tic); -int xfs_log_unmount_write(struct xfs_mount *mp); void xfs_log_unmount(struct xfs_mount *mp); int xfs_log_force_umount(struct xfs_mount *mp, int logerror); -int xfs_log_need_covered(struct xfs_mount *mp); - -void xlog_iodone(struct xfs_buf *); struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket); void xfs_log_ticket_put(struct xlog_ticket *ticket); @@ -179,7 +174,6 @@ void xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp, bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip); void xfs_log_work_queue(struct xfs_mount *mp); -void xfs_log_worker(struct work_struct *work); void xfs_log_quiesce(struct xfs_mount *mp); bool xfs_log_check_lsn(struct xfs_mount *, xfs_lsn_t); diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 4e7649351f5a..a4ab192e1792 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -51,7 +51,6 @@ xlog_cil_ticket_alloc( tic = xlog_ticket_alloc(log, 0, 1, XFS_TRANSACTION, 0, KM_SLEEP|KM_NOFS); - tic->t_trans_type = XFS_TRANS_CHECKPOINT; /* * set the current reservation to zero so we know to steal the basic @@ -79,6 +78,157 @@ xlog_cil_init_post_recovery( log->l_cilp->xc_ctx->sequence = 1; } +static inline int +xlog_cil_iovec_space( + uint niovecs) +{ + return round_up((sizeof(struct xfs_log_vec) + + niovecs * sizeof(struct xfs_log_iovec)), + sizeof(uint64_t)); +} + +/* + * Allocate or pin log vector buffers for CIL insertion. + * + * The CIL currently uses disposable buffers for copying a snapshot of the + * modified items into the log during a push. The biggest problem with this is + * the requirement to allocate the disposable buffer during the commit if: + * a) it does not exist; or + * b) it is too small + * + * If we do this allocation within xlog_cil_insert_format_items(), it is done + * under the xc_ctx_lock, which means that a CIL push cannot occur during + * the memory allocation. This means that we have a potential deadlock situation + * under low memory conditions when we have lots of dirty metadata pinned in + * the CIL and we need a CIL commit to occur to free memory. + * + * To avoid this, we need to move the memory allocation outside the + * xc_ctx_lock, but because the log vector buffers are disposable, that opens + * up a TOCTOU race condition w.r.t.
the CIL committing and removing the log + vector buffers between the check and the formatting of the item into the + log vector buffer within the xc_ctx_lock. + * + * Because the log vector buffer needs to be unchanged during the CIL push + * process, we cannot share the buffer between the transaction commit (which + * modifies the buffer) and the CIL push context that is writing the changes + * into the log. This means skipping preallocation of buffer space is + * unreliable, but we most definitely do not want to be allocating and freeing + * buffers unnecessarily during commits when overwrites can be done safely. + * + * The simplest solution to this problem is to allocate a shadow buffer when a + * log item is committed for the second time, and then to only use this buffer + * if necessary. The buffer can remain attached to the log item until such time + * it is needed, and this is the buffer that is reallocated to match the size of + * the incoming modification. Then during the formatting of the item we can swap + * the active buffer with the new one if we can't reuse the existing buffer. We + * don't free the old buffer as it may be reused on the next modification if + * its size is right, otherwise we'll free and reallocate it at that point. + * + * This function builds a vector for the changes in each log item in the + * transaction. It then works out the length of the buffer needed for each log + * item, allocates them and attaches the vector to the log item in preparation + * for the formatting step which occurs under the xc_ctx_lock. + * + * While this means the memory footprint goes up, it avoids the repeated + * alloc/free pattern that repeated modifications of an item would otherwise + * cause, and hence minimises the CPU overhead of such behaviour. + */ +static void +xlog_cil_alloc_shadow_bufs( + struct xlog *log, + struct xfs_trans *tp) +{ + struct xfs_log_item_desc *lidp; + + list_for_each_entry(lidp, &tp->t_items, lid_trans) { + struct xfs_log_item *lip = lidp->lid_item; + struct xfs_log_vec *lv; + int niovecs = 0; + int nbytes = 0; + int buf_size; + bool ordered = false; + + /* Skip items which aren't dirty in this transaction. */ + if (!(lidp->lid_flags & XFS_LID_DIRTY)) + continue; + + /* get number of vecs and size of data to be stored */ + lip->li_ops->iop_size(lip, &niovecs, &nbytes); + + /* + * Ordered items need to be tracked but we do not wish to write + * them. We need a logvec to track the object, but we do not + * need an iovec or buffer to be allocated for copying data. + */ + if (niovecs == XFS_LOG_VEC_ORDERED) { + ordered = true; + niovecs = 0; + nbytes = 0; + } + + /* + * We 64-bit align the length of each iovec so that the start + * of the next one is naturally aligned. We'll need to + * account for that slack space here. Then round nbytes up + * to 64-bit alignment so that the initial buffer alignment is + * easy to calculate and verify. + */ + nbytes += niovecs * sizeof(uint64_t); + nbytes = round_up(nbytes, sizeof(uint64_t)); + + /* + * The data buffer needs to start 64-bit aligned, so round up + * that space to ensure we can align it appropriately and not + * overrun the buffer. + */ + buf_size = nbytes + xlog_cil_iovec_space(niovecs); + + /* + * if we have no shadow buffer, or it is too small, we need to + * reallocate it. + */ + if (!lip->li_lv_shadow || + buf_size > lip->li_lv_shadow->lv_size) { + + /* + * We free and allocate here as a realloc would copy + * unnecessary data.
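The sizing rules in the comment above reduce to a little alignment arithmetic. A minimal userspace sketch, where 24 and 16 are illustrative stand-ins for sizeof(struct xfs_log_vec) and sizeof(struct xfs_log_iovec):

#include <stdint.h>
#include <stdio.h>

/* Power-of-two round-up, mirroring the kernel's round_up(). */
#define ROUND_UP(x, a)	(((uint64_t)(x) + (a) - 1) & ~((uint64_t)(a) - 1))

/* Vector header plus iovec array, padded to 64-bit alignment;
 * models xlog_cil_iovec_space(). */
static uint64_t iovec_space(unsigned niovecs, uint64_t hdr, uint64_t iov)
{
	return ROUND_UP(hdr + niovecs * iov, sizeof(uint64_t));
}

int main(void)
{
	unsigned niovecs = 3;	/* as reported by ->iop_size() */
	uint64_t nbytes = 100;	/* as reported by ->iop_size() */

	nbytes += niovecs * sizeof(uint64_t);	/* per-iovec padding slack */
	nbytes = ROUND_UP(nbytes, sizeof(uint64_t));

	printf("buf_size = %llu\n", (unsigned long long)
	       (nbytes + iovec_space(niovecs, 24, 16)));
	return 0;
}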
We don't use kmem_zalloc() for the + * same reason - we don't need to zero the data area in + * the buffer, only the log vector header and the iovec + * storage. + */ + kmem_free(lip->li_lv_shadow); + + lv = kmem_alloc(buf_size, KM_SLEEP|KM_NOFS); + memset(lv, 0, xlog_cil_iovec_space(niovecs)); + + lv->lv_item = lip; + lv->lv_size = buf_size; + if (ordered) + lv->lv_buf_len = XFS_LOG_VEC_ORDERED; + else + lv->lv_iovecp = (struct xfs_log_iovec *)&lv[1]; + lip->li_lv_shadow = lv; + } else { + /* same or smaller, optimise common overwrite case */ + lv = lip->li_lv_shadow; + if (ordered) + lv->lv_buf_len = XFS_LOG_VEC_ORDERED; + else + lv->lv_buf_len = 0; + lv->lv_bytes = 0; + lv->lv_next = NULL; + } + + /* Ensure the lv is set up according to ->iop_size */ + lv->lv_niovecs = niovecs; + + /* The allocated data region lies beyond the iovec region */ + lv->lv_buf = (char *)lv + xlog_cil_iovec_space(niovecs); + } + +} + /* * Prepare the log item for insertion into the CIL. Calculate the difference in * log space and vectors it will consume, and if it is a new item pin it as @@ -101,16 +251,19 @@ xfs_cil_prepare_item( /* * If there is no old LV, this is the first time we've seen the item in * this CIL context and so we need to pin it. If we are replacing the - * old_lv, then remove the space it accounts for and free it. + * old_lv, then remove the space it accounts for and make it the shadow + * buffer for later freeing. In both cases we are now switching to the + * shadow buffer, so update the pointer to it appropriately. */ - if (!old_lv) + if (!old_lv) { lv->lv_item->li_ops->iop_pin(lv->lv_item); - else if (old_lv != lv) { + lv->lv_item->li_lv_shadow = NULL; + } else if (old_lv != lv) { ASSERT(lv->lv_buf_len != XFS_LOG_VEC_ORDERED); *diff_len -= old_lv->lv_bytes; *diff_iovecs -= old_lv->lv_niovecs; - kmem_free(old_lv); + lv->lv_item->li_lv_shadow = old_lv; } /* attach new log vector to log item */ @@ -134,11 +287,13 @@ xfs_cil_prepare_item( * write it out asynchronously without needing to relock the object that was * modified at the time it gets written into the iclog. * - * This function builds a vector for the changes in each log item in the - * transaction. It then works out the length of the buffer needed for each log - * item, allocates them and formats the vector for the item into the buffer. - * The buffer is then attached to the log item are then inserted into the - * Committed Item List for tracking until the next checkpoint is written out. + * This function takes the prepared log vectors attached to each log item, and + * formats the changes into the log vector buffer. The buffer it uses is + * dependent on the current state of the vector in the CIL - the shadow lv is + * guaranteed to be large enough for the current modification, but we will only + * use that if we can't reuse the existing lv. If we can't reuse the existing + * lv, then simply swap it out for the shadow lv. We don't free it - that is + * done lazily either by the next modification or the freeing of the log item. * * We don't set up region headers during this process; we simply copy the * regions into the flat buffer.
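The reuse-or-swap rule described above can be modelled as a plain two-buffer exchange; a simplified sketch only (the real code also handles item pinning, ordered vectors, and the byte/iovec accounting):

#include <stddef.h>

struct lv { size_t size; };
struct item { struct lv *active; struct lv *shadow; };

/* Reuse the active buffer when the new format fits; otherwise promote
 * the shadow and keep the old active buffer as the next shadow.
 * Nothing is freed here - stale buffers are reclaimed lazily. */
static struct lv *pick_target(struct item *ip, size_t need)
{
	struct lv *old;

	if (ip->active && need <= ip->active->size)
		return ip->active;	/* common overwrite case */

	old = ip->active;
	ip->active = ip->shadow;
	ip->shadow = old;
	return ip->active;
}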
We can do this because we still have to do a @@ -171,59 +326,29 @@ xlog_cil_insert_format_items( list_for_each_entry(lidp, &tp->t_items, lid_trans) { struct xfs_log_item *lip = lidp->lid_item; struct xfs_log_vec *lv; - struct xfs_log_vec *old_lv; - int niovecs = 0; - int nbytes = 0; - int buf_size; + struct xfs_log_vec *old_lv = NULL; + struct xfs_log_vec *shadow; bool ordered = false; /* Skip items which aren't dirty in this transaction. */ if (!(lidp->lid_flags & XFS_LID_DIRTY)) continue; - /* get number of vecs and size of data to be stored */ - lip->li_ops->iop_size(lip, &niovecs, &nbytes); - - /* Skip items that do not have any vectors for writing */ - if (!niovecs) - continue; - /* - * Ordered items need to be tracked but we do not wish to write - * them. We need a logvec to track the object, but we do not - * need an iovec or buffer to be allocated for copying data. + * The formatting size information is already attached to + * the shadow lv on the log item. */ - if (niovecs == XFS_LOG_VEC_ORDERED) { + shadow = lip->li_lv_shadow; + if (shadow->lv_buf_len == XFS_LOG_VEC_ORDERED) ordered = true; - niovecs = 0; - nbytes = 0; - } - /* - * We 64-bit align the length of each iovec so that the start - * of the next one is naturally aligned. We'll need to - * account for that slack space here. Then round nbytes up - * to 64-bit alignment so that the initial buffer alignment is - * easy to calculate and verify. - */ - nbytes += niovecs * sizeof(uint64_t); - nbytes = round_up(nbytes, sizeof(uint64_t)); - - /* grab the old item if it exists for reservation accounting */ - old_lv = lip->li_lv; - - /* - * The data buffer needs to start 64-bit aligned, so round up - * that space to ensure we can align it appropriately and not - * overrun the buffer. - */ - buf_size = nbytes + - round_up((sizeof(struct xfs_log_vec) + - niovecs * sizeof(struct xfs_log_iovec)), - sizeof(uint64_t)); + /* Skip items that do not have any vectors for writing */ + if (!shadow->lv_niovecs && !ordered) + continue; /* compare to existing item size */ - if (lip->li_lv && buf_size <= lip->li_lv->lv_size) { + old_lv = lip->li_lv; + if (lip->li_lv && shadow->lv_size <= lip->li_lv->lv_size) { /* same or smaller, optimise common overwrite case */ lv = lip->li_lv; lv->lv_next = NULL; @@ -237,32 +362,29 @@ xlog_cil_insert_format_items( */ *diff_iovecs -= lv->lv_niovecs; *diff_len -= lv->lv_bytes; + + /* Ensure the lv is set up according to ->iop_size */ + lv->lv_niovecs = shadow->lv_niovecs; + + /* reset the lv buffer information for new formatting */ + lv->lv_buf_len = 0; + lv->lv_bytes = 0; + lv->lv_buf = (char *)lv + + xlog_cil_iovec_space(lv->lv_niovecs); } else { - /* allocate new data chunk */ - lv = kmem_zalloc(buf_size, KM_SLEEP|KM_NOFS); + /* switch to shadow buffer! 
*/ + lv = shadow; lv->lv_item = lip; - lv->lv_size = buf_size; if (ordered) { /* track as an ordered logvec */ ASSERT(lip->li_lv == NULL); - lv->lv_buf_len = XFS_LOG_VEC_ORDERED; goto insert; } - lv->lv_iovecp = (struct xfs_log_iovec *)&lv[1]; } - /* Ensure the lv is set up according to ->iop_size */ - lv->lv_niovecs = niovecs; - - /* The allocated data region lies beyond the iovec region */ - lv->lv_buf_len = 0; - lv->lv_bytes = 0; - lv->lv_buf = (char *)lv + buf_size - nbytes; ASSERT(IS_ALIGNED((unsigned long)lv->lv_buf, sizeof(uint64_t))); - lip->li_ops->iop_format(lip, lv); insert: - ASSERT(lv->lv_buf_len <= nbytes); xfs_cil_prepare_item(log, lv, old_lv, diff_len, diff_iovecs); } } @@ -784,6 +906,13 @@ xfs_log_commit_cil( struct xlog *log = mp->m_log; struct xfs_cil *cil = log->l_cilp; + /* + * Do all necessary memory allocation before we lock the CIL. + * This ensures the allocation does not deadlock with a CIL + * push in memory reclaim (e.g. from kswapd). + */ + xlog_cil_alloc_shadow_bufs(log, tp); + /* lock out background commit */ down_read(&cil->xc_ctx_lock); diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index ed8896310c00..765f084759b5 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -175,7 +175,6 @@ typedef struct xlog_ticket { char t_cnt; /* current count : 1 */ char t_clientid; /* who does this belong to; : 1 */ char t_flags; /* properties of reservation : 1 */ - uint t_trans_type; /* transaction type : 4 */ /* reservation array fields */ uint t_res_num; /* num in array : 4 */ diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 396565f43247..835997843846 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -3843,7 +3843,7 @@ xlog_recover_add_to_cont_trans( old_ptr = item->ri_buf[item->ri_cnt-1].i_addr; old_len = item->ri_buf[item->ri_cnt-1].i_len; - ptr = kmem_realloc(old_ptr, len+old_len, old_len, KM_SLEEP); + ptr = kmem_realloc(old_ptr, len + old_len, KM_SLEEP); memcpy(&ptr[old_len], dp, len); item->ri_buf[item->ri_cnt-1].i_len += len; item->ri_buf[item->ri_cnt-1].i_addr = ptr; @@ -4205,10 +4205,9 @@ xlog_recover_process_efi( } } - tp = xfs_trans_alloc(mp, 0); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); if (error) - goto abort_error; + return error; efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents); for (i = 0; i < efip->efi_format.efi_nextents; i++) { @@ -4355,10 +4354,9 @@ xlog_recover_clear_agi_bucket( int offset; int error; - tp = xfs_trans_alloc(mp, XFS_TRANS_CLEAR_AGI_BUCKET); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_clearagi, 0, 0); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_clearagi, 0, 0, 0, &tp); if (error) - goto out_abort; + goto out_error; error = xfs_read_agi(mp, tp, agno, &agibp); if (error) diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index cfd4210dd015..970c19ba2f56 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -89,7 +89,6 @@ xfs_uuid_mount( if (hole < 0) { xfs_uuid_table = kmem_realloc(xfs_uuid_table, (xfs_uuid_table_size + 1) * sizeof(*xfs_uuid_table), - xfs_uuid_table_size * sizeof(*xfs_uuid_table), KM_SLEEP); hole = xfs_uuid_table_size++; } @@ -273,13 +272,15 @@ xfs_readsb( buf_ops = NULL; /* - * Allocate a (locked) buffer to hold the superblock. - * This will be kept around at all times to optimize - * access to the superblock. + * Allocate a (locked) buffer to hold the superblock. 
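The ordering rule added to xfs_log_commit_cil() above - allocate the shadow buffers first, only then take xc_ctx_lock - is the standard way to keep an allocation that may recurse into reclaim from deadlocking against a lock the reclaim path itself needs. A minimal pthread model of the pattern:

#include <pthread.h>
#include <stdlib.h>

static pthread_mutex_t ctx_lock = PTHREAD_MUTEX_INITIALIZER;

/* Allocation may trigger reclaim, and reclaim may need ctx_lock to
 * push the CIL - so allocate first, lock second. */
static void commit(size_t need)
{
	void *shadow = malloc(need);	/* done before taking the lock */

	pthread_mutex_lock(&ctx_lock);
	/* ... format into the shadow buffer, insert into the CIL ... */
	pthread_mutex_unlock(&ctx_lock);

	free(shadow);	/* the real code keeps it attached for reuse */
}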
This will be kept + * around at all times to optimize access to the superblock. Therefore, + * set XBF_NO_IOACCT to make sure it doesn't hold the buftarg count + * elevated. */ reread: error = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR, - BTOBB(sector_size), 0, &bp, buf_ops); + BTOBB(sector_size), XBF_NO_IOACCT, &bp, + buf_ops); if (error) { if (loud) xfs_warn(mp, "SB validate failed with error %d.", error); @@ -681,6 +682,9 @@ xfs_mountfs( xfs_set_maxicount(mp); + /* enable fail_at_unmount as default */ + mp->m_fail_unmount = 1; + error = xfs_sysfs_init(&mp->m_kobj, &xfs_mp_ktype, NULL, mp->m_fsname); if (error) goto out; @@ -690,10 +694,15 @@ xfs_mountfs( if (error) goto out_remove_sysfs; - error = xfs_uuid_mount(mp); + error = xfs_error_sysfs_init(mp); if (error) goto out_del_stats; + + error = xfs_uuid_mount(mp); + if (error) + goto out_remove_error_sysfs; + /* * Set the minimum read and write sizes */ @@ -957,6 +966,7 @@ xfs_mountfs( cancel_delayed_work_sync(&mp->m_reclaim_work); xfs_reclaim_inodes(mp, SYNC_WAIT); out_log_dealloc: + mp->m_flags |= XFS_MOUNT_UNMOUNTING; xfs_log_mount_cancel(mp); out_fail_wait: if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) @@ -968,6 +978,8 @@ xfs_mountfs( xfs_da_unmount(mp); out_remove_uuid: xfs_uuid_unmount(mp); + out_remove_error_sysfs: + xfs_error_sysfs_del(mp); out_del_stats: xfs_sysfs_del(&mp->m_stats.xs_kobj); out_remove_sysfs: @@ -1006,6 +1018,14 @@ xfs_unmountfs( xfs_log_force(mp, XFS_LOG_SYNC); /* + * We now need to tell the world we are unmounting. This will allow + * us to detect that the filesystem is going away and we should error + * out anything that we have been retrying in the background. This will + * prevent neverending retries in AIL pushing from hanging the unmount. + */ + mp->m_flags |= XFS_MOUNT_UNMOUNTING; + + /* * Flush all pending changes from the AIL. */ xfs_ail_push_all_sync(mp->m_ail); @@ -1056,6 +1076,7 @@ xfs_unmountfs( #endif xfs_free_perag(mp); + xfs_error_sysfs_del(mp); xfs_sysfs_del(&mp->m_stats.xs_kobj); xfs_sysfs_del(&mp->m_kobj); } diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index eafe257b357a..c1b798c72126 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -37,6 +37,32 @@ enum { XFS_LOWSP_MAX, }; +/* + * Error Configuration + * + * Error classes define the subsystem the configuration belongs to. + * Error numbers define the errors that are configurable. 
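A rough model of the unmount interlock introduced above (flag bit and helper name illustrative): once XFS_MOUNT_UNMOUNTING is set ahead of the final AIL push, background retry loops can observe it together with the new fail_at_unmount knob and bail out instead of spinning forever:

#include <stdbool.h>

#define MOUNT_UNMOUNTING	(1ull << 1)	/* models XFS_MOUNT_UNMOUNTING */

/* Consulted before re-submitting failed metadata I/O. */
static bool give_up_retries(unsigned long long m_flags, bool fail_unmount)
{
	return fail_unmount && (m_flags & MOUNT_UNMOUNTING);
}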
+ */ +enum { + XFS_ERR_METADATA, + XFS_ERR_CLASS_MAX, +}; +enum { + XFS_ERR_DEFAULT, + XFS_ERR_EIO, + XFS_ERR_ENOSPC, + XFS_ERR_ENODEV, + XFS_ERR_ERRNO_MAX, +}; + +#define XFS_ERR_RETRY_FOREVER -1 + +struct xfs_error_cfg { + struct xfs_kobj kobj; + int max_retries; + unsigned long retry_timeout; /* in jiffies, 0 = no timeout */ +}; + typedef struct xfs_mount { struct super_block *m_super; xfs_tid_t m_tid; /* next unused tid for fs */ @@ -127,6 +153,9 @@ typedef struct xfs_mount { int64_t m_low_space[XFS_LOWSP_MAX]; /* low free space thresholds */ struct xfs_kobj m_kobj; + struct xfs_kobj m_error_kobj; + struct xfs_kobj m_error_meta_kobj; + struct xfs_error_cfg m_error_cfg[XFS_ERR_CLASS_MAX][XFS_ERR_ERRNO_MAX]; struct xstats m_stats; /* per-fs stats */ struct workqueue_struct *m_buf_workqueue; @@ -148,6 +177,7 @@ typedef struct xfs_mount { */ __uint32_t m_generation; + bool m_fail_unmount; #ifdef DEBUG /* * DEBUG mode instrumentation to test and/or trigger delayed allocation @@ -166,6 +196,7 @@ typedef struct xfs_mount { #define XFS_MOUNT_WSYNC (1ULL << 0) /* for nfs - all metadata ops must be synchronous except for space allocations */ +#define XFS_MOUNT_UNMOUNTING (1ULL << 1) /* filesystem is unmounting */ #define XFS_MOUNT_WAS_CLEAN (1ULL << 3) #define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem operations, typically for @@ -364,4 +395,7 @@ extern void xfs_set_low_space_thresholds(struct xfs_mount *); int xfs_zero_extent(struct xfs_inode *ip, xfs_fsblock_t start_fsb, xfs_off_t count_fsb); +struct xfs_error_cfg * xfs_error_get_cfg(struct xfs_mount *mp, + int error_class, int error); + #endif /* __XFS_MOUNT_H__ */ diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h index 184c44effdd5..0cc8d8f74356 100644 --- a/fs/xfs/xfs_ondisk.h +++ b/fs/xfs/xfs_ondisk.h @@ -22,6 +22,11 @@ BUILD_BUG_ON_MSG(sizeof(structname) != (size), "XFS: sizeof(" \ #structname ") is wrong, expected " #size) +#define XFS_CHECK_OFFSET(structname, member, off) \ + BUILD_BUG_ON_MSG(offsetof(structname, member) != (off), \ + "XFS: offsetof(" #structname ", " #member ") is wrong, " \ + "expected " #off) + static inline void __init xfs_check_ondisk_structs(void) { @@ -34,6 +39,8 @@ xfs_check_ondisk_structs(void) XFS_CHECK_STRUCT_SIZE(struct xfs_bmbt_key, 8); XFS_CHECK_STRUCT_SIZE(struct xfs_bmbt_rec, 16); XFS_CHECK_STRUCT_SIZE(struct xfs_bmdr_block, 4); + XFS_CHECK_STRUCT_SIZE(struct xfs_btree_block_shdr, 48); + XFS_CHECK_STRUCT_SIZE(struct xfs_btree_block_lhdr, 64); XFS_CHECK_STRUCT_SIZE(struct xfs_btree_block, 72); XFS_CHECK_STRUCT_SIZE(struct xfs_dinode, 176); XFS_CHECK_STRUCT_SIZE(struct xfs_disk_dquot, 104); @@ -75,27 +82,39 @@ xfs_check_ondisk_structs(void) XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_name_remote_t, 12); */ + XFS_CHECK_OFFSET(xfs_attr_leaf_name_local_t, valuelen, 0); + XFS_CHECK_OFFSET(xfs_attr_leaf_name_local_t, namelen, 2); + XFS_CHECK_OFFSET(xfs_attr_leaf_name_local_t, nameval, 3); + XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, valueblk, 0); + XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, valuelen, 4); + XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, namelen, 8); + XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, name, 9); XFS_CHECK_STRUCT_SIZE(xfs_attr_leafblock_t, 40); - XFS_CHECK_STRUCT_SIZE(xfs_attr_shortform_t, 8); + XFS_CHECK_OFFSET(xfs_attr_shortform_t, hdr.totsize, 0); + XFS_CHECK_OFFSET(xfs_attr_shortform_t, hdr.count, 2); + XFS_CHECK_OFFSET(xfs_attr_shortform_t, list[0].namelen, 4); + XFS_CHECK_OFFSET(xfs_attr_shortform_t, list[0].valuelen, 5); + 
XFS_CHECK_OFFSET(xfs_attr_shortform_t, list[0].flags, 6); + XFS_CHECK_OFFSET(xfs_attr_shortform_t, list[0].nameval, 7); XFS_CHECK_STRUCT_SIZE(xfs_da_blkinfo_t, 12); XFS_CHECK_STRUCT_SIZE(xfs_da_intnode_t, 16); XFS_CHECK_STRUCT_SIZE(xfs_da_node_entry_t, 8); XFS_CHECK_STRUCT_SIZE(xfs_da_node_hdr_t, 16); XFS_CHECK_STRUCT_SIZE(xfs_dir2_data_free_t, 4); XFS_CHECK_STRUCT_SIZE(xfs_dir2_data_hdr_t, 16); - XFS_CHECK_STRUCT_SIZE(xfs_dir2_data_unused_t, 6); + XFS_CHECK_OFFSET(xfs_dir2_data_unused_t, freetag, 0); + XFS_CHECK_OFFSET(xfs_dir2_data_unused_t, length, 2); XFS_CHECK_STRUCT_SIZE(xfs_dir2_free_hdr_t, 16); XFS_CHECK_STRUCT_SIZE(xfs_dir2_free_t, 16); - XFS_CHECK_STRUCT_SIZE(xfs_dir2_ino4_t, 4); - XFS_CHECK_STRUCT_SIZE(xfs_dir2_ino8_t, 8); - XFS_CHECK_STRUCT_SIZE(xfs_dir2_inou_t, 8); XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_entry_t, 8); XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_hdr_t, 16); XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_t, 16); XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_tail_t, 4); XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_entry_t, 3); + XFS_CHECK_OFFSET(xfs_dir2_sf_entry_t, namelen, 0); + XFS_CHECK_OFFSET(xfs_dir2_sf_entry_t, offset, 1); + XFS_CHECK_OFFSET(xfs_dir2_sf_entry_t, name, 3); XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_hdr_t, 10); - XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_off_t, 2); /* log structures */ XFS_CHECK_STRUCT_SIZE(struct xfs_dq_logformat, 24); diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index 51ddaf2c2b8c..0f14b2e4bf6c 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -1,6 +1,7 @@ /* * Copyright (c) 2014 Christoph Hellwig. */ +#include <linux/iomap.h> #include "xfs.h" #include "xfs_format.h" #include "xfs_log_format.h" @@ -79,32 +80,6 @@ xfs_fs_get_uuid( return 0; } -static void -xfs_bmbt_to_iomap( - struct xfs_inode *ip, - struct iomap *iomap, - struct xfs_bmbt_irec *imap) -{ - struct xfs_mount *mp = ip->i_mount; - - if (imap->br_startblock == HOLESTARTBLOCK) { - iomap->blkno = IOMAP_NULL_BLOCK; - iomap->type = IOMAP_HOLE; - } else if (imap->br_startblock == DELAYSTARTBLOCK) { - iomap->blkno = IOMAP_NULL_BLOCK; - iomap->type = IOMAP_DELALLOC; - } else { - iomap->blkno = - XFS_FSB_TO_DADDR(ip->i_mount, imap->br_startblock); - if (imap->br_state == XFS_EXT_UNWRITTEN) - iomap->type = IOMAP_UNWRITTEN; - else - iomap->type = IOMAP_MAPPED; - } - iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff); - iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount); -} - /* * Get a layout for the pNFS client. 
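XFS_CHECK_OFFSET is a thin wrapper over BUILD_BUG_ON_MSG; outside the kernel the same layout pinning falls out of C11 _Static_assert plus offsetof. A compilable sketch for the xfs_dir2_sf_entry_t offsets asserted above:

#include <stddef.h>
#include <stdint.h>

struct dir2_sf_entry {
	uint8_t	namelen;	/* expected at offset 0 */
	uint8_t	offset[2];	/* expected at offset 1 */
	uint8_t	name[];		/* expected at offset 3 */
};

_Static_assert(offsetof(struct dir2_sf_entry, namelen) == 0, "namelen");
_Static_assert(offsetof(struct dir2_sf_entry, offset) == 1, "offset");
_Static_assert(offsetof(struct dir2_sf_entry, name) == 3, "name");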
*/ @@ -308,12 +283,9 @@ xfs_fs_commit_blocks( goto out_drop_iolock; } - tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0); - if (error) { - xfs_trans_cancel(tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp); + if (error) goto out_drop_iolock; - } xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index be125e1758c1..a60d9e2739d1 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -783,13 +783,10 @@ xfs_qm_qino_alloc( } } - tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QINOCREATE); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_create, - XFS_QM_QINOCREATE_SPACE_RES(mp), 0); - if (error) { - xfs_trans_cancel(tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_create, + XFS_QM_QINOCREATE_SPACE_RES(mp), 0, 0, &tp); + if (error) return error; - } if (need_alloc) { error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, 1, ip, diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index f4d0e0a8f517..475a3882a81f 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -236,10 +236,8 @@ xfs_qm_scall_trunc_qfile( xfs_ilock(ip, XFS_IOLOCK_EXCL); - tp = xfs_trans_alloc(mp, XFS_TRANS_TRUNCATE_FILE); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); if (error) { - xfs_trans_cancel(tp); xfs_iunlock(ip, XFS_IOLOCK_EXCL); goto out_put; } @@ -436,12 +434,9 @@ xfs_qm_scall_setqlim( defq = xfs_get_defquota(dqp, q); xfs_dqunlock(dqp); - tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_setqlim, 0, 0); - if (error) { - xfs_trans_cancel(tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_setqlim, 0, 0, 0, &tp); + if (error) goto out_rele; - } xfs_dqlock(dqp); xfs_trans_dqjoin(tp, dqp); @@ -569,13 +564,9 @@ xfs_qm_log_quotaoff_end( int error; xfs_qoff_logitem_t *qoffi; - tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF_END); - - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_equotaoff, 0, 0); - if (error) { - xfs_trans_cancel(tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_equotaoff, 0, 0, 0, &tp); + if (error) return error; - } qoffi = xfs_trans_get_qoff_item(tp, startqoff, flags & XFS_ALL_QUOTA_ACCT); @@ -603,12 +594,9 @@ xfs_qm_log_quotaoff( *qoffstartp = NULL; - tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_quotaoff, 0, 0); - if (error) { - xfs_trans_cancel(tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_quotaoff, 0, 0, 0, &tp); + if (error) goto out; - } qoffi = xfs_trans_get_qoff_item(tp, NULL, flags & XFS_ALL_QUOTA_ACCT); xfs_trans_log_quotaoff_item(tp, qoffi); diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index abf44435d04a..3938b37d1043 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -780,15 +780,14 @@ xfs_growfs_rt_alloc( * Allocate space to the file, as necessary. */ while (oblocks < nblocks) { - tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ALLOC); resblks = XFS_GROWFSRT_SPACE_RES(mp, nblocks - oblocks); /* * Reserve space & log for one extent added to the file. */ - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growrtalloc, - resblks, 0); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growrtalloc, resblks, + 0, 0, &tp); if (error) - goto out_trans_cancel; + return error; /* * Lock the inode. 
*/ @@ -823,14 +822,13 @@ xfs_growfs_rt_alloc( for (bno = map.br_startoff, fsbno = map.br_startblock; bno < map.br_startoff + map.br_blockcount; bno++, fsbno++) { - tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ZERO); /* * Reserve log for one block zeroing. */ - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growrtzero, - 0, 0); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growrtzero, + 0, 0, 0, &tp); if (error) - goto out_trans_cancel; + return error; /* * Lock the bitmap inode. */ @@ -994,11 +992,10 @@ xfs_growfs_rt( /* * Start a transaction, get the log reservation. */ - tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_FREE); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growrtfree, - 0, 0); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growrtfree, 0, 0, 0, + &tp); if (error) - goto error_cancel; + break; /* * Lock out other callers by grabbing the bitmap inode lock. */ diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h index 76c0a4a9bb17..355dd9e1cb64 100644 --- a/fs/xfs/xfs_rtalloc.h +++ b/fs/xfs/xfs_rtalloc.h @@ -98,8 +98,6 @@ xfs_growfs_rt( /* * From xfs_rtbitmap.c */ -int xfs_rtbuf_get(struct xfs_mount *mp, struct xfs_trans *tp, - xfs_rtblock_t block, int issum, struct xfs_buf **bpp); int xfs_rtcheck_range(struct xfs_mount *mp, struct xfs_trans *tp, xfs_rtblock_t start, xfs_extlen_t len, int val, xfs_rtblock_t *new, int *stat); diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c index 8686df6c7609..d266e835ecc3 100644 --- a/fs/xfs/xfs_stats.c +++ b/fs/xfs/xfs_stats.c @@ -128,7 +128,6 @@ static int xqm_proc_open(struct inode *inode, struct file *file) } static const struct file_operations xqm_proc_fops = { - .owner = THIS_MODULE, .open = xqm_proc_open, .read = seq_read, .llseek = seq_lseek, diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 187e14b696c2..0303f1005f88 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -58,8 +58,7 @@ #include <linux/parser.h> static const struct super_operations xfs_super_operations; -static kmem_zone_t *xfs_ioend_zone; -mempool_t *xfs_ioend_pool; +struct bio_set *xfs_ioend_bioset; static struct kset *xfs_kset; /* top-level xfs sysfs dir */ #ifdef DEBUG @@ -350,6 +349,7 @@ xfs_parseargs( case Opt_pqnoenforce: mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE); mp->m_qflags &= ~XFS_PQUOTA_ENFD; + break; case Opt_gquota: case Opt_grpquota: mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE | @@ -546,7 +546,7 @@ xfs_showargs( return 0; } -__uint64_t +static __uint64_t xfs_max_file_offset( unsigned int blockshift) { @@ -928,7 +928,7 @@ xfs_fs_alloc_inode( /* * Now that the generic code is guaranteed not to be accessing - * the linux inode, we can reclaim the inode. + * the linux inode, we can inactivate and reclaim the inode. */ STATIC void xfs_fs_destroy_inode( @@ -938,9 +938,14 @@ xfs_fs_destroy_inode( trace_xfs_destroy_inode(ip); - XFS_STATS_INC(ip->i_mount, vn_reclaim); + ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock)); + XFS_STATS_INC(ip->i_mount, vn_rele); + XFS_STATS_INC(ip->i_mount, vn_remove); + + xfs_inactive(ip); ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0); + XFS_STATS_INC(ip->i_mount, vn_reclaim); /* * We should never get here with one of the reclaim flags already set. 
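The one-line Opt_pqnoenforce change in xfs_parseargs() above ('+ break;') closes a switch fallthrough: without it, the project-quota option also fell into the group-quota cases and set the wrong flags. A self-contained illustration with made-up flag values:

#include <stdio.h>

enum opt { OPT_PQNOENFORCE, OPT_GQUOTA };

static unsigned parse(enum opt o)
{
	unsigned qflags = 0;

	switch (o) {
	case OPT_PQNOENFORCE:
		qflags |= 0x1;	/* stands in for the PQUOTA flags */
		break;		/* the fix: stop falling into OPT_GQUOTA */
	case OPT_GQUOTA:
		qflags |= 0x2;	/* stands in for the GQUOTA flags */
		break;
	}
	return qflags;
}

int main(void)
{
	/* prints 0x1; without the break it would print 0x3 */
	printf("%#x\n", parse(OPT_PQNOENFORCE));
	return 0;
}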
@@ -987,24 +992,6 @@ xfs_fs_inode_init_once( "xfsino", ip->i_ino); } -STATIC void -xfs_fs_evict_inode( - struct inode *inode) -{ - xfs_inode_t *ip = XFS_I(inode); - - ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock)); - - trace_xfs_evict_inode(ip); - - truncate_inode_pages_final(&inode->i_data); - clear_inode(inode); - XFS_STATS_INC(ip->i_mount, vn_rele); - XFS_STATS_INC(ip->i_mount, vn_remove); - - xfs_inactive(ip); -} - /* * We do an unlocked check for XFS_IDONTCACHE here because we are already * serialised against cache hits here via the inode->i_lock and igrab() in @@ -1276,6 +1263,16 @@ xfs_fs_remount( return -EINVAL; } + if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 && + xfs_sb_has_ro_compat_feature(sbp, + XFS_SB_FEAT_RO_COMPAT_UNKNOWN)) { + xfs_warn(mp, +"ro->rw transition prohibited on unknown (0x%x) ro-compat filesystem", + (sbp->sb_features_ro_compat & + XFS_SB_FEAT_RO_COMPAT_UNKNOWN)); + return -EINVAL; + } + mp->m_flags &= ~XFS_MOUNT_RDONLY; /* @@ -1297,6 +1294,7 @@ xfs_fs_remount( */ xfs_restore_resvblks(mp); xfs_log_work_queue(mp); + xfs_queue_eofblocks(mp); } /* rw -> ro */ @@ -1309,6 +1307,13 @@ xfs_fs_remount( * return it to the same size. */ xfs_save_resvblks(mp); + + /* + * Cancel background eofb scanning so it cannot race with the + * final log force+buftarg wait and deadlock the remount. + */ + cancel_delayed_work_sync(&mp->m_eofblocks_work); + xfs_quiesce_attr(mp); mp->m_flags |= XFS_MOUNT_RDONLY; } @@ -1558,22 +1563,16 @@ xfs_fs_fill_super( if (mp->m_flags & XFS_MOUNT_DAX) { xfs_warn(mp, - "DAX enabled. Warning: EXPERIMENTAL, use at your own risk"); - if (sb->s_blocksize != PAGE_SIZE) { - xfs_alert(mp, - "Filesystem block size invalid for DAX Turning DAX off."); - mp->m_flags &= ~XFS_MOUNT_DAX; - } else if (!sb->s_bdev->bd_disk->fops->direct_access) { + "DAX enabled. Warning: EXPERIMENTAL, use at your own risk"); + + error = bdev_dax_supported(sb, sb->s_blocksize); + if (error) { xfs_alert(mp, - "Block device does not support DAX Turning DAX off."); + "DAX unsupported by block device. Turning off DAX."); mp->m_flags &= ~XFS_MOUNT_DAX; } } - if (xfs_sb_version_hassparseinodes(&mp->m_sb)) - xfs_alert(mp, - "EXPERIMENTAL sparse inode feature enabled. 
Use at your own risk!"); - error = xfs_mountfs(mp); if (error) goto out_filestream_unmount; @@ -1663,7 +1662,6 @@ xfs_fs_free_cached_objects( static const struct super_operations xfs_super_operations = { .alloc_inode = xfs_fs_alloc_inode, .destroy_inode = xfs_fs_destroy_inode, - .evict_inode = xfs_fs_evict_inode, .drop_inode = xfs_fs_drop_inode, .put_super = xfs_fs_put_super, .sync_fs = xfs_fs_sync_fs, @@ -1688,23 +1686,19 @@ MODULE_ALIAS_FS("xfs"); STATIC int __init xfs_init_zones(void) { - - xfs_ioend_zone = kmem_zone_init(sizeof(xfs_ioend_t), "xfs_ioend"); - if (!xfs_ioend_zone) + xfs_ioend_bioset = bioset_create(4 * MAX_BUF_PER_PAGE, + offsetof(struct xfs_ioend, io_inline_bio)); + if (!xfs_ioend_bioset) goto out; - xfs_ioend_pool = mempool_create_slab_pool(4 * MAX_BUF_PER_PAGE, - xfs_ioend_zone); - if (!xfs_ioend_pool) - goto out_destroy_ioend_zone; - xfs_log_ticket_zone = kmem_zone_init(sizeof(xlog_ticket_t), "xfs_log_ticket"); if (!xfs_log_ticket_zone) - goto out_destroy_ioend_pool; + goto out_free_ioend_bioset; - xfs_bmap_free_item_zone = kmem_zone_init(sizeof(xfs_bmap_free_item_t), - "xfs_bmap_free_item"); + xfs_bmap_free_item_zone = kmem_zone_init( + sizeof(struct xfs_bmap_free_item), + "xfs_bmap_free_item"); if (!xfs_bmap_free_item_zone) goto out_destroy_log_ticket_zone; @@ -1797,10 +1791,8 @@ xfs_init_zones(void) kmem_zone_destroy(xfs_bmap_free_item_zone); out_destroy_log_ticket_zone: kmem_zone_destroy(xfs_log_ticket_zone); - out_destroy_ioend_pool: - mempool_destroy(xfs_ioend_pool); - out_destroy_ioend_zone: - kmem_zone_destroy(xfs_ioend_zone); + out_free_ioend_bioset: + bioset_free(xfs_ioend_bioset); out: return -ENOMEM; } @@ -1826,9 +1818,7 @@ xfs_destroy_zones(void) kmem_zone_destroy(xfs_btree_cur_zone); kmem_zone_destroy(xfs_bmap_free_item_zone); kmem_zone_destroy(xfs_log_ticket_zone); - mempool_destroy(xfs_ioend_pool); - kmem_zone_destroy(xfs_ioend_zone); - + bioset_free(xfs_ioend_bioset); } STATIC int __init diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h index 2dfb1ce4585f..529bce9fc37e 100644 --- a/fs/xfs/xfs_super.h +++ b/fs/xfs/xfs_super.h @@ -61,8 +61,6 @@ struct xfs_mount; struct xfs_buftarg; struct block_device; -extern __uint64_t xfs_max_file_offset(unsigned int); - extern void xfs_flush_inodes(struct xfs_mount *mp); extern void xfs_blkdev_issue_flush(struct xfs_buftarg *); extern xfs_agnumber_t xfs_set_inode_alloc(struct xfs_mount *, diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index b44284c1adda..08a46c6181fd 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -131,6 +131,8 @@ xfs_readlink( trace_xfs_readlink(ip); + ASSERT(!(ip->i_df.if_flags & XFS_IFINLINE)); + if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; @@ -150,12 +152,7 @@ xfs_readlink( } - if (ip->i_df.if_flags & XFS_IFINLINE) { - memcpy(link, ip->i_df.if_u1.if_data, pathlen); - link[pathlen] = '\0'; - } else { - error = xfs_readlink_bmap(ip, link); - } + error = xfs_readlink_bmap(ip, link); out: xfs_iunlock(ip, XFS_ILOCK_SHARED); @@ -221,7 +218,6 @@ xfs_symlink( if (error) return error; - tp = xfs_trans_alloc(mp, XFS_TRANS_SYMLINK); /* * The symlink will fit into the inode data fork? * There can't be any attributes so we get the whole variable part. 
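The bioset conversion in xfs_init_zones() relies on the front-pad argument to bioset_create(): passing offsetof(struct xfs_ioend, io_inline_bio) makes one allocation yield an ioend with its bio embedded at the tail. A userspace model of that front-pad layout (struct bio reduced to a placeholder):

#include <stddef.h>
#include <stdlib.h>

struct bio { int placeholder; };

struct ioend {
	int		io_type;
	struct bio	io_inline_bio;	/* must remain the last field */
};

/* front_pad is the number of bytes preceding the embedded bio, so a
 * single allocation covers both objects. */
static struct ioend *alloc_ioend(void)
{
	size_t front_pad = offsetof(struct ioend, io_inline_bio);

	return malloc(front_pad + sizeof(struct bio));
}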
@@ -231,13 +227,15 @@ xfs_symlink( else fs_blocks = xfs_symlink_blocks(mp, pathlen); resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_symlink, resblks, 0); + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_symlink, resblks, 0, 0, &tp); if (error == -ENOSPC && fs_blocks == 0) { resblks = 0; - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_symlink, 0, 0); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_symlink, 0, 0, 0, + &tp); } if (error) - goto out_trans_cancel; + goto out_release_inode; xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL | XFS_IOLOCK_PARENT | XFS_ILOCK_PARENT); @@ -302,19 +300,11 @@ xfs_symlink( * If the symlink will fit into the inode, write it inline. */ if (pathlen <= XFS_IFORK_DSIZE(ip)) { - xfs_idata_realloc(ip, pathlen, XFS_DATA_FORK); - memcpy(ip->i_df.if_u1.if_data, target_path, pathlen); - ip->i_d.di_size = pathlen; - - /* - * The inode was initially created in extent format. - */ - ip->i_df.if_flags &= ~(XFS_IFEXTENTS | XFS_IFBROOT); - ip->i_df.if_flags |= XFS_IFINLINE; + xfs_init_local_fork(ip, XFS_DATA_FORK, target_path, pathlen); + ip->i_d.di_size = pathlen; ip->i_d.di_format = XFS_DINODE_FMT_LOCAL; xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE); - } else { int offset; @@ -455,12 +445,9 @@ xfs_inactive_symlink_rmt( */ ASSERT(ip->i_d.di_nextents > 0 && ip->i_d.di_nextents <= 2); - tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); - if (error) { - xfs_trans_cancel(tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); + if (error) return error; - } xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c index 6ced4f143494..79cfd3fc5324 100644 --- a/fs/xfs/xfs_sysfs.c +++ b/fs/xfs/xfs_sysfs.c @@ -17,10 +17,11 @@ */ #include "xfs.h" -#include "xfs_sysfs.h" +#include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" +#include "xfs_sysfs.h" #include "xfs_log.h" #include "xfs_log_priv.h" #include "xfs_stats.h" @@ -362,3 +363,294 @@ struct kobj_type xfs_log_ktype = { .sysfs_ops = &xfs_sysfs_ops, .default_attrs = xfs_log_attrs, }; + +/* + * Metadata IO error configuration + * + * The sysfs structure here is: + * ...xfs/<dev>/error/<class>/<errno>/<error_attrs> + * + * where <class> allows us to discriminate between data IO and metadata IO, + * and any other future type of IO (e.g. special inode or directory error + * handling) we care to support. 
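The to_error_cfg() and err_to_mp() helpers that follow are container_of() in two steps: sysfs hands the handler the embedded kobject, and the enclosing structure is recovered from its address. A runnable userspace model:

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct kobj { const char *name; };
struct error_cfg { int max_retries; struct kobj kobj; };

int main(void)
{
	struct error_cfg cfg = { .max_retries = -1, .kobj = { "EIO" } };
	struct kobj *k = &cfg.kobj;	/* what sysfs would pass back */

	printf("%s: %d\n", k->name,
	       container_of(k, struct error_cfg, kobj)->max_retries);
	return 0;
}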
+ */ +static inline struct xfs_error_cfg * +to_error_cfg(struct kobject *kobject) +{ + struct xfs_kobj *kobj = to_kobj(kobject); + return container_of(kobj, struct xfs_error_cfg, kobj); +} + +static inline struct xfs_mount * +err_to_mp(struct kobject *kobject) +{ + struct xfs_kobj *kobj = to_kobj(kobject); + return container_of(kobj, struct xfs_mount, m_error_kobj); +} + +static ssize_t +max_retries_show( + struct kobject *kobject, + char *buf) +{ + struct xfs_error_cfg *cfg = to_error_cfg(kobject); + + return snprintf(buf, PAGE_SIZE, "%d\n", cfg->max_retries); +} + +static ssize_t +max_retries_store( + struct kobject *kobject, + const char *buf, + size_t count) +{ + struct xfs_error_cfg *cfg = to_error_cfg(kobject); + int ret; + int val; + + ret = kstrtoint(buf, 0, &val); + if (ret) + return ret; + + if (val < -1) + return -EINVAL; + + cfg->max_retries = val; + return count; +} +XFS_SYSFS_ATTR_RW(max_retries); + +static ssize_t +retry_timeout_seconds_show( + struct kobject *kobject, + char *buf) +{ + struct xfs_error_cfg *cfg = to_error_cfg(kobject); + + return snprintf(buf, PAGE_SIZE, "%ld\n", + jiffies_to_msecs(cfg->retry_timeout) / MSEC_PER_SEC); +} + +static ssize_t +retry_timeout_seconds_store( + struct kobject *kobject, + const char *buf, + size_t count) +{ + struct xfs_error_cfg *cfg = to_error_cfg(kobject); + int ret; + int val; + + ret = kstrtoint(buf, 0, &val); + if (ret) + return ret; + + /* 1 day timeout maximum */ + if (val < 0 || val > 86400) + return -EINVAL; + + cfg->retry_timeout = msecs_to_jiffies(val * MSEC_PER_SEC); + return count; +} +XFS_SYSFS_ATTR_RW(retry_timeout_seconds); + +static ssize_t +fail_at_unmount_show( + struct kobject *kobject, + char *buf) +{ + struct xfs_mount *mp = err_to_mp(kobject); + + return snprintf(buf, PAGE_SIZE, "%d\n", mp->m_fail_unmount); +} + +static ssize_t +fail_at_unmount_store( + struct kobject *kobject, + const char *buf, + size_t count) +{ + struct xfs_mount *mp = err_to_mp(kobject); + int ret; + int val; + + ret = kstrtoint(buf, 0, &val); + if (ret) + return ret; + + if (val < 0 || val > 1) + return -EINVAL; + + mp->m_fail_unmount = val; + return count; +} +XFS_SYSFS_ATTR_RW(fail_at_unmount); + +static struct attribute *xfs_error_attrs[] = { + ATTR_LIST(max_retries), + ATTR_LIST(retry_timeout_seconds), + NULL, +}; + + +struct kobj_type xfs_error_cfg_ktype = { + .release = xfs_sysfs_release, + .sysfs_ops = &xfs_sysfs_ops, + .default_attrs = xfs_error_attrs, +}; + +struct kobj_type xfs_error_ktype = { + .release = xfs_sysfs_release, + .sysfs_ops = &xfs_sysfs_ops, +}; + +/* + * Error initialization tables. These need to be ordered in the same + * order as the enums used to index the array. All class init tables need to + * define a "default" behaviour as the first entry, all other entries can be + * empty. 
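Taken together, the per-errno slots and the attributes stored above amount to a table lookup plus a give-up test. A hedged sketch of how a retry path could consult them (the in-kernel consumers land elsewhere in this series; -1 means retry forever, a zero timeout means no time limit; error_slot() mirrors the xfs_error_get_cfg() mapping shown later in this patch):

#include <errno.h>
#include <stdbool.h>

#define ERR_RETRY_FOREVER	(-1)

enum { ERR_DEFAULT, ERR_EIO, ERR_ENOSPC, ERR_ENODEV, ERR_MAX };

struct error_cfg { int max_retries; long retry_timeout; };

/* Normalise negative kernel-style errnos; anything without a
 * dedicated slot falls back to the default configuration. */
static int error_slot(int error)
{
	if (error < 0)
		error = -error;
	switch (error) {
	case EIO:	return ERR_EIO;
	case ENOSPC:	return ERR_ENOSPC;
	case ENODEV:	return ERR_ENODEV;
	default:	return ERR_DEFAULT;
	}
}

/* Hypothetical consumer: keep retrying failed metadata I/O until a
 * configured count or time limit is exceeded. */
static bool keep_retrying(const struct error_cfg table[ERR_MAX],
			  int error, int retries_done, long elapsed)
{
	const struct error_cfg *cfg = &table[error_slot(error)];

	if (cfg->max_retries != ERR_RETRY_FOREVER &&
	    retries_done >= cfg->max_retries)
		return false;
	if (cfg->retry_timeout && elapsed >= cfg->retry_timeout)
		return false;
	return true;
}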
+ */ +struct xfs_error_init { + char *name; + int max_retries; + int retry_timeout; /* in seconds */ +}; + +static const struct xfs_error_init xfs_error_meta_init[XFS_ERR_ERRNO_MAX] = { + { .name = "default", + .max_retries = XFS_ERR_RETRY_FOREVER, + .retry_timeout = 0, + }, + { .name = "EIO", + .max_retries = XFS_ERR_RETRY_FOREVER, + .retry_timeout = 0, + }, + { .name = "ENOSPC", + .max_retries = XFS_ERR_RETRY_FOREVER, + .retry_timeout = 0, + }, + { .name = "ENODEV", + .max_retries = 0, + }, +}; + +static int +xfs_error_sysfs_init_class( + struct xfs_mount *mp, + int class, + const char *parent_name, + struct xfs_kobj *parent_kobj, + const struct xfs_error_init init[]) +{ + struct xfs_error_cfg *cfg; + int error; + int i; + + ASSERT(class < XFS_ERR_CLASS_MAX); + + error = xfs_sysfs_init(parent_kobj, &xfs_error_ktype, + &mp->m_error_kobj, parent_name); + if (error) + return error; + + for (i = 0; i < XFS_ERR_ERRNO_MAX; i++) { + cfg = &mp->m_error_cfg[class][i]; + error = xfs_sysfs_init(&cfg->kobj, &xfs_error_cfg_ktype, + parent_kobj, init[i].name); + if (error) + goto out_error; + + cfg->max_retries = init[i].max_retries; + cfg->retry_timeout = msecs_to_jiffies( + init[i].retry_timeout * MSEC_PER_SEC); + } + return 0; + +out_error: + /* unwind the entries that succeeded */ + for (i--; i >= 0; i--) { + cfg = &mp->m_error_cfg[class][i]; + xfs_sysfs_del(&cfg->kobj); + } + xfs_sysfs_del(parent_kobj); + return error; +} + +int +xfs_error_sysfs_init( + struct xfs_mount *mp) +{ + int error; + + /* .../xfs/<dev>/error/ */ + error = xfs_sysfs_init(&mp->m_error_kobj, &xfs_error_ktype, + &mp->m_kobj, "error"); + if (error) + return error; + + error = sysfs_create_file(&mp->m_error_kobj.kobject, + ATTR_LIST(fail_at_unmount)); + + if (error) + goto out_error; + + /* .../xfs/<dev>/error/metadata/ */ + error = xfs_error_sysfs_init_class(mp, XFS_ERR_METADATA, + "metadata", &mp->m_error_meta_kobj, + xfs_error_meta_init); + if (error) + goto out_error; + + return 0; + +out_error: + xfs_sysfs_del(&mp->m_error_kobj); + return error; +} + +void +xfs_error_sysfs_del( + struct xfs_mount *mp) +{ + struct xfs_error_cfg *cfg; + int i, j; + + for (i = 0; i < XFS_ERR_CLASS_MAX; i++) { + for (j = 0; j < XFS_ERR_ERRNO_MAX; j++) { + cfg = &mp->m_error_cfg[i][j]; + + xfs_sysfs_del(&cfg->kobj); + } + } + xfs_sysfs_del(&mp->m_error_meta_kobj); + xfs_sysfs_del(&mp->m_error_kobj); +} + +struct xfs_error_cfg * +xfs_error_get_cfg( + struct xfs_mount *mp, + int error_class, + int error) +{ + struct xfs_error_cfg *cfg; + + if (error < 0) + error = -error; + + switch (error) { + case EIO: + cfg = &mp->m_error_cfg[error_class][XFS_ERR_EIO]; + break; + case ENOSPC: + cfg = &mp->m_error_cfg[error_class][XFS_ERR_ENOSPC]; + break; + case ENODEV: + cfg = &mp->m_error_cfg[error_class][XFS_ERR_ENODEV]; + break; + default: + cfg = &mp->m_error_cfg[error_class][XFS_ERR_DEFAULT]; + break; + } + + return cfg; +} diff --git a/fs/xfs/xfs_sysfs.h b/fs/xfs/xfs_sysfs.h index be692e59938d..d04637181ef2 100644 --- a/fs/xfs/xfs_sysfs.h +++ b/fs/xfs/xfs_sysfs.h @@ -58,4 +58,7 @@ xfs_sysfs_del( wait_for_completion(&kobj->complete); } +int xfs_error_sysfs_init(struct xfs_mount *mp); +void xfs_error_sysfs_del(struct xfs_mount *mp); + #endif /* __XFS_SYSFS_H__ */ diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index c8d58426008e..145169093fe0 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -354,6 +354,7 @@ DEFINE_BUF_EVENT(xfs_buf_submit_wait); DEFINE_BUF_EVENT(xfs_buf_bawrite); DEFINE_BUF_EVENT(xfs_buf_lock); 
DEFINE_BUF_EVENT(xfs_buf_lock_done); +DEFINE_BUF_EVENT(xfs_buf_trylock_fail); DEFINE_BUF_EVENT(xfs_buf_trylock); DEFINE_BUF_EVENT(xfs_buf_unlock); DEFINE_BUF_EVENT(xfs_buf_iowait); @@ -364,7 +365,6 @@ DEFINE_BUF_EVENT(xfs_buf_delwri_split); DEFINE_BUF_EVENT(xfs_buf_get_uncached); DEFINE_BUF_EVENT(xfs_bdstrat_shut); DEFINE_BUF_EVENT(xfs_buf_item_relse); -DEFINE_BUF_EVENT(xfs_buf_item_iodone); DEFINE_BUF_EVENT(xfs_buf_item_iodone_async); DEFINE_BUF_EVENT(xfs_buf_error_relse); DEFINE_BUF_EVENT(xfs_buf_wait_buftarg); @@ -944,7 +944,6 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class, TP_ARGS(log, tic), TP_STRUCT__entry( __field(dev_t, dev) - __field(unsigned, trans_type) __field(char, ocnt) __field(char, cnt) __field(int, curr_res) @@ -962,7 +961,6 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class, ), TP_fast_assign( __entry->dev = log->l_mp->m_super->s_dev; - __entry->trans_type = tic->t_trans_type; __entry->ocnt = tic->t_ocnt; __entry->cnt = tic->t_cnt; __entry->curr_res = tic->t_curr_res; @@ -980,14 +978,13 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class, __entry->curr_block = log->l_curr_block; __entry->tail_lsn = atomic64_read(&log->l_tail_lsn); ), - TP_printk("dev %d:%d type %s t_ocnt %u t_cnt %u t_curr_res %u " + TP_printk("dev %d:%d t_ocnt %u t_cnt %u t_curr_res %u " "t_unit_res %u t_flags %s reserveq %s " "writeq %s grant_reserve_cycle %d " "grant_reserve_bytes %d grant_write_cycle %d " "grant_write_bytes %d curr_cycle %d curr_block %d " "tail_cycle %d tail_block %d", MAJOR(__entry->dev), MINOR(__entry->dev), - __print_symbolic(__entry->trans_type, XFS_TRANS_TYPES), __entry->ocnt, __entry->cnt, __entry->curr_res, @@ -1053,19 +1050,21 @@ DECLARE_EVENT_CLASS(xfs_log_item_class, ) TRACE_EVENT(xfs_log_force, - TP_PROTO(struct xfs_mount *mp, xfs_lsn_t lsn), - TP_ARGS(mp, lsn), + TP_PROTO(struct xfs_mount *mp, xfs_lsn_t lsn, unsigned long caller_ip), + TP_ARGS(mp, lsn, caller_ip), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_lsn_t, lsn) + __field(unsigned long, caller_ip) ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; __entry->lsn = lsn; + __entry->caller_ip = caller_ip; ), - TP_printk("dev %d:%d lsn 0x%llx", + TP_printk("dev %d:%d lsn 0x%llx caller %ps", MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->lsn) + __entry->lsn, (void *)__entry->caller_ip) ) #define DEFINE_LOG_ITEM_EVENT(name) \ @@ -1136,15 +1135,14 @@ TRACE_EVENT(xfs_log_assign_tail_lsn, ) DECLARE_EVENT_CLASS(xfs_file_class, - TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags), - TP_ARGS(ip, count, offset, flags), + TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset), + TP_ARGS(ip, count, offset), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) __field(xfs_fsize_t, size) __field(loff_t, offset) __field(size_t, count) - __field(int, flags) ), TP_fast_assign( __entry->dev = VFS_I(ip)->i_sb->s_dev; @@ -1152,25 +1150,25 @@ DECLARE_EVENT_CLASS(xfs_file_class, __entry->size = ip->i_d.di_size; __entry->offset = offset; __entry->count = count; - __entry->flags = flags; ), - TP_printk("dev %d:%d ino 0x%llx size 0x%llx " - "offset 0x%llx count 0x%zx ioflags %s", + TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset 0x%llx count 0x%zx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->size, __entry->offset, - __entry->count, - __print_flags(__entry->flags, "|", XFS_IO_FLAGS)) + __entry->count) ) #define DEFINE_RW_EVENT(name) \ DEFINE_EVENT(xfs_file_class, name, \ - TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags), \ - TP_ARGS(ip, count, offset, flags)) 
-DEFINE_RW_EVENT(xfs_file_read); + TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset), \ + TP_ARGS(ip, count, offset)) +DEFINE_RW_EVENT(xfs_file_buffered_read); +DEFINE_RW_EVENT(xfs_file_direct_read); +DEFINE_RW_EVENT(xfs_file_dax_read); DEFINE_RW_EVENT(xfs_file_buffered_write); DEFINE_RW_EVENT(xfs_file_direct_write); +DEFINE_RW_EVENT(xfs_file_dax_write); DEFINE_RW_EVENT(xfs_file_splice_read); DECLARE_EVENT_CLASS(xfs_page_class, @@ -1297,6 +1295,9 @@ DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc); DEFINE_IOMAP_EVENT(xfs_get_blocks_found); DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc); DEFINE_IOMAP_EVENT(xfs_get_blocks_map_direct); +DEFINE_IOMAP_EVENT(xfs_iomap_alloc); +DEFINE_IOMAP_EVENT(xfs_iomap_found); +DEFINE_IOMAP_EVENT(xfs_iomap_not_found); DECLARE_EVENT_CLASS(xfs_simple_io_class, TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 20c53666cb4b..5f3d33d16e67 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -47,47 +47,6 @@ xfs_trans_init( } /* - * This routine is called to allocate a transaction structure. - * The type parameter indicates the type of the transaction. These - * are enumerated in xfs_trans.h. - * - * Dynamically allocate the transaction structure from the transaction - * zone, initialize it, and return it to the caller. - */ -xfs_trans_t * -xfs_trans_alloc( - xfs_mount_t *mp, - uint type) -{ - xfs_trans_t *tp; - - sb_start_intwrite(mp->m_super); - tp = _xfs_trans_alloc(mp, type, KM_SLEEP); - tp->t_flags |= XFS_TRANS_FREEZE_PROT; - return tp; -} - -xfs_trans_t * -_xfs_trans_alloc( - xfs_mount_t *mp, - uint type, - xfs_km_flags_t memflags) -{ - xfs_trans_t *tp; - - WARN_ON(mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE); - atomic_inc(&mp->m_active_trans); - - tp = kmem_zone_zalloc(xfs_trans_zone, memflags); - tp->t_magic = XFS_TRANS_HEADER_MAGIC; - tp->t_type = type; - tp->t_mountp = mp; - INIT_LIST_HEAD(&tp->t_items); - INIT_LIST_HEAD(&tp->t_busy); - return tp; -} - -/* * Free the transaction structure. If there is more clean up * to do when the structure is freed, add it here. */ @@ -99,7 +58,7 @@ xfs_trans_free( xfs_extent_busy_clear(tp->t_mountp, &tp->t_busy, false); atomic_dec(&tp->t_mountp->m_active_trans); - if (tp->t_flags & XFS_TRANS_FREEZE_PROT) + if (!(tp->t_flags & XFS_TRANS_NO_WRITECOUNT)) sb_end_intwrite(tp->t_mountp->m_super); xfs_trans_free_dqinfo(tp); kmem_zone_free(xfs_trans_zone, tp); @@ -125,7 +84,6 @@ xfs_trans_dup( * Initialize the new transaction structure. */ ntp->t_magic = XFS_TRANS_HEADER_MAGIC; - ntp->t_type = tp->t_type; ntp->t_mountp = tp->t_mountp; INIT_LIST_HEAD(&ntp->t_items); INIT_LIST_HEAD(&ntp->t_busy); @@ -135,9 +93,9 @@ xfs_trans_dup( ntp->t_flags = XFS_TRANS_PERM_LOG_RES | (tp->t_flags & XFS_TRANS_RESERVE) | - (tp->t_flags & XFS_TRANS_FREEZE_PROT); + (tp->t_flags & XFS_TRANS_NO_WRITECOUNT); /* We gave our writer reference to the new transaction */ - tp->t_flags &= ~XFS_TRANS_FREEZE_PROT; + tp->t_flags |= XFS_TRANS_NO_WRITECOUNT; ntp->t_ticket = xfs_log_ticket_get(tp->t_ticket); ntp->t_blk_res = tp->t_blk_res - tp->t_blk_res_used; tp->t_blk_res = tp->t_blk_res_used; @@ -165,7 +123,7 @@ xfs_trans_dup( * This does not do quota reservations. That typically is done by the * caller afterwards. 
*/ -int +static int xfs_trans_reserve( struct xfs_trans *tp, struct xfs_trans_res *resp, @@ -219,7 +177,7 @@ xfs_trans_reserve( resp->tr_logres, resp->tr_logcount, &tp->t_ticket, XFS_TRANSACTION, - permanent, tp->t_type); + permanent); } if (error) @@ -268,6 +226,42 @@ undo_blocks: return error; } +int +xfs_trans_alloc( + struct xfs_mount *mp, + struct xfs_trans_res *resp, + uint blocks, + uint rtextents, + uint flags, + struct xfs_trans **tpp) +{ + struct xfs_trans *tp; + int error; + + if (!(flags & XFS_TRANS_NO_WRITECOUNT)) + sb_start_intwrite(mp->m_super); + + WARN_ON(mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE); + atomic_inc(&mp->m_active_trans); + + tp = kmem_zone_zalloc(xfs_trans_zone, + (flags & XFS_TRANS_NOFS) ? KM_NOFS : KM_SLEEP); + tp->t_magic = XFS_TRANS_HEADER_MAGIC; + tp->t_flags = flags; + tp->t_mountp = mp; + INIT_LIST_HEAD(&tp->t_items); + INIT_LIST_HEAD(&tp->t_busy); + + error = xfs_trans_reserve(tp, resp, blocks, rtextents); + if (error) { + xfs_trans_cancel(tp); + return error; + } + + *tpp = tp; + return 0; +} + /* * Record the indicated change to the given field for application * to the file system's superblock when the transaction commits. diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index e7c49cf43fbc..9b2b9fa89331 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -52,6 +52,7 @@ typedef struct xfs_log_item { /* delayed logging */ struct list_head li_cil; /* CIL pointers */ struct xfs_log_vec *li_lv; /* active log vector */ + struct xfs_log_vec *li_lv_shadow; /* standby vector */ xfs_lsn_t li_seq; /* CIL commit seq */ } xfs_log_item_t; @@ -90,7 +91,6 @@ void xfs_log_item_init(struct xfs_mount *mp, struct xfs_log_item *item, */ typedef struct xfs_trans { unsigned int t_magic; /* magic number */ - unsigned int t_type; /* transaction type */ unsigned int t_log_res; /* amt of log space resvd */ unsigned int t_log_count; /* count for perm log res */ unsigned int t_blk_res; /* # of blocks resvd */ @@ -148,10 +148,9 @@ typedef struct xfs_trans { /* * XFS transaction mechanism exported interfaces. 
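The new xfs_trans_alloc() shown above folds allocation, reservation, and error cleanup into a single call, which is why the many call sites converted in this patch lose their explicit xfs_trans_reserve() and xfs_trans_cancel() pairs. A reduced model of the pattern:

#include <errno.h>
#include <stdlib.h>

struct trans { unsigned flags; unsigned blocks; };

static int reserve_space(struct trans *tp, unsigned blocks)
{
	tp->blocks = blocks;	/* pretend the reservation succeeded */
	return 0;
}

/* One entry point that undoes itself on failure: callers either get a
 * fully reserved transaction or just an error, never a half-made one. */
static int trans_alloc(unsigned blocks, unsigned flags, struct trans **tpp)
{
	struct trans *tp;
	int error;

	tp = calloc(1, sizeof(*tp));
	if (!tp)
		return -ENOMEM;
	tp->flags = flags;

	error = reserve_space(tp, blocks);
	if (error) {
		free(tp);	/* internal cancel; caller sees only the error */
		return error;
	}

	*tpp = tp;
	return 0;
}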
*/ -xfs_trans_t *xfs_trans_alloc(struct xfs_mount *, uint); -xfs_trans_t *_xfs_trans_alloc(struct xfs_mount *, uint, xfs_km_flags_t); -int xfs_trans_reserve(struct xfs_trans *, struct xfs_trans_res *, - uint, uint); +int xfs_trans_alloc(struct xfs_mount *mp, struct xfs_trans_res *resp, + uint blocks, uint rtextents, uint flags, + struct xfs_trans **tpp); void xfs_trans_mod_sb(xfs_trans_t *, uint, int64_t); struct xfs_buf *xfs_trans_get_buf_map(struct xfs_trans *tp, diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index d111f691f313..ea62245fee26 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -74,11 +74,12 @@ xfs_forget_acl( } static int -xfs_xattr_set(const struct xattr_handler *handler, struct dentry *dentry, - const char *name, const void *value, size_t size, int flags) +xfs_xattr_set(const struct xattr_handler *handler, struct dentry *unused, + struct inode *inode, const char *name, const void *value, + size_t size, int flags) { int xflags = handler->flags; - struct xfs_inode *ip = XFS_I(d_inode(dentry)); + struct xfs_inode *ip = XFS_I(inode); int error; /* Convert Linux syscall to XFS internal ATTR flags */ @@ -92,7 +93,7 @@ xfs_xattr_set(const struct xattr_handler *handler, struct dentry *dentry, error = xfs_attr_set(ip, (unsigned char *)name, (void *)value, size, xflags); if (!error) - xfs_forget_acl(d_inode(dentry), name, xflags); + xfs_forget_acl(inode, name, xflags); return error; } @@ -146,7 +147,7 @@ __xfs_xattr_put_listent( arraytop = context->count + prefix_len + namelen + 1; if (arraytop > context->firstu) { context->count = -1; /* insufficient space */ - return 1; + return 0; } offset = (char *)context->alist + context->count; strncpy(offset, prefix, prefix_len); @@ -166,8 +167,7 @@ xfs_xattr_put_listent( int flags, unsigned char *name, int namelen, - int valuelen, - unsigned char *value) + int valuelen) { char *prefix; int prefix_len; @@ -221,11 +221,15 @@ xfs_xattr_put_listent( } ssize_t -xfs_vn_listxattr(struct dentry *dentry, char *data, size_t size) +xfs_vn_listxattr( + struct dentry *dentry, + char *data, + size_t size) { struct xfs_attr_list_context context; struct attrlist_cursor_kern cursor = { 0 }; - struct inode *inode = d_inode(dentry); + struct inode *inode = d_inode(dentry); + int error; /* * First read the regular on-disk attributes. @@ -239,7 +243,9 @@ xfs_vn_listxattr(struct dentry *dentry, char *data, size_t size) context.firstu = context.bufsize; context.put_listent = xfs_xattr_put_listent; - xfs_attr_list_int(&context); + error = xfs_attr_list_int(&context); + if (error) + return error; if (context.count < 0) return -ERANGE;