From bc98a42c1f7d0f886c0c1b75a92a004976a46d9f Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 17 Jul 2017 08:45:34 +0100 Subject: VFS: Convert sb->s_flags & MS_RDONLY to sb_rdonly(sb) Firstly by applying the following with coccinelle's spatch: @@ expression SB; @@ -SB->s_flags & MS_RDONLY +sb_rdonly(SB) to effect the conversion to sb_rdonly(sb), then by applying: @@ expression A, SB; @@ ( -(!sb_rdonly(SB)) && A +!sb_rdonly(SB) && A | -A != (sb_rdonly(SB)) +A != sb_rdonly(SB) | -A == (sb_rdonly(SB)) +A == sb_rdonly(SB) | -!(sb_rdonly(SB)) +!sb_rdonly(SB) | -A && (sb_rdonly(SB)) +A && sb_rdonly(SB) | -A || (sb_rdonly(SB)) +A || sb_rdonly(SB) | -(sb_rdonly(SB)) != A +sb_rdonly(SB) != A | -(sb_rdonly(SB)) == A +sb_rdonly(SB) == A | -(sb_rdonly(SB)) && A +sb_rdonly(SB) && A | -(sb_rdonly(SB)) || A +sb_rdonly(SB) || A ) @@ expression A, B, SB; @@ ( -(sb_rdonly(SB)) ? 1 : 0 +sb_rdonly(SB) | -(sb_rdonly(SB)) ? A : B +sb_rdonly(SB) ? A : B ) to remove left over excess bracketage and finally by applying: @@ expression A, SB; @@ ( -(A & MS_RDONLY) != sb_rdonly(SB) +(bool)(A & MS_RDONLY) != sb_rdonly(SB) | -(A & MS_RDONLY) == sb_rdonly(SB) +(bool)(A & MS_RDONLY) == sb_rdonly(SB) ) to make comparisons against the result of sb_rdonly() (which is a bool) work correctly. Signed-off-by: David Howells --- fs/btrfs/super.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'fs/btrfs/super.c') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 12540b6104b5..086819312a44 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -102,7 +102,7 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info) { struct super_block *sb = fs_info->sb; - if (sb->s_flags & MS_RDONLY) + if (sb_rdonly(sb)) return; if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { @@ -138,7 +138,7 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function * Special case: if the error is EROFS, and we're already * under MS_RDONLY, then it is safe here. */ - if (errno == -EROFS && (sb->s_flags & MS_RDONLY)) + if (errno == -EROFS && sb_rdonly(sb)) return; #ifdef CONFIG_PRINTK @@ -1687,8 +1687,7 @@ static inline void btrfs_remount_cleanup(struct btrfs_fs_info *fs_info, * close or the filesystem is read only. */ if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) && - (!btrfs_raw_test_opt(fs_info->mount_opt, AUTO_DEFRAG) || - (fs_info->sb->s_flags & MS_RDONLY))) { + (!btrfs_raw_test_opt(fs_info->mount_opt, AUTO_DEFRAG) || sb_rdonly(fs_info->sb))) { btrfs_cleanup_defrag_inodes(fs_info); } @@ -1735,7 +1734,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) btrfs_resize_thread_pool(fs_info, fs_info->thread_pool_size, old_thread_pool_size); - if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) + if ((bool)(*flags & MS_RDONLY) == sb_rdonly(sb)) goto out; if (*flags & MS_RDONLY) { @@ -1835,7 +1834,7 @@ out: restore: /* We've hit an error - don't reset MS_RDONLY */ - if (sb->s_flags & MS_RDONLY) + if (sb_rdonly(sb)) old_flags |= MS_RDONLY; sb->s_flags = old_flags; fs_info->mount_opt = old_opts; -- cgit v1.2.3 From 5c1aab1dd5445ed8bdcdbb575abc1b0d7ee5b2e7 Mon Sep 17 00:00:00 2001 From: Nick Terrell Date: Wed, 9 Aug 2017 19:39:02 -0700 Subject: btrfs: Add zstd support Add zstd compression and decompression support to BtrFS. zstd at its fastest level compresses almost as well as zlib, while offering much faster compression and decompression, approaching lzo speeds. 
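Besides the mount option, this patch also wires "zstd" into the btrfs.compression property (see the props.c hunks below). As a minimal userspace sketch of selecting zstd for a single file through the xattr interface; the path is a placeholder and error handling is reduced to perror:

#include <stdio.h>
#include <sys/xattr.h>

int main(void)
{
        /* Hypothetical file on a btrfs mount; adjust the path. */
        const char *path = "/mnt/btrfs/file";

        /* Handled by prop_compression_validate()/apply() below. */
        if (setxattr(path, "btrfs.compression", "zstd", 4, 0) != 0) {
                perror("setxattr");
                return 1;
        }
        return 0;
}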
I benchmarked btrfs with zstd compression against no compression, lzo compression, and zlib compression. I benchmarked two scenarios. Copying a set of files to btrfs, and then reading the files. Copying a tarball to btrfs, extracting it to btrfs, and then reading the extracted files. After every operation, I call `sync` and include the sync time. Between every pair of operations I unmount and remount the filesystem to avoid caching. The benchmark files can be found in the upstream zstd source repository under `contrib/linux-kernel/{btrfs-benchmark.sh,btrfs-extract-benchmark.sh}` [1] [2]. I ran the benchmarks on a Ubuntu 14.04 VM with 2 cores and 4 GiB of RAM. The VM is running on a MacBook Pro with a 3.1 GHz Intel Core i7 processor, 16 GB of RAM, and a SSD. The first compression benchmark is copying 10 copies of the unzipped Silesia corpus [3] into a BtrFS filesystem mounted with `-o compress-force=Method`. The decompression benchmark times how long it takes to `tar` all 10 copies into `/dev/null`. The compression ratio is measured by comparing the output of `df` and `du`. See the benchmark file [1] for details. I benchmarked multiple zstd compression levels, although the patch uses zstd level 1. | Method | Ratio | Compression MB/s | Decompression speed | |---------|-------|------------------|---------------------| | None | 0.99 | 504 | 686 | | lzo | 1.66 | 398 | 442 | | zlib | 2.58 | 65 | 241 | | zstd 1 | 2.57 | 260 | 383 | | zstd 3 | 2.71 | 174 | 408 | | zstd 6 | 2.87 | 70 | 398 | | zstd 9 | 2.92 | 43 | 406 | | zstd 12 | 2.93 | 21 | 408 | | zstd 15 | 3.01 | 11 | 354 | The next benchmark first copies `linux-4.11.6.tar` [4] to btrfs. Then it measures the compression ratio, extracts the tar, and deletes the tar. Then it measures the compression ratio again, and `tar`s the extracted files into `/dev/null`. See the benchmark file [2] for details. 
| Method | Tar Ratio | Extract Ratio | Copy (s) | Extract (s)| Read (s) | |--------|-----------|---------------|----------|------------|----------| | None | 0.97 | 0.78 | 0.981 | 5.501 | 8.807 | | lzo | 2.06 | 1.38 | 1.631 | 8.458 | 8.585 | | zlib | 3.40 | 1.86 | 7.750 | 21.544 | 11.744 | | zstd 1 | 3.57 | 1.85 | 2.579 | 11.479 | 9.389 | [1] https://github.com/facebook/zstd/blob/dev/contrib/linux-kernel/btrfs-benchmark.sh [2] https://github.com/facebook/zstd/blob/dev/contrib/linux-kernel/btrfs-extract-benchmark.sh [3] http://sun.aei.polsl.pl/~sdeor/index.php?page=silesia [4] https://cdn.kernel.org/pub/linux/kernel/v4.x/linux-4.11.6.tar.xz zstd source repository: https://github.com/facebook/zstd Signed-off-by: Nick Terrell Signed-off-by: Chris Mason --- fs/btrfs/Kconfig | 2 + fs/btrfs/Makefile | 2 +- fs/btrfs/compression.c | 1 + fs/btrfs/compression.h | 6 +- fs/btrfs/ctree.h | 1 + fs/btrfs/disk-io.c | 2 + fs/btrfs/ioctl.c | 6 +- fs/btrfs/props.c | 6 + fs/btrfs/super.c | 12 +- fs/btrfs/sysfs.c | 2 + fs/btrfs/zstd.c | 432 +++++++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/btrfs.h | 8 +- 12 files changed, 468 insertions(+), 12 deletions(-) create mode 100644 fs/btrfs/zstd.c (limited to 'fs/btrfs/super.c') diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index 80e9c18ea64f..a26c63b4ad68 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -6,6 +6,8 @@ config BTRFS_FS select ZLIB_DEFLATE select LZO_COMPRESS select LZO_DECOMPRESS + select ZSTD_COMPRESS + select ZSTD_DECOMPRESS select RAID6_PQ select XOR_BLOCKS select SRCU diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 128ce17a80b0..962a95aefb81 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -6,7 +6,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ transaction.o inode.o file.o tree-defrag.o \ extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ - export.o tree-log.o free-space-cache.o zlib.o lzo.o \ + export.o tree-log.o free-space-cache.o zlib.o lzo.o zstd.o \ compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \ uuid-tree.o props.o hash.o free-space-tree.o diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index d2ef9ac2a630..4ff42d18a64d 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -704,6 +704,7 @@ static struct { static const struct btrfs_compress_op * const btrfs_compress_op[] = { &btrfs_zlib_compress, &btrfs_lzo_compress, + &btrfs_zstd_compress, }; void __init btrfs_init_compress(void) diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 87f6d3332163..2269e00854d8 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -99,8 +99,9 @@ enum btrfs_compression_type { BTRFS_COMPRESS_NONE = 0, BTRFS_COMPRESS_ZLIB = 1, BTRFS_COMPRESS_LZO = 2, - BTRFS_COMPRESS_TYPES = 2, - BTRFS_COMPRESS_LAST = 3, + BTRFS_COMPRESS_ZSTD = 3, + BTRFS_COMPRESS_TYPES = 3, + BTRFS_COMPRESS_LAST = 4, }; struct btrfs_compress_op { @@ -128,5 +129,6 @@ struct btrfs_compress_op { extern const struct btrfs_compress_op btrfs_zlib_compress; extern const struct btrfs_compress_op btrfs_lzo_compress; +extern const struct btrfs_compress_op btrfs_zstd_compress; #endif diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 3f3eb7b17cac..845d77c097d6 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -270,6 +270,7 @@ struct btrfs_super_block { BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \ 
BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \ BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \ + BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD | \ BTRFS_FEATURE_INCOMPAT_RAID56 | \ BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \ BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \ diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 080e2ebb8aa0..04632f4de933 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2828,6 +2828,8 @@ int open_ctree(struct super_block *sb, features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF; if (fs_info->compress_type == BTRFS_COMPRESS_LZO) features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO; + else if (fs_info->compress_type == BTRFS_COMPRESS_ZSTD) + features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD; if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA) btrfs_info(fs_info, "has skinny extents"); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index fa1b78cf25f6..b9963d94d727 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -327,8 +327,10 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) if (fs_info->compress_type == BTRFS_COMPRESS_LZO) comp = "lzo"; - else + else if (fs_info->compress_type == BTRFS_COMPRESS_ZLIB) comp = "zlib"; + else + comp = "zstd"; ret = btrfs_set_prop(inode, "btrfs.compression", comp, strlen(comp), 0); if (ret) @@ -1466,6 +1468,8 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, if (range->compress_type == BTRFS_COMPRESS_LZO) { btrfs_set_fs_incompat(fs_info, COMPRESS_LZO); + } else if (range->compress_type == BTRFS_COMPRESS_ZSTD) { + btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD); } ret = defrag_count; diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index 4b23ae5d0e5c..20631e9273a0 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -390,6 +390,8 @@ static int prop_compression_validate(const char *value, size_t len) return 0; else if (!strncmp("zlib", value, len)) return 0; + else if (!strncmp("zstd", value, len)) + return 0; return -EINVAL; } @@ -412,6 +414,8 @@ static int prop_compression_apply(struct inode *inode, type = BTRFS_COMPRESS_LZO; else if (!strncmp("zlib", value, len)) type = BTRFS_COMPRESS_ZLIB; + else if (!strncmp("zstd", value, len)) + type = BTRFS_COMPRESS_ZSTD; else return -EINVAL; @@ -429,6 +433,8 @@ static const char *prop_compression_extract(struct inode *inode) return "zlib"; case BTRFS_COMPRESS_LZO: return "lzo"; + case BTRFS_COMPRESS_ZSTD: + return "zstd"; } return NULL; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 12540b6104b5..c370deadb790 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -513,6 +513,14 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, btrfs_clear_opt(info->mount_opt, NODATASUM); btrfs_set_fs_incompat(info, COMPRESS_LZO); no_compress = 0; + } else if (strcmp(args[0].from, "zstd") == 0) { + compress_type = "zstd"; + info->compress_type = BTRFS_COMPRESS_ZSTD; + btrfs_set_opt(info->mount_opt, COMPRESS); + btrfs_clear_opt(info->mount_opt, NODATACOW); + btrfs_clear_opt(info->mount_opt, NODATASUM); + btrfs_set_fs_incompat(info, COMPRESS_ZSTD); + no_compress = 0; } else if (strncmp(args[0].from, "no", 2) == 0) { compress_type = "no"; btrfs_clear_opt(info->mount_opt, COMPRESS); @@ -1227,8 +1235,10 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) if (btrfs_test_opt(info, COMPRESS)) { if (info->compress_type == BTRFS_COMPRESS_ZLIB) compress_type = "zlib"; - else + else if (info->compress_type == BTRFS_COMPRESS_LZO) compress_type = "lzo"; + else + compress_type = "zstd"; if (btrfs_test_opt(info, FORCE_COMPRESS)) seq_printf(seq, 
",compress-force=%s", compress_type); else diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index c2d5f3580b4c..2b6d37c09a81 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -200,6 +200,7 @@ BTRFS_FEAT_ATTR_INCOMPAT(mixed_backref, MIXED_BACKREF); BTRFS_FEAT_ATTR_INCOMPAT(default_subvol, DEFAULT_SUBVOL); BTRFS_FEAT_ATTR_INCOMPAT(mixed_groups, MIXED_GROUPS); BTRFS_FEAT_ATTR_INCOMPAT(compress_lzo, COMPRESS_LZO); +BTRFS_FEAT_ATTR_INCOMPAT(compress_zstd, COMPRESS_ZSTD); BTRFS_FEAT_ATTR_INCOMPAT(big_metadata, BIG_METADATA); BTRFS_FEAT_ATTR_INCOMPAT(extended_iref, EXTENDED_IREF); BTRFS_FEAT_ATTR_INCOMPAT(raid56, RAID56); @@ -212,6 +213,7 @@ static struct attribute *btrfs_supported_feature_attrs[] = { BTRFS_FEAT_ATTR_PTR(default_subvol), BTRFS_FEAT_ATTR_PTR(mixed_groups), BTRFS_FEAT_ATTR_PTR(compress_lzo), + BTRFS_FEAT_ATTR_PTR(compress_zstd), BTRFS_FEAT_ATTR_PTR(big_metadata), BTRFS_FEAT_ATTR_PTR(extended_iref), BTRFS_FEAT_ATTR_PTR(raid56), diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c new file mode 100644 index 000000000000..607ce47b483a --- /dev/null +++ b/fs/btrfs/zstd.c @@ -0,0 +1,432 @@ +/* + * Copyright (c) 2016-present, Facebook, Inc. + * All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "compression.h" + +#define ZSTD_BTRFS_MAX_WINDOWLOG 17 +#define ZSTD_BTRFS_MAX_INPUT (1 << ZSTD_BTRFS_MAX_WINDOWLOG) +#define ZSTD_BTRFS_DEFAULT_LEVEL 3 + +static ZSTD_parameters zstd_get_btrfs_parameters(size_t src_len) +{ + ZSTD_parameters params = ZSTD_getParams(ZSTD_BTRFS_DEFAULT_LEVEL, + src_len, 0); + + if (params.cParams.windowLog > ZSTD_BTRFS_MAX_WINDOWLOG) + params.cParams.windowLog = ZSTD_BTRFS_MAX_WINDOWLOG; + WARN_ON(src_len > ZSTD_BTRFS_MAX_INPUT); + return params; +} + +struct workspace { + void *mem; + size_t size; + char *buf; + struct list_head list; +}; + +static void zstd_free_workspace(struct list_head *ws) +{ + struct workspace *workspace = list_entry(ws, struct workspace, list); + + kvfree(workspace->mem); + kfree(workspace->buf); + kfree(workspace); +} + +static struct list_head *zstd_alloc_workspace(void) +{ + ZSTD_parameters params = + zstd_get_btrfs_parameters(ZSTD_BTRFS_MAX_INPUT); + struct workspace *workspace; + + workspace = kzalloc(sizeof(*workspace), GFP_KERNEL); + if (!workspace) + return ERR_PTR(-ENOMEM); + + workspace->size = max_t(size_t, + ZSTD_CStreamWorkspaceBound(params.cParams), + ZSTD_DStreamWorkspaceBound(ZSTD_BTRFS_MAX_INPUT)); + workspace->mem = kvmalloc(workspace->size, GFP_KERNEL); + workspace->buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!workspace->mem || !workspace->buf) + goto fail; + + INIT_LIST_HEAD(&workspace->list); + + return &workspace->list; +fail: + zstd_free_workspace(&workspace->list); + return ERR_PTR(-ENOMEM); +} + +static int zstd_compress_pages(struct list_head *ws, + struct address_space *mapping, + u64 start, + struct page **pages, + unsigned long *out_pages, + unsigned long *total_in, + unsigned long *total_out) +{ + struct workspace *workspace = list_entry(ws, struct workspace, list); + ZSTD_CStream *stream; + int 
ret = 0; + int nr_pages = 0; + struct page *in_page = NULL; /* The current page to read */ + struct page *out_page = NULL; /* The current page to write to */ + ZSTD_inBuffer in_buf = { NULL, 0, 0 }; + ZSTD_outBuffer out_buf = { NULL, 0, 0 }; + unsigned long tot_in = 0; + unsigned long tot_out = 0; + unsigned long len = *total_out; + const unsigned long nr_dest_pages = *out_pages; + unsigned long max_out = nr_dest_pages * PAGE_SIZE; + ZSTD_parameters params = zstd_get_btrfs_parameters(len); + + *out_pages = 0; + *total_out = 0; + *total_in = 0; + + /* Initialize the stream */ + stream = ZSTD_initCStream(params, len, workspace->mem, + workspace->size); + if (!stream) { + pr_warn("BTRFS: ZSTD_initCStream failed\n"); + ret = -EIO; + goto out; + } + + /* map in the first page of input data */ + in_page = find_get_page(mapping, start >> PAGE_SHIFT); + in_buf.src = kmap(in_page); + in_buf.pos = 0; + in_buf.size = min_t(size_t, len, PAGE_SIZE); + + + /* Allocate and map in the output buffer */ + out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (out_page == NULL) { + ret = -ENOMEM; + goto out; + } + pages[nr_pages++] = out_page; + out_buf.dst = kmap(out_page); + out_buf.pos = 0; + out_buf.size = min_t(size_t, max_out, PAGE_SIZE); + + while (1) { + size_t ret2; + + ret2 = ZSTD_compressStream(stream, &out_buf, &in_buf); + if (ZSTD_isError(ret2)) { + pr_debug("BTRFS: ZSTD_compressStream returned %d\n", + ZSTD_getErrorCode(ret2)); + ret = -EIO; + goto out; + } + + /* Check to see if we are making it bigger */ + if (tot_in + in_buf.pos > 8192 && + tot_in + in_buf.pos < + tot_out + out_buf.pos) { + ret = -E2BIG; + goto out; + } + + /* We've reached the end of our output range */ + if (out_buf.pos >= max_out) { + tot_out += out_buf.pos; + ret = -E2BIG; + goto out; + } + + /* Check if we need more output space */ + if (out_buf.pos == out_buf.size) { + tot_out += PAGE_SIZE; + max_out -= PAGE_SIZE; + kunmap(out_page); + if (nr_pages == nr_dest_pages) { + out_page = NULL; + ret = -E2BIG; + goto out; + } + out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (out_page == NULL) { + ret = -ENOMEM; + goto out; + } + pages[nr_pages++] = out_page; + out_buf.dst = kmap(out_page); + out_buf.pos = 0; + out_buf.size = min_t(size_t, max_out, PAGE_SIZE); + } + + /* We've reached the end of the input */ + if (in_buf.pos >= len) { + tot_in += in_buf.pos; + break; + } + + /* Check if we need more input */ + if (in_buf.pos == in_buf.size) { + tot_in += PAGE_SIZE; + kunmap(in_page); + put_page(in_page); + + start += PAGE_SIZE; + len -= PAGE_SIZE; + in_page = find_get_page(mapping, start >> PAGE_SHIFT); + in_buf.src = kmap(in_page); + in_buf.pos = 0; + in_buf.size = min_t(size_t, len, PAGE_SIZE); + } + } + while (1) { + size_t ret2; + + ret2 = ZSTD_endStream(stream, &out_buf); + if (ZSTD_isError(ret2)) { + pr_debug("BTRFS: ZSTD_endStream returned %d\n", + ZSTD_getErrorCode(ret2)); + ret = -EIO; + goto out; + } + if (ret2 == 0) { + tot_out += out_buf.pos; + break; + } + if (out_buf.pos >= max_out) { + tot_out += out_buf.pos; + ret = -E2BIG; + goto out; + } + + tot_out += PAGE_SIZE; + max_out -= PAGE_SIZE; + kunmap(out_page); + if (nr_pages == nr_dest_pages) { + out_page = NULL; + ret = -E2BIG; + goto out; + } + out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (out_page == NULL) { + ret = -ENOMEM; + goto out; + } + pages[nr_pages++] = out_page; + out_buf.dst = kmap(out_page); + out_buf.pos = 0; + out_buf.size = min_t(size_t, max_out, PAGE_SIZE); + } + + if (tot_out >= tot_in) { + ret = -E2BIG; + goto out; + } + + ret 
= 0; + *total_in = tot_in; + *total_out = tot_out; +out: + *out_pages = nr_pages; + /* Cleanup */ + if (in_page) { + kunmap(in_page); + put_page(in_page); + } + if (out_page) + kunmap(out_page); + return ret; +} + +static int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) +{ + struct workspace *workspace = list_entry(ws, struct workspace, list); + struct page **pages_in = cb->compressed_pages; + u64 disk_start = cb->start; + struct bio *orig_bio = cb->orig_bio; + size_t srclen = cb->compressed_len; + ZSTD_DStream *stream; + int ret = 0; + unsigned long page_in_index = 0; + unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_SIZE); + unsigned long buf_start; + unsigned long total_out = 0; + ZSTD_inBuffer in_buf = { NULL, 0, 0 }; + ZSTD_outBuffer out_buf = { NULL, 0, 0 }; + + stream = ZSTD_initDStream( + ZSTD_BTRFS_MAX_INPUT, workspace->mem, workspace->size); + if (!stream) { + pr_debug("BTRFS: ZSTD_initDStream failed\n"); + ret = -EIO; + goto done; + } + + in_buf.src = kmap(pages_in[page_in_index]); + in_buf.pos = 0; + in_buf.size = min_t(size_t, srclen, PAGE_SIZE); + + out_buf.dst = workspace->buf; + out_buf.pos = 0; + out_buf.size = PAGE_SIZE; + + while (1) { + size_t ret2; + + ret2 = ZSTD_decompressStream(stream, &out_buf, &in_buf); + if (ZSTD_isError(ret2)) { + pr_debug("BTRFS: ZSTD_decompressStream returned %d\n", + ZSTD_getErrorCode(ret2)); + ret = -EIO; + goto done; + } + buf_start = total_out; + total_out += out_buf.pos; + out_buf.pos = 0; + + ret = btrfs_decompress_buf2page(out_buf.dst, buf_start, + total_out, disk_start, orig_bio); + if (ret == 0) + break; + + if (in_buf.pos >= srclen) + break; + + /* Check if we've hit the end of a frame */ + if (ret2 == 0) + break; + + if (in_buf.pos == in_buf.size) { + kunmap(pages_in[page_in_index++]); + if (page_in_index >= total_pages_in) { + in_buf.src = NULL; + ret = -EIO; + goto done; + } + srclen -= PAGE_SIZE; + in_buf.src = kmap(pages_in[page_in_index]); + in_buf.pos = 0; + in_buf.size = min_t(size_t, srclen, PAGE_SIZE); + } + } + ret = 0; + zero_fill_bio(orig_bio); +done: + if (in_buf.src) + kunmap(pages_in[page_in_index]); + return ret; +} + +static int zstd_decompress(struct list_head *ws, unsigned char *data_in, + struct page *dest_page, + unsigned long start_byte, + size_t srclen, size_t destlen) +{ + struct workspace *workspace = list_entry(ws, struct workspace, list); + ZSTD_DStream *stream; + int ret = 0; + size_t ret2; + ZSTD_inBuffer in_buf = { NULL, 0, 0 }; + ZSTD_outBuffer out_buf = { NULL, 0, 0 }; + unsigned long total_out = 0; + unsigned long pg_offset = 0; + char *kaddr; + + stream = ZSTD_initDStream( + ZSTD_BTRFS_MAX_INPUT, workspace->mem, workspace->size); + if (!stream) { + pr_warn("BTRFS: ZSTD_initDStream failed\n"); + ret = -EIO; + goto finish; + } + + destlen = min_t(size_t, destlen, PAGE_SIZE); + + in_buf.src = data_in; + in_buf.pos = 0; + in_buf.size = srclen; + + out_buf.dst = workspace->buf; + out_buf.pos = 0; + out_buf.size = PAGE_SIZE; + + ret2 = 1; + while (pg_offset < destlen && in_buf.pos < in_buf.size) { + unsigned long buf_start; + unsigned long buf_offset; + unsigned long bytes; + + /* Check if the frame is over and we still need more input */ + if (ret2 == 0) { + pr_debug("BTRFS: ZSTD_decompressStream ended early\n"); + ret = -EIO; + goto finish; + } + ret2 = ZSTD_decompressStream(stream, &out_buf, &in_buf); + if (ZSTD_isError(ret2)) { + pr_debug("BTRFS: ZSTD_decompressStream returned %d\n", + ZSTD_getErrorCode(ret2)); + ret = -EIO; + goto finish; + } + + buf_start = 
total_out; + total_out += out_buf.pos; + out_buf.pos = 0; + + if (total_out <= start_byte) + continue; + + if (total_out > start_byte && buf_start < start_byte) + buf_offset = start_byte - buf_start; + else + buf_offset = 0; + + bytes = min_t(unsigned long, destlen - pg_offset, + out_buf.size - buf_offset); + + kaddr = kmap_atomic(dest_page); + memcpy(kaddr + pg_offset, out_buf.dst + buf_offset, bytes); + kunmap_atomic(kaddr); + + pg_offset += bytes; + } + ret = 0; +finish: + if (pg_offset < destlen) { + kaddr = kmap_atomic(dest_page); + memset(kaddr + pg_offset, 0, destlen - pg_offset); + kunmap_atomic(kaddr); + } + return ret; +} + +const struct btrfs_compress_op btrfs_zstd_compress = { + .alloc_workspace = zstd_alloc_workspace, + .free_workspace = zstd_free_workspace, + .compress_pages = zstd_compress_pages, + .decompress_bio = zstd_decompress_bio, + .decompress = zstd_decompress, +}; diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index 9aa74f317747..378230c163d5 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -255,13 +255,7 @@ struct btrfs_ioctl_fs_info_args { #define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL (1ULL << 1) #define BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS (1ULL << 2) #define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO (1ULL << 3) -/* - * some patches floated around with a second compression method - * lets save that incompat here for when they do get in - * Note we don't actually support it, we're just reserving the - * number - */ -#define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZOv2 (1ULL << 4) +#define BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD (1ULL << 4) /* * older kernels tried to do bigger metadata blocks, but the -- cgit v1.2.3 From 00142756e1f8015d2f8ce96532d156689db7e448 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Wed, 12 Jul 2017 16:20:08 -0600 Subject: btrfs: backref, add tracepoints for prelim_ref insertion and merging This patch adds a tracepoint event for prelim_ref insertion and merging. For each, the ref being inserted or merged and the count of tree nodes is issued. Signed-off-by: Jeff Mahoney Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/backref.c | 118 ++++++++++++++++++++++--------------------- fs/btrfs/backref.h | 12 +++++ fs/btrfs/super.c | 1 + include/trace/events/btrfs.h | 58 +++++++++++++++++++++ 4 files changed, 131 insertions(+), 58 deletions(-) (limited to 'fs/btrfs/super.c') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 297f33850425..4cda81964dd4 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -18,6 +18,7 @@ #include #include +#include #include "ctree.h" #include "disk-io.h" #include "backref.h" @@ -120,20 +121,6 @@ static int find_extent_in_eb(const struct extent_buffer *eb, return 0; } -/* - * this structure records all encountered refs on the way up to the root - */ -struct prelim_ref { - struct rb_node rbnode; - u64 root_id; - struct btrfs_key key_for_search; - int level; - int count; - struct extent_inode_elem *inode_list; - u64 parent; - u64 wanted_disk_byte; -}; - struct preftree { struct rb_root root; unsigned int count; @@ -212,7 +199,8 @@ static int prelim_ref_compare(struct prelim_ref *ref1, * * Callers should assumed that newref has been freed after calling. 
*/ -static void prelim_ref_insert(struct preftree *preftree, +static void prelim_ref_insert(const struct btrfs_fs_info *fs_info, + struct preftree *preftree, struct prelim_ref *newref) { struct rb_root *root; @@ -243,6 +231,8 @@ static void prelim_ref_insert(struct preftree *preftree, ref->inode_list = newref->inode_list; else eie->next = newref->inode_list; + trace_btrfs_prelim_ref_merge(fs_info, ref, newref, + preftree->count); ref->count += newref->count; free_pref(newref); return; @@ -250,6 +240,7 @@ static void prelim_ref_insert(struct preftree *preftree, } preftree->count++; + trace_btrfs_prelim_ref_insert(fs_info, newref, NULL, preftree->count); rb_link_node(&newref->rbnode, parent, p); rb_insert_color(&newref->rbnode, root); } @@ -308,7 +299,8 @@ static void prelim_release(struct preftree *preftree) * additional information that's available but not required to find the parent * block might help in merging entries to gain some speed. */ -static int add_prelim_ref(struct preftree *preftree, u64 root_id, +static int add_prelim_ref(const struct btrfs_fs_info *fs_info, + struct preftree *preftree, u64 root_id, const struct btrfs_key *key, int level, u64 parent, u64 wanted_disk_byte, int count, gfp_t gfp_mask) { @@ -355,21 +347,23 @@ static int add_prelim_ref(struct preftree *preftree, u64 root_id, ref->count = count; ref->parent = parent; ref->wanted_disk_byte = wanted_disk_byte; - prelim_ref_insert(preftree, ref); + prelim_ref_insert(fs_info, preftree, ref); return 0; } /* direct refs use root == 0, key == NULL */ -static int add_direct_ref(struct preftrees *preftrees, int level, u64 parent, +static int add_direct_ref(const struct btrfs_fs_info *fs_info, + struct preftrees *preftrees, int level, u64 parent, u64 wanted_disk_byte, int count, gfp_t gfp_mask) { - return add_prelim_ref(&preftrees->direct, 0, NULL, level, parent, - wanted_disk_byte, count, gfp_mask); + return add_prelim_ref(fs_info, &preftrees->direct, 0, NULL, level, + parent, wanted_disk_byte, count, gfp_mask); } /* indirect refs use parent == 0 */ -static int add_indirect_ref(struct preftrees *preftrees, u64 root_id, +static int add_indirect_ref(const struct btrfs_fs_info *fs_info, + struct preftrees *preftrees, u64 root_id, const struct btrfs_key *key, int level, u64 wanted_disk_byte, int count, gfp_t gfp_mask) { @@ -377,7 +371,7 @@ static int add_indirect_ref(struct preftrees *preftrees, u64 root_id, if (!key) tree = &preftrees->indirect_missing_keys; - return add_prelim_ref(tree, root_id, key, level, 0, + return add_prelim_ref(fs_info, tree, root_id, key, level, 0, wanted_disk_byte, count, gfp_mask); } @@ -631,7 +625,7 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info, * and return directly. 
*/ if (err == -ENOENT) { - prelim_ref_insert(&preftrees->direct, ref); + prelim_ref_insert(fs_info, &preftrees->direct, ref); continue; } else if (err) { free_pref(ref); @@ -659,11 +653,11 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info, memcpy(new_ref, ref, sizeof(*ref)); new_ref->parent = node->val; new_ref->inode_list = unode_aux_to_inode_list(node); - prelim_ref_insert(&preftrees->direct, new_ref); + prelim_ref_insert(fs_info, &preftrees->direct, new_ref); } /* Now it's a direct ref, put it in the the direct tree */ - prelim_ref_insert(&preftrees->direct, ref); + prelim_ref_insert(fs_info, &preftrees->direct, ref); ulist_reinit(parents); } @@ -707,7 +701,7 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info, btrfs_node_key_to_cpu(eb, &ref->key_for_search, 0); btrfs_tree_read_unlock(eb); free_extent_buffer(eb); - prelim_ref_insert(&preftrees->indirect, ref); + prelim_ref_insert(fs_info, &preftrees->indirect, ref); } return 0; } @@ -716,7 +710,8 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info, * add all currently queued delayed refs from this head whose seq nr is * smaller or equal that seq to the list */ -static int add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, +static int add_delayed_refs(const struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_head *head, u64 seq, struct preftrees *preftrees, u64 *total_refs, u64 inum) { @@ -759,8 +754,9 @@ static int add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, struct btrfs_delayed_tree_ref *ref; ref = btrfs_delayed_node_to_tree_ref(node); - ret = add_indirect_ref(preftrees, ref->root, &tmp_op_key, - ref->level + 1, node->bytenr, + ret = add_indirect_ref(fs_info, preftrees, ref->root, + &tmp_op_key, ref->level + 1, + node->bytenr, node->ref_mod * sgn, GFP_ATOMIC); break; @@ -771,9 +767,9 @@ static int add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, ref = btrfs_delayed_node_to_tree_ref(node); - ret = add_direct_ref(preftrees, ref->level + 1, - ref->parent, node->bytenr, - node->ref_mod * sgn, + ret = add_direct_ref(fs_info, preftrees, + ref->level + 1, ref->parent, + node->bytenr, node->ref_mod * sgn, GFP_ATOMIC); break; } @@ -795,8 +791,8 @@ static int add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, break; } - ret = add_indirect_ref(preftrees, ref->root, &key, 0, - node->bytenr, + ret = add_indirect_ref(fs_info, preftrees, ref->root, + &key, 0, node->bytenr, node->ref_mod * sgn, GFP_ATOMIC); break; @@ -807,8 +803,8 @@ static int add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, ref = btrfs_delayed_node_to_data_ref(node); - ret = add_direct_ref(preftrees, 0, ref->parent, - node->bytenr, + ret = add_direct_ref(fs_info, preftrees, 0, + ref->parent, node->bytenr, node->ref_mod * sgn, GFP_ATOMIC); break; @@ -826,7 +822,8 @@ static int add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, /* * add all inline backrefs for bytenr to the list */ -static int add_inline_refs(struct btrfs_path *path, u64 bytenr, +static int add_inline_refs(const struct btrfs_fs_info *fs_info, + struct btrfs_path *path, u64 bytenr, int *info_level, struct preftrees *preftrees, u64 *total_refs, u64 inum) { @@ -883,7 +880,8 @@ static int add_inline_refs(struct btrfs_path *path, u64 bytenr, switch (type) { case BTRFS_SHARED_BLOCK_REF_KEY: - ret = add_direct_ref(preftrees, *info_level + 1, offset, + ret = add_direct_ref(fs_info, preftrees, + *info_level + 1, offset, bytenr, 1, GFP_NOFS); break; case BTRFS_SHARED_DATA_REF_KEY: { @@ -893,14 +891,14 @@ 
static int add_inline_refs(struct btrfs_path *path, u64 bytenr, sdref = (struct btrfs_shared_data_ref *)(iref + 1); count = btrfs_shared_data_ref_count(leaf, sdref); - ret = add_direct_ref(preftrees, 0, offset, + ret = add_direct_ref(fs_info, preftrees, 0, offset, bytenr, count, GFP_NOFS); break; } case BTRFS_TREE_BLOCK_REF_KEY: - ret = add_indirect_ref(preftrees, offset, NULL, - *info_level + 1, bytenr, 1, - GFP_NOFS); + ret = add_indirect_ref(fs_info, preftrees, offset, + NULL, *info_level + 1, + bytenr, 1, GFP_NOFS); break; case BTRFS_EXTENT_DATA_REF_KEY: { struct btrfs_extent_data_ref *dref; @@ -921,8 +919,9 @@ static int add_inline_refs(struct btrfs_path *path, u64 bytenr, root = btrfs_extent_data_ref_root(leaf, dref); - ret = add_indirect_ref(preftrees, root, &key, 0, bytenr, - count, GFP_NOFS); + ret = add_indirect_ref(fs_info, preftrees, root, + &key, 0, bytenr, count, + GFP_NOFS); break; } default: @@ -973,9 +972,9 @@ static int add_keyed_refs(struct btrfs_fs_info *fs_info, switch (key.type) { case BTRFS_SHARED_BLOCK_REF_KEY: /* SHARED DIRECT METADATA backref */ - ret = add_direct_ref(preftrees, info_level + 1, - key.offset, bytenr, 1, - GFP_NOFS); + ret = add_direct_ref(fs_info, preftrees, + info_level + 1, key.offset, + bytenr, 1, GFP_NOFS); break; case BTRFS_SHARED_DATA_REF_KEY: { /* SHARED DIRECT FULL backref */ @@ -985,15 +984,16 @@ static int add_keyed_refs(struct btrfs_fs_info *fs_info, sdref = btrfs_item_ptr(leaf, slot, struct btrfs_shared_data_ref); count = btrfs_shared_data_ref_count(leaf, sdref); - ret = add_direct_ref(preftrees, 0, key.offset, bytenr, - count, GFP_NOFS); + ret = add_direct_ref(fs_info, preftrees, 0, + key.offset, bytenr, count, + GFP_NOFS); break; } case BTRFS_TREE_BLOCK_REF_KEY: /* NORMAL INDIRECT METADATA backref */ - ret = add_indirect_ref(preftrees, key.offset, NULL, - info_level + 1, bytenr, 1, - GFP_NOFS); + ret = add_indirect_ref(fs_info, preftrees, key.offset, + NULL, info_level + 1, bytenr, + 1, GFP_NOFS); break; case BTRFS_EXTENT_DATA_REF_KEY: { /* NORMAL INDIRECT DATA backref */ @@ -1015,8 +1015,9 @@ static int add_keyed_refs(struct btrfs_fs_info *fs_info, } root = btrfs_extent_data_ref_root(leaf, dref); - ret = add_indirect_ref(preftrees, root, &key, 0, bytenr, - count, GFP_NOFS); + ret = add_indirect_ref(fs_info, preftrees, root, + &key, 0, bytenr, count, + GFP_NOFS); break; } default: @@ -1129,8 +1130,8 @@ again: goto again; } spin_unlock(&delayed_refs->lock); - ret = add_delayed_refs(head, time_seq, &preftrees, - &total_refs, inum); + ret = add_delayed_refs(fs_info, head, time_seq, + &preftrees, &total_refs, inum); mutex_unlock(&head->mutex); if (ret) goto out; @@ -1150,8 +1151,9 @@ again: if (key.objectid == bytenr && (key.type == BTRFS_EXTENT_ITEM_KEY || key.type == BTRFS_METADATA_ITEM_KEY)) { - ret = add_inline_refs(path, bytenr, &info_level, - &preftrees, &total_refs, inum); + ret = add_inline_refs(fs_info, path, bytenr, + &info_level, &preftrees, + &total_refs, inum); if (ret) goto out; ret = add_keyed_refs(fs_info, path, bytenr, info_level, diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index f9428aaaa77a..e410335841aa 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -72,4 +72,16 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr); int __init btrfs_prelim_ref_init(void); void btrfs_prelim_ref_exit(void); + +struct prelim_ref { + struct rb_node rbnode; + u64 root_id; + struct btrfs_key key_for_search; + int level; + int count; + struct extent_inode_elem *inode_list; + u64 parent; + u64 
wanted_disk_byte; +}; + #endif diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 12540b6104b5..58650f2e0f17 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -61,6 +61,7 @@ #include "tests/btrfs-tests.h" #include "qgroup.h" +#include "backref.h" #define CREATE_TRACE_POINTS #include diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 42560feb9920..90d25085762f 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -26,6 +26,7 @@ struct btrfs_work; struct __btrfs_workqueue; struct btrfs_qgroup_extent_record; struct btrfs_qgroup; +struct prelim_ref; #define show_ref_type(type) \ __print_symbolic(type, \ @@ -1636,6 +1637,63 @@ TRACE_EVENT(qgroup_meta_reserve, show_root_type(__entry->refroot), __entry->diff) ); +DECLARE_EVENT_CLASS(btrfs__prelim_ref, + TP_PROTO(const struct btrfs_fs_info *fs_info, + const struct prelim_ref *oldref, + const struct prelim_ref *newref, u64 tree_size), + TP_ARGS(fs_info, newref, oldref, tree_size), + + TP_STRUCT__entry_btrfs( + __field( u64, root_id ) + __field( u64, objectid ) + __field( u8, type ) + __field( u64, offset ) + __field( int, level ) + __field( int, old_count ) + __field( u64, parent ) + __field( u64, bytenr ) + __field( int, mod_count ) + __field( u64, tree_size ) + ), + + TP_fast_assign_btrfs(fs_info, + __entry->root_id = oldref->root_id; + __entry->objectid = oldref->key_for_search.objectid; + __entry->type = oldref->key_for_search.type; + __entry->offset = oldref->key_for_search.offset; + __entry->level = oldref->level; + __entry->old_count = oldref->count; + __entry->parent = oldref->parent; + __entry->bytenr = oldref->wanted_disk_byte; + __entry->mod_count = newref ? newref->count : 0; + __entry->tree_size = tree_size; + ), + + TP_printk_btrfs("root_id=%llu key=[%llu,%u,%llu] level=%d count=[%d+%d=%d] parent=%llu wanted_disk_byte=%llu nodes=%llu", + (unsigned long long)__entry->root_id, + (unsigned long long)__entry->objectid, __entry->type, + (unsigned long long)__entry->offset, __entry->level, + __entry->old_count, __entry->mod_count, + __entry->old_count + __entry->mod_count, + (unsigned long long)__entry->parent, + (unsigned long long)__entry->bytenr, + (unsigned long long)__entry->tree_size) +); + +DEFINE_EVENT(btrfs__prelim_ref, btrfs_prelim_ref_merge, + TP_PROTO(const struct btrfs_fs_info *fs_info, + const struct prelim_ref *oldref, + const struct prelim_ref *newref, u64 tree_size), + TP_ARGS(fs_info, oldref, newref, tree_size) +); + +DEFINE_EVENT(btrfs__prelim_ref, btrfs_prelim_ref_insert, + TP_PROTO(const struct btrfs_fs_info *fs_info, + const struct prelim_ref *oldref, + const struct prelim_ref *newref, u64 tree_size), + TP_ARGS(fs_info, oldref, newref, tree_size) +); + #endif /* _TRACE_BTRFS_H */ /* This part must be outside protection */ -- cgit v1.2.3 From 6c6b5a39c4bf3dbd8cf629c9f5450e983c19dbb9 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Tue, 4 Jul 2017 21:49:06 +1000 Subject: btrfs: resume qgroup rescan on rw remount Several distributions mount the "proper root" as ro during initrd and then remount it as rw before pivot_root(2). Thus, if a rescan had been aborted by a previous shutdown, the rescan would never be resumed. This issue would manifest itself as several btrfs ioctl(2)s causing the entire machine to hang when btrfs_qgroup_wait_for_completion was hit (due to the fs_info->qgroup_rescan_running flag being set but the rescan itself not being resumed). 
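A simplified sketch of the wait path involved (the field names follow fs/btrfs/qgroup.c, but the body is an illustration of the hang, not the verbatim kernel function):

/*
 * With qgroup_rescan_running still set from before the shutdown, callers
 * block on a completion that only the rescan worker signals, and nothing
 * on the ro mount ever requeues that worker.
 */
static int qgroup_wait_sketch(struct btrfs_fs_info *fs_info)
{
        bool running;

        mutex_lock(&fs_info->qgroup_rescan_lock);
        running = fs_info->qgroup_rescan_running;
        mutex_unlock(&fs_info->qgroup_rescan_lock);

        if (running)
                return wait_for_completion_interruptible(
                        &fs_info->qgroup_rescan_completion);
        return 0;
}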
Notably, Docker's btrfs storage driver makes regular use of BTRFS_QUOTA_CTL_DISABLE and BTRFS_IOC_QUOTA_RESCAN_WAIT (causing this problem to be manifested on boot for some machines). Cc: # v3.11+ Cc: Jeff Mahoney Fixes: b382a324b60f ("Btrfs: fix qgroup rescan resume on mount") Signed-off-by: Aleksa Sarai Reviewed-by: Nikolay Borisov Tested-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/super.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs/btrfs/super.c') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 58650f2e0f17..2351794fbc44 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1815,6 +1815,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) goto restore; } + btrfs_qgroup_rescan_resume(fs_info); + if (!fs_info->uuid_root) { btrfs_info(fs_info, "creating UUID tree"); ret = btrfs_create_uuid_tree(fs_info); -- cgit v1.2.3 From b382cfe889da5d39aeceb79e261f862c3a53eafb Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 9 Mar 2017 09:34:38 +0800 Subject: btrfs: Do chunk level check for degraded remount Just the same for mount time check, use btrfs_check_rw_degradable() to check if we are OK to be remounted rw. Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/super.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs/btrfs/super.c') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 2351794fbc44..e4c268c250f5 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1781,8 +1781,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) goto restore; } - if (fs_info->fs_devices->missing_devices > - fs_info->num_tolerated_disk_barrier_failures) { + if (!btrfs_check_rw_degradable(fs_info)) { btrfs_warn(fs_info, "too many missing devices, writeable remount is not allowed"); ret = -EACCES; -- cgit v1.2.3 From 3ec836211183eee87609f832a949dfe711af2b5a Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 22 Jun 2017 02:26:54 +0200 Subject: btrfs: use GFP_KERNEL in mount and remount We don't need to restrict the allocation flags in btrfs_mount or _remount. No big filesystem locks are held (possibly s_umount but that does no count here). Signed-off-by: David Sterba --- fs/btrfs/super.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'fs/btrfs/super.c') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index e4c268c250f5..64981fc8e39e 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -426,7 +426,7 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, * strsep changes the string, duplicate it because parse_options * gets called twice */ - options = kstrdup(options, GFP_NOFS); + options = kstrdup(options, GFP_KERNEL); if (!options) return -ENOMEM; @@ -950,7 +950,7 @@ static char *get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, } path->leave_spinning = 1; - name = kmalloc(PATH_MAX, GFP_NOFS); + name = kmalloc(PATH_MAX, GFP_KERNEL); if (!name) { ret = -ENOMEM; goto err; @@ -1336,10 +1336,11 @@ static char *setup_root_args(char *args) char *buf, *dst, *sep; if (!args) - return kstrdup("subvolid=0", GFP_NOFS); + return kstrdup("subvolid=0", GFP_KERNEL); /* The worst case is that we add ",subvolid=0" to the end. 
*/ - buf = dst = kmalloc(strlen(args) + strlen(",subvolid=0") + 1, GFP_NOFS); + buf = dst = kmalloc(strlen(args) + strlen(",subvolid=0") + 1, + GFP_KERNEL); if (!buf) return NULL; @@ -1568,7 +1569,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, * it for searching for existing supers, so this lets us do that and * then open_ctree will properly initialize everything later. */ - fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_NOFS); + fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_KERNEL); if (!fs_info) { error = -ENOMEM; goto error_sec_opts; @@ -1576,8 +1577,8 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, fs_info->fs_devices = fs_devices; - fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS); - fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS); + fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL); + fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL); security_init_mnt_opts(&fs_info->security_opts); if (!fs_info->super_copy || !fs_info->super_for_commit) { error = -ENOMEM; -- cgit v1.2.3 From a7164fa4e055daf6368cb68ed946aa5a362a1a75 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 17 Jul 2017 18:11:10 +0200 Subject: btrfs: prepare for extensions in compression options This is a minimal patch intended to be backported to older kernels. We're going to extend the string specifying the compression method and this would fail on kernels before that change (the string is compared exactly). Relax the string matching only to the prefix, ie. ignoring anything that goes after "zlib" or "lzo", regardless of th format extension we decide to use. This applies to the mount options and properties. That way, patched old kernels could be booted on systems already utilizing the new compression spec. Applicable since commit 63541927c8d11, v3.14. 
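A small userspace illustration of the relaxed matching; "zlib:9" stands in for a hypothetical future format extension and is not something this patch defines:

#include <stdio.h>
#include <string.h>

int main(void)
{
        const char *values[] = { "zlib", "zlib:9", "lzo", "zstd" };

        for (int i = 0; i < 4; i++) {
                /* Compare only the known prefix, as the hunks below do. */
                if (!strncmp("zlib", values[i], 4))
                        printf("%-7s -> zlib\n", values[i]);
                else if (!strncmp("lzo", values[i], 3))
                        printf("%-7s -> lzo\n", values[i]);
                else
                        printf("%-7s -> unrecognized\n", values[i]);
        }
        return 0;
}

On a patched old kernel, "zlib:9" still selects zlib, whereas the previous exact match would have rejected it.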
Signed-off-by: David Sterba --- fs/btrfs/props.c | 4 ++-- fs/btrfs/super.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'fs/btrfs/super.c') diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index 916f5cf9b292..09c0266f248d 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -408,9 +408,9 @@ static int prop_compression_apply(struct inode *inode, return 0; } - if (!strncmp("lzo", value, len)) + if (!strncmp("lzo", value, 3)) type = BTRFS_COMPRESS_LZO; - else if (!strncmp("zlib", value, len)) + else if (!strncmp("zlib", value, 4)) type = BTRFS_COMPRESS_ZLIB; else return -EINVAL; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 64981fc8e39e..8a9bcad3b06a 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -499,14 +499,14 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, btrfs_test_opt(info, FORCE_COMPRESS); if (token == Opt_compress || token == Opt_compress_force || - strcmp(args[0].from, "zlib") == 0) { + strncmp(args[0].from, "zlib", 4) == 0) { compress_type = "zlib"; info->compress_type = BTRFS_COMPRESS_ZLIB; btrfs_set_opt(info->mount_opt, COMPRESS); btrfs_clear_opt(info->mount_opt, NODATACOW); btrfs_clear_opt(info->mount_opt, NODATASUM); no_compress = 0; - } else if (strcmp(args[0].from, "lzo") == 0) { + } else if (strncmp(args[0].from, "lzo", 3) == 0) { compress_type = "lzo"; info->compress_type = BTRFS_COMPRESS_LZO; btrfs_set_opt(info->mount_opt, COMPRESS); -- cgit v1.2.3 From 583b723151794e2ff1691f1510b4e43710293875 Mon Sep 17 00:00:00 2001 From: Hans van Kranenburg Date: Fri, 28 Jul 2017 08:31:28 +0200 Subject: btrfs: Do not use data_alloc_cluster in ssd mode This patch provides a band aid to improve the 'out of the box' behaviour of btrfs for disks that are detected as being an ssd. In a general purpose mixed workload scenario, the current ssd mode causes overallocation of available raw disk space for data, while leaving behind increasing amounts of unused fragmented free space. This situation leads to early ENOSPC problems which are harming user experience and adoption of btrfs as a general purpose filesystem. This patch modifies the data extent allocation behaviour of the ssd mode to make it behave identical to nossd mode. The metadata behaviour and additional ssd_spread option stay untouched so far. Recommendations for future development are to reconsider the current oversimplified nossd / ssd distinction and the broken detection mechanism based on the rotational attribute in sysfs and provide experienced users with a more flexible way to choose allocator behaviour for data and metadata, optimized for certain use cases, while keeping sane 'out of the box' default settings. The internals of the current btrfs code have more potential than what currently gets exposed to the user to choose from. The SSD story... In the first year of btrfs development, around early 2008, btrfs gained a mount option which enables specific functionality for filesystems on solid state devices. The first occurance of this functionality is in commit e18e4809, labeled "Add mount -o ssd, which includes optimizations for seek free storage". The effect on allocating free space for doing (data) writes is to 'cluster' writes together, writing them out in contiguous space, as opposed to a 'tetris' way of putting all separate writes into any free space fragment that fits (which is what the -o nossd behaviour does). 
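As a toy illustration of the two placement strategies just described (not btrfs code, and modelling the stricter ssd_spread variant of clustering discussed in the next paragraph, which needs fully free space):

#include <stdio.h>

struct free_extent { unsigned long start, len; };

int main(void)
{
        struct free_extent fe[] = {
                { 0,         512UL << 10 },  /* 512 KiB fragment */
                { 3UL << 20,   1UL << 20 },  /*   1 MiB fragment */
                { 8UL << 20,   4UL << 20 },  /*   4 MiB fragment */
        };
        unsigned long write_sz = 256UL << 10;   /* one 256 KiB write */
        unsigned long cluster  = 2UL << 20;     /* 2 MiB cluster */
        int i;

        for (i = 0; i < 3; i++)
                if (fe[i].len >= write_sz) {    /* "tetris": first fit */
                        printf("tetris : extent %d\n", i);
                        break;
                }
        for (i = 0; i < 3; i++)
                if (fe[i].len >= cluster) {     /* cluster: needs 2 MiB */
                        printf("cluster: extent %d\n", i);
                        break;
                }
        return 0;
}

The tetris pass reuses the first 512 KiB fragment; the cluster pass skips both smaller fragments, which stay behind as unused free space. That is the fragmentation problem described below.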
A somewhat simplified explanation of what happens is that, when for example, the 'cluster' size is set to 2MiB, when we do some writes, the data allocator will search for a free space block that is 2MiB big, and put the writes in there. The ssd mode itself might allow a 2MiB cluster to be composed of multiple free space extents with some existing data in between, while the additional ssd_spread mount option kills off this option and requires fully free space. The idea behind this is (commit 536ac8ae): "The [...] clusters make it more likely a given IO will completely overwrite the ssd block, so it doesn't have to do an internal rwm cycle."; ssd block meaning nand erase block. So, effectively this means applying a "locality based algorithm" and trying to outsmart the actual ssd. Since then, various changes have been made to the involved code, but the basic idea is still present, and gets activated whenever the ssd mount option is active. This also happens by default, when the rotational flag as seen at /sys/block//queue/rotational is set to 0. However, there's a number of problems with this approach. First, what the optimization is trying to do is outsmart the ssd by assuming there is a relation between the physical address space of the block device as seen by btrfs and the actual physical storage of the ssd, and then adjusting data placement. However, since the introduction of the Flash Translation Layer (FTL) which is a part of the internal controller of an ssd, these attempts are futile. The use of good quality FTL in consumer ssd products might have been limited in 2008, but this situation has changed drastically soon after that time. Today, even the flash memory in your automatic cat feeding machine or your grandma's wheelchair has a full featured one. Second, the behaviour as described above results in the filesystem being filled up with badly fragmented free space extents because of relatively small pieces of space that are freed up by deletes, but not selected again as part of a 'cluster'. Since the algorithm prefers allocating a new chunk over going back to tetris mode, the end result is a filesystem in which all raw space is allocated, but which is composed of underutilized chunks with a 'shotgun blast' pattern of fragmented free space. Usually, the next problematic thing that happens is the filesystem wanting to allocate new space for metadata, which causes the filesystem to fail in spectacular ways. Third, the default mount options you get for an ssd ('ssd' mode enabled, 'discard' not enabled), in combination with spreading out writes over the full address space and ignoring freed up space leads to worst case behaviour in providing information to the ssd itself, since it will never learn that all the free space left behind is actually free. There are two ways to let an ssd know previously written data does not have to be preserved, which are sending explicit signals using discard or fstrim, or by simply overwriting the space with new data. The worst case behaviour is the btrfs ssd_spread mount option in combination with not having discard enabled. It has a side effect of minimizing the reuse of free space previously written in. Fourth, the rotational flag in /sys/ does not reliably indicate if the device is a locally attached ssd. For example, iSCSI or NBD displays as non-rotational, while a loop device on an ssd shows up as rotational. 
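For reference, the autodetection that this fourth point criticizes, condensed from the disk-io.c hunk below as it looks after this patch (fs_devices->rotating is set if any member device reports itself rotational via its queue flags, so a single mis-reporting device decides the outcome):

        if (!btrfs_test_opt(fs_info, NOSSD) &&
            !fs_info->fs_devices->rotating) {
                btrfs_set_and_info(fs_info, SSD,
                                   "enabling ssd optimizations");
        }

With this patch applied the detection still turns on ssd mode, but ssd mode no longer changes data allocation, only metadata clustering.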
The combination of the second and third problem effectively means that despite all the good intentions, the btrfs ssd mode reliably causes the ssd hardware and the filesystem structures and performance to be choked to death. The clickbait version of the title of this story would have been "Btrfs ssd optimizations considered harmful for ssds". The current nossd 'tetris' mode (even still without discard) allows a pattern of overwriting much more previously used space, causing many more implicit discards to happen because of the overwrite information the ssd gets. The actual location in the physical address space, as seen from the point of view of btrfs is irrelevant, because the actual writes to the low level flash are reordered anyway thanks to the FTL. Changes made in the code 1. Make ssd mode data allocation identical to tetris mode, like nossd. 2. Adjust and clean up filesystem mount messages so that we can easily identify if a kernel has this patch applied or not, when providing support to end users. Also, make better use of the *_and_info helpers to only trigger messages on actual state changes. Backporting notes Notes for whoever wants to backport this patch to their 4.9 LTS kernel: * First apply commit 951e7966 "btrfs: drop the nossd flag when remounting with -o ssd", or fixup the differences manually. * The rest of the conflicts are because of the fs_info refactoring. So, for example, instead of using fs_info, it's root->fs_info in extent-tree.c Signed-off-by: Hans van Kranenburg Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 6 +++--- fs/btrfs/disk-io.c | 6 ++---- fs/btrfs/extent-tree.c | 11 ++++++----- fs/btrfs/super.c | 16 +++++++++------- 4 files changed, 20 insertions(+), 19 deletions(-) (limited to 'fs/btrfs/super.c') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index a3db2b1738aa..02edcddbcc9c 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -470,8 +470,8 @@ struct btrfs_block_rsv { /* * free clusters are used to claim free space in relatively large chunks, - * allowing us to do less seeky writes. They are used for all metadata - * allocations and data allocations in ssd mode. + * allowing us to do less seeky writes. They are used for all metadata + * allocations. In ssd_spread mode they are also used for data allocations. 
*/ struct btrfs_free_cluster { spinlock_t lock; @@ -967,7 +967,7 @@ struct btrfs_fs_info { struct reloc_control *reloc_ctl; - /* data_alloc_cluster is only used in ssd mode */ + /* data_alloc_cluster is only used in ssd_spread mode */ struct btrfs_free_cluster data_alloc_cluster; /* all metadata allocations go through this cluster */ diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 195634098f21..90b967ae46d0 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3053,11 +3053,9 @@ retry_root_backup: if (IS_ERR(fs_info->transaction_kthread)) goto fail_cleaner; - if (!btrfs_test_opt(fs_info, SSD) && - !btrfs_test_opt(fs_info, NOSSD) && + if (!btrfs_test_opt(fs_info, NOSSD) && !fs_info->fs_devices->rotating) { - btrfs_info(fs_info, "detected SSD devices, enabling SSD mode"); - btrfs_set_opt(fs_info->mount_opt, SSD); + btrfs_set_and_info(fs_info, SSD, "enabling ssd optimizations"); } /* diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index eff674bfd162..6a8da7c19182 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -6654,19 +6654,20 @@ fetch_cluster_info(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, u64 *empty_cluster) { struct btrfs_free_cluster *ret = NULL; - bool ssd = btrfs_test_opt(fs_info, SSD); *empty_cluster = 0; if (btrfs_mixed_space_info(space_info)) return ret; - if (ssd) - *empty_cluster = SZ_2M; if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { ret = &fs_info->meta_alloc_cluster; - if (!ssd) + if (btrfs_test_opt(fs_info, SSD)) + *empty_cluster = SZ_2M; + else *empty_cluster = SZ_64K; - } else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) && ssd) { + } else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) && + btrfs_test_opt(fs_info, SSD_SPREAD)) { + *empty_cluster = SZ_2M; ret = &fs_info->data_alloc_cluster; } diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 8a9bcad3b06a..0b7a1d8cd08b 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -549,20 +549,22 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, break; case Opt_ssd: btrfs_set_and_info(info, SSD, - "use ssd allocation scheme"); + "enabling ssd optimizations"); btrfs_clear_opt(info->mount_opt, NOSSD); break; case Opt_ssd_spread: + btrfs_set_and_info(info, SSD, + "enabling ssd optimizations"); btrfs_set_and_info(info, SSD_SPREAD, - "use spread ssd allocation scheme"); - btrfs_set_opt(info->mount_opt, SSD); + "using spread ssd allocation scheme"); btrfs_clear_opt(info->mount_opt, NOSSD); break; case Opt_nossd: - btrfs_set_and_info(info, NOSSD, - "not using ssd allocation scheme"); - btrfs_clear_opt(info->mount_opt, SSD); - btrfs_clear_opt(info->mount_opt, SSD_SPREAD); + btrfs_set_opt(info->mount_opt, NOSSD); + btrfs_clear_and_info(info, SSD, + "not using ssd optimizations"); + btrfs_clear_and_info(info, SSD_SPREAD, + "not using spread ssd allocation scheme"); break; case Opt_barrier: btrfs_clear_and_info(info, NOBARRIER, -- cgit v1.2.3