Diffstat (limited to 'fs/btrfs')
41 files changed, 5444 insertions, 2135 deletions
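The headline change in this series is pluggable compression: callers pass a compress type and btrfs_compress_pages() / btrfs_decompress_biovec() dispatch through the new btrfs_compress_op table, indexed with type - 1, with zlib in slot 0 and the newly added LZO backend in slot 1. The following is a minimal user-space sketch of that dispatch pattern only; the stub compressors, simplified signatures, and names other than the enum values are illustrative stand-ins, not the real kernel ops or workspace handling.

#include <stdio.h>

/* One entry per compression backend, in the spirit of struct btrfs_compress_op. */
struct compress_op {
	const char *name;
	int (*compress)(const char *in, char *out, int len);
};

/* Stub backends standing in for the real zlib/lzo implementations. */
static int zlib_stub(const char *in, char *out, int len)
{
	(void)in; (void)out;
	printf("zlib: compress %d bytes\n", len);
	return len;		/* pretend no size change */
}

static int lzo_stub(const char *in, char *out, int len)
{
	(void)in; (void)out;
	printf("lzo: compress %d bytes\n", len);
	return len;
}

static struct compress_op zlib_op = { "zlib", zlib_stub };
static struct compress_op lzo_op  = { "lzo",  lzo_stub  };

/* Mirrors btrfs_compress_op[]: slot 0 is zlib, slot 1 is lzo. */
static struct compress_op *ops[] = { &zlib_op, &lzo_op };

/* Matches enum btrfs_compression_type: NONE = 0, ZLIB = 1, LZO = 2. */
enum { COMPRESS_ZLIB = 1, COMPRESS_LZO = 2 };

/* Like btrfs_compress_pages(): dispatch on type, indexing the table with type - 1. */
static int compress_pages(int type, const char *in, char *out, int len)
{
	return ops[type - 1]->compress(in, out, len);
}

int main(void)
{
	char buf[16];

	compress_pages(COMPRESS_ZLIB, "hello", buf, 5);
	compress_pages(COMPRESS_LZO, "hello", buf, 5);
	return 0;
}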
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index 7bb3c020e570..ecb9fd3be143 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -4,6 +4,8 @@ config BTRFS_FS select LIBCRC32C select ZLIB_INFLATE select ZLIB_DEFLATE + select LZO_COMPRESS + select LZO_DECOMPRESS help Btrfs is a new filesystem with extents, writable snapshotting, support for multiple devices and many more features. diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index a35eb36b32fd..31610ea73aec 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -6,5 +6,5 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ transaction.o inode.o file.o tree-defrag.o \ extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ - export.o tree-log.o acl.o free-space-cache.o zlib.o \ + export.o tree-log.o acl.o free-space-cache.o zlib.o lzo.o \ compression.o delayed-ref.o relocation.o diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 6ae2c8cac9d5..5d505aaa72fb 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -37,6 +37,9 @@ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type) char *value = NULL; struct posix_acl *acl; + if (!IS_POSIXACL(inode)) + return NULL; + acl = get_cached_acl(inode, type); if (acl != ACL_NOT_CACHED) return acl; @@ -60,8 +63,10 @@ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type) size = __btrfs_getxattr(inode, name, value, size); if (size > 0) { acl = posix_acl_from_xattr(value, size); - if (IS_ERR(acl)) + if (IS_ERR(acl)) { + kfree(value); return acl; + } set_cached_acl(inode, type, acl); } kfree(value); @@ -82,6 +87,9 @@ static int btrfs_xattr_acl_get(struct dentry *dentry, const char *name, struct posix_acl *acl; int ret = 0; + if (!IS_POSIXACL(dentry->d_inode)) + return -EOPNOTSUPP; + acl = btrfs_get_acl(dentry->d_inode, type); if (IS_ERR(acl)) @@ -162,7 +170,7 @@ static int btrfs_xattr_acl_set(struct dentry *dentry, const char *name, int ret; struct posix_acl *acl = NULL; - if (!is_owner_or_cap(dentry->d_inode)) + if (!inode_owner_or_capable(dentry->d_inode)) return -EPERM; if (!IS_POSIXACL(dentry->d_inode)) @@ -170,16 +178,17 @@ static int btrfs_xattr_acl_set(struct dentry *dentry, const char *name, if (value) { acl = posix_acl_from_xattr(value, size); - if (acl == NULL) { - value = NULL; - size = 0; + if (acl) { + ret = posix_acl_valid(acl); + if (ret) + goto out; } else if (IS_ERR(acl)) { return PTR_ERR(acl); } } ret = btrfs_set_acl(NULL, dentry->d_inode, acl, type); - +out: posix_acl_release(acl); return ret; diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 6ad63f17eca0..57c3bb2884ce 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -136,9 +136,8 @@ struct btrfs_inode { * items we think we'll end up using, and reserved_extents is the number * of extent items we've reserved metadata for. 
*/ - spinlock_t accounting_lock; atomic_t outstanding_extents; - int reserved_extents; + atomic_t reserved_extents; /* * ordered_data_close is set by truncate when a file that used @@ -157,7 +156,7 @@ struct btrfs_inode { /* * always compress this one file */ - unsigned force_compress:1; + unsigned force_compress:4; struct inode vfs_inode; }; diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index b50bc4bd5c56..41d1d7c70e29 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -62,6 +62,9 @@ struct compressed_bio { /* number of bytes on disk */ unsigned long compressed_len; + /* the compression algorithm for this bio */ + int compress_type; + /* number of compressed pages in the array */ unsigned long nr_pages; @@ -173,11 +176,12 @@ static void end_compressed_bio_read(struct bio *bio, int err) /* ok, we're the last bio for this extent, lets start * the decompression. */ - ret = btrfs_zlib_decompress_biovec(cb->compressed_pages, - cb->start, - cb->orig_bio->bi_io_vec, - cb->orig_bio->bi_vcnt, - cb->compressed_len); + ret = btrfs_decompress_biovec(cb->compress_type, + cb->compressed_pages, + cb->start, + cb->orig_bio->bi_io_vec, + cb->orig_bio->bi_vcnt, + cb->compressed_len); csum_failed: if (ret) cb->errors = 1; @@ -336,6 +340,8 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, WARN_ON(start & ((u64)PAGE_CACHE_SIZE - 1)); cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS); + if (!cb) + return -ENOMEM; atomic_set(&cb->pending_bios, 0); cb->errors = 0; cb->inode = inode; @@ -350,6 +356,10 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS); + if(!bio) { + kfree(cb); + return -ENOMEM; + } bio->bi_private = cb; bio->bi_end_io = end_compressed_bio_write; atomic_inc(&cb->pending_bios); @@ -558,7 +568,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, u64 em_len; u64 em_start; struct extent_map *em; - int ret; + int ret = -ENOMEM; u32 *sums; tree = &BTRFS_I(inode)->io_tree; @@ -573,6 +583,9 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, compressed_len = em->block_len; cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS); + if (!cb) + goto out; + atomic_set(&cb->pending_bios, 0); cb->errors = 0; cb->inode = inode; @@ -588,17 +601,23 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, cb->len = uncompressed_len; cb->compressed_len = compressed_len; + cb->compress_type = extent_compress_type(bio_flags); cb->orig_bio = bio; nr_pages = (compressed_len + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE; - cb->compressed_pages = kmalloc(sizeof(struct page *) * nr_pages, + cb->compressed_pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); + if (!cb->compressed_pages) + goto fail1; + bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; for (page_index = 0; page_index < nr_pages; page_index++) { cb->compressed_pages[page_index] = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (!cb->compressed_pages[page_index]) + goto fail2; } cb->nr_pages = nr_pages; @@ -609,6 +628,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, cb->len = uncompressed_len; comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS); + if (!comp_bio) + goto fail2; comp_bio->bi_private = cb; comp_bio->bi_end_io = end_compressed_bio_read; atomic_inc(&cb->pending_bios); @@ -642,8 +663,9 @@ int btrfs_submit_compressed_read(struct 
inode *inode, struct bio *bio, atomic_inc(&cb->pending_bios); if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { - btrfs_lookup_bio_sums(root, inode, comp_bio, - sums); + ret = btrfs_lookup_bio_sums(root, inode, + comp_bio, sums); + BUG_ON(ret); } sums += (comp_bio->bi_size + root->sectorsize - 1) / root->sectorsize; @@ -668,12 +690,339 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0); BUG_ON(ret); - if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) - btrfs_lookup_bio_sums(root, inode, comp_bio, sums); + if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { + ret = btrfs_lookup_bio_sums(root, inode, comp_bio, sums); + BUG_ON(ret); + } ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0); BUG_ON(ret); bio_put(comp_bio); return 0; + +fail2: + for (page_index = 0; page_index < nr_pages; page_index++) + free_page((unsigned long)cb->compressed_pages[page_index]); + + kfree(cb->compressed_pages); +fail1: + kfree(cb); +out: + free_extent_map(em); + return ret; +} + +static struct list_head comp_idle_workspace[BTRFS_COMPRESS_TYPES]; +static spinlock_t comp_workspace_lock[BTRFS_COMPRESS_TYPES]; +static int comp_num_workspace[BTRFS_COMPRESS_TYPES]; +static atomic_t comp_alloc_workspace[BTRFS_COMPRESS_TYPES]; +static wait_queue_head_t comp_workspace_wait[BTRFS_COMPRESS_TYPES]; + +struct btrfs_compress_op *btrfs_compress_op[] = { + &btrfs_zlib_compress, + &btrfs_lzo_compress, +}; + +int __init btrfs_init_compress(void) +{ + int i; + + for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) { + INIT_LIST_HEAD(&comp_idle_workspace[i]); + spin_lock_init(&comp_workspace_lock[i]); + atomic_set(&comp_alloc_workspace[i], 0); + init_waitqueue_head(&comp_workspace_wait[i]); + } + return 0; +} + +/* + * this finds an available workspace or allocates a new one + * ERR_PTR is returned if things go bad. 
+ */ +static struct list_head *find_workspace(int type) +{ + struct list_head *workspace; + int cpus = num_online_cpus(); + int idx = type - 1; + + struct list_head *idle_workspace = &comp_idle_workspace[idx]; + spinlock_t *workspace_lock = &comp_workspace_lock[idx]; + atomic_t *alloc_workspace = &comp_alloc_workspace[idx]; + wait_queue_head_t *workspace_wait = &comp_workspace_wait[idx]; + int *num_workspace = &comp_num_workspace[idx]; +again: + spin_lock(workspace_lock); + if (!list_empty(idle_workspace)) { + workspace = idle_workspace->next; + list_del(workspace); + (*num_workspace)--; + spin_unlock(workspace_lock); + return workspace; + + } + if (atomic_read(alloc_workspace) > cpus) { + DEFINE_WAIT(wait); + + spin_unlock(workspace_lock); + prepare_to_wait(workspace_wait, &wait, TASK_UNINTERRUPTIBLE); + if (atomic_read(alloc_workspace) > cpus && !*num_workspace) + schedule(); + finish_wait(workspace_wait, &wait); + goto again; + } + atomic_inc(alloc_workspace); + spin_unlock(workspace_lock); + + workspace = btrfs_compress_op[idx]->alloc_workspace(); + if (IS_ERR(workspace)) { + atomic_dec(alloc_workspace); + wake_up(workspace_wait); + } + return workspace; +} + +/* + * put a workspace struct back on the list or free it if we have enough + * idle ones sitting around + */ +static void free_workspace(int type, struct list_head *workspace) +{ + int idx = type - 1; + struct list_head *idle_workspace = &comp_idle_workspace[idx]; + spinlock_t *workspace_lock = &comp_workspace_lock[idx]; + atomic_t *alloc_workspace = &comp_alloc_workspace[idx]; + wait_queue_head_t *workspace_wait = &comp_workspace_wait[idx]; + int *num_workspace = &comp_num_workspace[idx]; + + spin_lock(workspace_lock); + if (*num_workspace < num_online_cpus()) { + list_add_tail(workspace, idle_workspace); + (*num_workspace)++; + spin_unlock(workspace_lock); + goto wake; + } + spin_unlock(workspace_lock); + + btrfs_compress_op[idx]->free_workspace(workspace); + atomic_dec(alloc_workspace); +wake: + if (waitqueue_active(workspace_wait)) + wake_up(workspace_wait); +} + +/* + * cleanup function for module exit + */ +static void free_workspaces(void) +{ + struct list_head *workspace; + int i; + + for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) { + while (!list_empty(&comp_idle_workspace[i])) { + workspace = comp_idle_workspace[i].next; + list_del(workspace); + btrfs_compress_op[i]->free_workspace(workspace); + atomic_dec(&comp_alloc_workspace[i]); + } + } +} + +/* + * given an address space and start/len, compress the bytes. + * + * pages are allocated to hold the compressed result and stored + * in 'pages' + * + * out_pages is used to return the number of pages allocated. There + * may be pages allocated even if we return an error + * + * total_in is used to return the number of bytes actually read. It + * may be smaller then len if we had to exit early because we + * ran out of room in the pages array or because we cross the + * max_out threshold. 
+ * + * total_out is used to return the total number of compressed bytes + * + * max_out tells us the max number of bytes that we're allowed to + * stuff into pages + */ +int btrfs_compress_pages(int type, struct address_space *mapping, + u64 start, unsigned long len, + struct page **pages, + unsigned long nr_dest_pages, + unsigned long *out_pages, + unsigned long *total_in, + unsigned long *total_out, + unsigned long max_out) +{ + struct list_head *workspace; + int ret; + + workspace = find_workspace(type); + if (IS_ERR(workspace)) + return -1; + + ret = btrfs_compress_op[type-1]->compress_pages(workspace, mapping, + start, len, pages, + nr_dest_pages, out_pages, + total_in, total_out, + max_out); + free_workspace(type, workspace); + return ret; +} + +/* + * pages_in is an array of pages with compressed data. + * + * disk_start is the starting logical offset of this array in the file + * + * bvec is a bio_vec of pages from the file that we want to decompress into + * + * vcnt is the count of pages in the biovec + * + * srclen is the number of bytes in pages_in + * + * The basic idea is that we have a bio that was created by readpages. + * The pages in the bio are for the uncompressed data, and they may not + * be contiguous. They all correspond to the range of bytes covered by + * the compressed extent. + */ +int btrfs_decompress_biovec(int type, struct page **pages_in, u64 disk_start, + struct bio_vec *bvec, int vcnt, size_t srclen) +{ + struct list_head *workspace; + int ret; + + workspace = find_workspace(type); + if (IS_ERR(workspace)) + return -ENOMEM; + + ret = btrfs_compress_op[type-1]->decompress_biovec(workspace, pages_in, + disk_start, + bvec, vcnt, srclen); + free_workspace(type, workspace); + return ret; +} + +/* + * a less complex decompression routine. Our compressed data fits in a + * single page, and we want to read a single page out of it. + * start_byte tells us the offset into the compressed data we're interested in + */ +int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page, + unsigned long start_byte, size_t srclen, size_t destlen) +{ + struct list_head *workspace; + int ret; + + workspace = find_workspace(type); + if (IS_ERR(workspace)) + return -ENOMEM; + + ret = btrfs_compress_op[type-1]->decompress(workspace, data_in, + dest_page, start_byte, + srclen, destlen); + + free_workspace(type, workspace); + return ret; +} + +void btrfs_exit_compress(void) +{ + free_workspaces(); +} + +/* + * Copy uncompressed data from working buffer to pages. + * + * buf_start is the byte offset we're of the start of our workspace buffer. + * + * total_out is the last byte of the buffer + */ +int btrfs_decompress_buf2page(char *buf, unsigned long buf_start, + unsigned long total_out, u64 disk_start, + struct bio_vec *bvec, int vcnt, + unsigned long *page_index, + unsigned long *pg_offset) +{ + unsigned long buf_offset; + unsigned long current_buf_start; + unsigned long start_byte; + unsigned long working_bytes = total_out - buf_start; + unsigned long bytes; + char *kaddr; + struct page *page_out = bvec[*page_index].bv_page; + + /* + * start byte is the first byte of the page we're currently + * copying into relative to the start of the compressed data. 
+ */ + start_byte = page_offset(page_out) - disk_start; + + /* we haven't yet hit data corresponding to this page */ + if (total_out <= start_byte) + return 1; + + /* + * the start of the data we care about is offset into + * the middle of our working buffer + */ + if (total_out > start_byte && buf_start < start_byte) { + buf_offset = start_byte - buf_start; + working_bytes -= buf_offset; + } else { + buf_offset = 0; + } + current_buf_start = buf_start; + + /* copy bytes from the working buffer into the pages */ + while (working_bytes > 0) { + bytes = min(PAGE_CACHE_SIZE - *pg_offset, + PAGE_CACHE_SIZE - buf_offset); + bytes = min(bytes, working_bytes); + kaddr = kmap_atomic(page_out, KM_USER0); + memcpy(kaddr + *pg_offset, buf + buf_offset, bytes); + kunmap_atomic(kaddr, KM_USER0); + flush_dcache_page(page_out); + + *pg_offset += bytes; + buf_offset += bytes; + working_bytes -= bytes; + current_buf_start += bytes; + + /* check if we need to pick another page */ + if (*pg_offset == PAGE_CACHE_SIZE) { + (*page_index)++; + if (*page_index >= vcnt) + return 0; + + page_out = bvec[*page_index].bv_page; + *pg_offset = 0; + start_byte = page_offset(page_out) - disk_start; + + /* + * make sure our new page is covered by this + * working buffer + */ + if (total_out <= start_byte) + return 1; + + /* + * the next page in the biovec might not be adjacent + * to the last page, but it might still be found + * inside this working buffer. bump our offset pointer + */ + if (total_out > start_byte && + current_buf_start < start_byte) { + buf_offset = start_byte - buf_start; + working_bytes = total_out - start_byte; + current_buf_start = buf_start + buf_offset; + } + } + } + + return 1; } diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 421f5b4aa715..51000174b9d7 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -19,24 +19,27 @@ #ifndef __BTRFS_COMPRESSION_ #define __BTRFS_COMPRESSION_ -int btrfs_zlib_decompress(unsigned char *data_in, - struct page *dest_page, - unsigned long start_byte, - size_t srclen, size_t destlen); -int btrfs_zlib_compress_pages(struct address_space *mapping, - u64 start, unsigned long len, - struct page **pages, - unsigned long nr_dest_pages, - unsigned long *out_pages, - unsigned long *total_in, - unsigned long *total_out, - unsigned long max_out); -int btrfs_zlib_decompress_biovec(struct page **pages_in, - u64 disk_start, - struct bio_vec *bvec, - int vcnt, - size_t srclen); -void btrfs_zlib_exit(void); +int btrfs_init_compress(void); +void btrfs_exit_compress(void); + +int btrfs_compress_pages(int type, struct address_space *mapping, + u64 start, unsigned long len, + struct page **pages, + unsigned long nr_dest_pages, + unsigned long *out_pages, + unsigned long *total_in, + unsigned long *total_out, + unsigned long max_out); +int btrfs_decompress_biovec(int type, struct page **pages_in, u64 disk_start, + struct bio_vec *bvec, int vcnt, size_t srclen); +int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page, + unsigned long start_byte, size_t srclen, size_t destlen); +int btrfs_decompress_buf2page(char *buf, unsigned long buf_start, + unsigned long total_out, u64 disk_start, + struct bio_vec *bvec, int vcnt, + unsigned long *page_index, + unsigned long *pg_offset); + int btrfs_submit_compressed_write(struct inode *inode, u64 start, unsigned long len, u64 disk_start, unsigned long compressed_len, @@ -44,4 +47,37 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, unsigned long nr_pages); int 
btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags); + +struct btrfs_compress_op { + struct list_head *(*alloc_workspace)(void); + + void (*free_workspace)(struct list_head *workspace); + + int (*compress_pages)(struct list_head *workspace, + struct address_space *mapping, + u64 start, unsigned long len, + struct page **pages, + unsigned long nr_dest_pages, + unsigned long *out_pages, + unsigned long *total_in, + unsigned long *total_out, + unsigned long max_out); + + int (*decompress_biovec)(struct list_head *workspace, + struct page **pages_in, + u64 disk_start, + struct bio_vec *bvec, + int vcnt, + size_t srclen); + + int (*decompress)(struct list_head *workspace, + unsigned char *data_in, + struct page *dest_page, + unsigned long start_byte, + size_t srclen, size_t destlen); +}; + +extern struct btrfs_compress_op btrfs_zlib_compress; +extern struct btrfs_compress_op btrfs_lzo_compress; + #endif diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 9ac171599258..84d7ca1fe0ba 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -105,6 +105,8 @@ noinline void btrfs_clear_path_blocking(struct btrfs_path *p, /* this also releases the path */ void btrfs_free_path(struct btrfs_path *p) { + if (!p) + return; btrfs_release_path(NULL, p); kmem_cache_free(btrfs_path_cachep, p); } @@ -145,10 +147,11 @@ noinline void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p) struct extent_buffer *btrfs_root_node(struct btrfs_root *root) { struct extent_buffer *eb; - spin_lock(&root->node_lock); - eb = root->node; + + rcu_read_lock(); + eb = rcu_dereference(root->node); extent_buffer_get(eb); - spin_unlock(&root->node_lock); + rcu_read_unlock(); return eb; } @@ -163,14 +166,8 @@ struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root) while (1) { eb = btrfs_root_node(root); btrfs_tree_lock(eb); - - spin_lock(&root->node_lock); - if (eb == root->node) { - spin_unlock(&root->node_lock); + if (eb == root->node) break; - } - spin_unlock(&root->node_lock); - btrfs_tree_unlock(eb); free_extent_buffer(eb); } @@ -456,10 +453,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, else parent_start = 0; - spin_lock(&root->node_lock); - root->node = cow; extent_buffer_get(cow); - spin_unlock(&root->node_lock); + rcu_assign_pointer(root->node, cow); btrfs_free_tree_block(trans, root, buf, parent_start, last_ref); @@ -540,6 +535,9 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, ret = __btrfs_cow_block(trans, root, buf, parent, parent_slot, cow_ret, search_start, 0); + + trace_btrfs_cow_block(root, buf, *cow_ret); + return ret; } @@ -684,6 +682,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, if (!cur) { cur = read_tree_block(root, blocknr, blocksize, gen); + if (!cur) + return -EIO; } else if (!uptodate) { btrfs_read_buffer(cur, gen); } @@ -730,122 +730,6 @@ static inline unsigned int leaf_data_end(struct btrfs_root *root, return btrfs_item_offset_nr(leaf, nr - 1); } -/* - * extra debugging checks to make sure all the items in a key are - * well formed and in the proper order - */ -static int check_node(struct btrfs_root *root, struct btrfs_path *path, - int level) -{ - struct extent_buffer *parent = NULL; - struct extent_buffer *node = path->nodes[level]; - struct btrfs_disk_key parent_key; - struct btrfs_disk_key node_key; - int parent_slot; - int slot; - struct btrfs_key cpukey; - u32 nritems = btrfs_header_nritems(node); - - if (path->nodes[level + 1]) - parent = 
path->nodes[level + 1]; - - slot = path->slots[level]; - BUG_ON(nritems == 0); - if (parent) { - parent_slot = path->slots[level + 1]; - btrfs_node_key(parent, &parent_key, parent_slot); - btrfs_node_key(node, &node_key, 0); - BUG_ON(memcmp(&parent_key, &node_key, - sizeof(struct btrfs_disk_key))); - BUG_ON(btrfs_node_blockptr(parent, parent_slot) != - btrfs_header_bytenr(node)); - } - BUG_ON(nritems > BTRFS_NODEPTRS_PER_BLOCK(root)); - if (slot != 0) { - btrfs_node_key_to_cpu(node, &cpukey, slot - 1); - btrfs_node_key(node, &node_key, slot); - BUG_ON(comp_keys(&node_key, &cpukey) <= 0); - } - if (slot < nritems - 1) { - btrfs_node_key_to_cpu(node, &cpukey, slot + 1); - btrfs_node_key(node, &node_key, slot); - BUG_ON(comp_keys(&node_key, &cpukey) >= 0); - } - return 0; -} - -/* - * extra checking to make sure all the items in a leaf are - * well formed and in the proper order - */ -static int check_leaf(struct btrfs_root *root, struct btrfs_path *path, - int level) -{ - struct extent_buffer *leaf = path->nodes[level]; - struct extent_buffer *parent = NULL; - int parent_slot; - struct btrfs_key cpukey; - struct btrfs_disk_key parent_key; - struct btrfs_disk_key leaf_key; - int slot = path->slots[0]; - - u32 nritems = btrfs_header_nritems(leaf); - - if (path->nodes[level + 1]) - parent = path->nodes[level + 1]; - - if (nritems == 0) - return 0; - - if (parent) { - parent_slot = path->slots[level + 1]; - btrfs_node_key(parent, &parent_key, parent_slot); - btrfs_item_key(leaf, &leaf_key, 0); - - BUG_ON(memcmp(&parent_key, &leaf_key, - sizeof(struct btrfs_disk_key))); - BUG_ON(btrfs_node_blockptr(parent, parent_slot) != - btrfs_header_bytenr(leaf)); - } - if (slot != 0 && slot < nritems - 1) { - btrfs_item_key(leaf, &leaf_key, slot); - btrfs_item_key_to_cpu(leaf, &cpukey, slot - 1); - if (comp_keys(&leaf_key, &cpukey) <= 0) { - btrfs_print_leaf(root, leaf); - printk(KERN_CRIT "slot %d offset bad key\n", slot); - BUG_ON(1); - } - if (btrfs_item_offset_nr(leaf, slot - 1) != - btrfs_item_end_nr(leaf, slot)) { - btrfs_print_leaf(root, leaf); - printk(KERN_CRIT "slot %d offset bad\n", slot); - BUG_ON(1); - } - } - if (slot < nritems - 1) { - btrfs_item_key(leaf, &leaf_key, slot); - btrfs_item_key_to_cpu(leaf, &cpukey, slot + 1); - BUG_ON(comp_keys(&leaf_key, &cpukey) >= 0); - if (btrfs_item_offset_nr(leaf, slot) != - btrfs_item_end_nr(leaf, slot + 1)) { - btrfs_print_leaf(root, leaf); - printk(KERN_CRIT "slot %d offset bad\n", slot); - BUG_ON(1); - } - } - BUG_ON(btrfs_item_offset_nr(leaf, 0) + - btrfs_item_size_nr(leaf, 0) != BTRFS_LEAF_DATA_SIZE(root)); - return 0; -} - -static noinline int check_block(struct btrfs_root *root, - struct btrfs_path *path, int level) -{ - return 0; - if (level == 0) - return check_leaf(root, path, level); - return check_node(root, path, level); -} /* * search for key in the extent_buffer. 
The items start at offset p, @@ -1044,9 +928,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, goto enospc; } - spin_lock(&root->node_lock); - root->node = child; - spin_unlock(&root->node_lock); + rcu_assign_pointer(root->node, child); add_root_to_dirty_list(root); btrfs_tree_unlock(child); @@ -1186,7 +1068,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, } } /* double check we haven't messed things up */ - check_block(root, path, level); if (orig_ptr != btrfs_node_blockptr(path->nodes[level], path->slots[level])) BUG(); @@ -1796,12 +1677,6 @@ cow_done: if (!cow) btrfs_unlock_up_safe(p, level + 1); - ret = check_block(root, p, level); - if (ret) { - ret = -1; - goto done; - } - ret = bin_search(b, key, level, &slot); if (level != 0) { @@ -2128,10 +2003,8 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(c); - spin_lock(&root->node_lock); old = root->node; - root->node = c; - spin_unlock(&root->node_lock); + rcu_assign_pointer(root->node, c); /* the super has an extra ref to root->node */ free_extent_buffer(old); @@ -2514,6 +2387,9 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root btrfs_assert_tree_locked(path->nodes[1]); right = read_node_slot(root, upper, slot + 1); + if (right == NULL) + return 1; + btrfs_tree_lock(right); btrfs_set_lock_blocking(right); @@ -2764,6 +2640,9 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root btrfs_assert_tree_locked(path->nodes[1]); left = read_node_slot(root, path->nodes[1], slot - 1); + if (left == NULL) + return 1; + btrfs_tree_lock(left); btrfs_set_lock_blocking(left); @@ -3832,7 +3711,8 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root unsigned long ptr; path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) + return -ENOMEM; ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size); if (!ret) { leaf = path->nodes[0]; @@ -4209,6 +4089,7 @@ find_next_key: } btrfs_set_path_blocking(path); cur = read_node_slot(root, cur, slot); + BUG_ON(!cur); btrfs_tree_lock(cur); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index b875d445ea81..2e61fe1b6b8c 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -28,6 +28,7 @@ #include <linux/wait.h> #include <linux/slab.h> #include <linux/kobject.h> +#include <trace/events/btrfs.h> #include <asm/kmap_types.h> #include "extent_io.h" #include "extent_map.h" @@ -40,6 +41,7 @@ extern struct kmem_cache *btrfs_trans_handle_cachep; extern struct kmem_cache *btrfs_transaction_cachep; extern struct kmem_cache *btrfs_bit_radix_cachep; extern struct kmem_cache *btrfs_path_cachep; +extern struct kmem_cache *btrfs_free_space_cachep; struct btrfs_ordered_sum; #define BTRFS_MAGIC "_BHRfS_M" @@ -295,6 +297,14 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes) #define BTRFS_FSID_SIZE 16 #define BTRFS_HEADER_FLAG_WRITTEN (1ULL << 0) #define BTRFS_HEADER_FLAG_RELOC (1ULL << 1) + +/* + * File system states + */ + +/* Errors detected */ +#define BTRFS_SUPER_FLAG_ERROR (1ULL << 2) + #define BTRFS_SUPER_FLAG_SEEDING (1ULL << 32) #define BTRFS_SUPER_FLAG_METADUMP (1ULL << 33) @@ -399,13 +409,15 @@ struct btrfs_super_block { #define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF (1ULL << 0) #define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL (1ULL << 1) #define BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS (1ULL << 2) +#define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO (1ULL << 3) #define BTRFS_FEATURE_COMPAT_SUPP 0ULL #define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL 
#define BTRFS_FEATURE_INCOMPAT_SUPP \ (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \ BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \ - BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) + BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \ + BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO) /* * A leaf is full of items. offset and size tell us where to find @@ -552,9 +564,11 @@ struct btrfs_timespec { } __attribute__ ((__packed__)); enum btrfs_compression_type { - BTRFS_COMPRESS_NONE = 0, - BTRFS_COMPRESS_ZLIB = 1, - BTRFS_COMPRESS_LAST = 2, + BTRFS_COMPRESS_NONE = 0, + BTRFS_COMPRESS_ZLIB = 1, + BTRFS_COMPRESS_LZO = 2, + BTRFS_COMPRESS_TYPES = 2, + BTRFS_COMPRESS_LAST = 3, }; struct btrfs_inode_item { @@ -598,6 +612,8 @@ struct btrfs_dir_item { u8 type; } __attribute__ ((__packed__)); +#define BTRFS_ROOT_SUBVOL_RDONLY (1ULL << 0) + struct btrfs_root_item { struct btrfs_inode_item inode; __le64 generation; @@ -715,8 +731,19 @@ struct btrfs_space_info { u64 disk_total; /* total bytes on disk, takes mirrors into account */ - int full; /* indicates that we cannot allocate any more + /* + * we bump reservation progress every time we decrement + * bytes_reserved. This way people waiting for reservations + * know something good has happened and they can check + * for progress. The number here isn't to be trusted, it + * just shows reclaim activity + */ + unsigned long reservation_progress; + + int full:1; /* indicates that we cannot allocate any more chunks for this space */ + int chunk_alloc:1; /* set if we are allocating a chunk */ + int force_alloc; /* set if we need to force a chunk alloc for this space */ @@ -759,9 +786,6 @@ struct btrfs_free_cluster { /* first extent starting offset */ u64 window_start; - /* if this cluster simply points at a bitmap in the block group */ - bool points_to_bitmap; - struct btrfs_block_group_cache *block_group; /* * when a cluster is allocated from a block group, we put the @@ -896,7 +920,8 @@ struct btrfs_fs_info { */ u64 last_trans_log_full_commit; u64 open_ioctl_trans; - unsigned long mount_opt; + unsigned long mount_opt:20; + unsigned long compress_type:4; u64 max_inline; u64 alloc_start; struct btrfs_transaction *running_transaction; @@ -1051,6 +1076,9 @@ struct btrfs_fs_info { unsigned metadata_ratio; void *bdev_holder; + + /* filesystem state */ + u64 fs_state; }; /* @@ -1236,6 +1264,7 @@ struct btrfs_root { #define BTRFS_MOUNT_SPACE_CACHE (1 << 12) #define BTRFS_MOUNT_CLEAR_CACHE (1 << 13) #define BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED (1 << 14) +#define BTRFS_MOUNT_ENOSPC_DEBUG (1 << 15) #define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) @@ -1255,6 +1284,9 @@ struct btrfs_root { #define BTRFS_INODE_NODUMP (1 << 8) #define BTRFS_INODE_NOATIME (1 << 9) #define BTRFS_INODE_DIRSYNC (1 << 10) +#define BTRFS_INODE_COMPRESS (1 << 11) + +#define BTRFS_INODE_ROOT_ITEM_INIT (1 << 31) /* some macros to generate set/get funcs for the struct fields. 
This * assumes there is a lefoo_to_cpu for every type, so lets make a simple @@ -1894,6 +1926,11 @@ BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, byte_limit, 64); BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item, last_snapshot, 64); +static inline bool btrfs_root_readonly(struct btrfs_root *root) +{ + return root->root_item.flags & BTRFS_ROOT_SUBVOL_RDONLY; +} + /* struct btrfs_super_block */ BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64); @@ -2124,6 +2161,8 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, u64 root_objectid, u64 owner, u64 offset); int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len); +int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, + u64 num_bytes, int reserve, int sinfo); int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, @@ -2146,6 +2185,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, int btrfs_remove_block_group(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 group_start); u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags); +u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data); void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde); void btrfs_clear_space_info_full(struct btrfs_fs_info *info); int btrfs_check_data_free_space(struct inode *inode, u64 bytes); @@ -2189,6 +2229,16 @@ int btrfs_set_block_group_ro(struct btrfs_root *root, int btrfs_set_block_group_rw(struct btrfs_root *root, struct btrfs_block_group_cache *cache); void btrfs_put_block_group_cache(struct btrfs_fs_info *info); +u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo); +int btrfs_error_unpin_extent_range(struct btrfs_root *root, + u64 start, u64 end); +int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr, + u64 num_bytes, u64 *actual_bytes); +int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 type); +int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range); + +int btrfs_init_space_info(struct btrfs_fs_info *fs_info); /* ctree.c */ int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, int level, int *slot); @@ -2313,6 +2363,8 @@ int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid); int btrfs_find_orphan_roots(struct btrfs_root *tree_root); int btrfs_set_root_node(struct btrfs_root_item *item, struct extent_buffer *node); +void btrfs_check_and_init_root_item(struct btrfs_root_item *item); + /* dir-item.c */ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *name, @@ -2350,6 +2402,9 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, struct btrfs_path *path, u64 dir, const char *name, u16 name_len, int mod); +int verify_dir_item(struct btrfs_root *root, + struct extent_buffer *leaf, + struct btrfs_dir_item *dir_item); /* orphan.c */ int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, @@ -2486,7 +2541,7 @@ int btrfs_update_inode(struct btrfs_trans_handle *trans, struct inode *inode); int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode); int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode); -void btrfs_orphan_cleanup(struct btrfs_root *root); +int btrfs_orphan_cleanup(struct btrfs_root *root); void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle 
*trans, struct btrfs_pending_snapshot *pending, u64 *bytes_to_reserve); @@ -2494,7 +2549,7 @@ void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans, struct btrfs_pending_snapshot *pending); void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, struct btrfs_root *root); -int btrfs_cont_expand(struct inode *inode, loff_t size); +int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size); int btrfs_invalidate_inodes(struct btrfs_root *root); void btrfs_add_delayed_iput(struct inode *inode); void btrfs_run_delayed_iputs(struct btrfs_root *root); @@ -2523,6 +2578,11 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode, int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, struct inode *inode, u64 start, u64 end); int btrfs_release_file(struct inode *inode, struct file *file); +void btrfs_drop_pages(struct page **pages, size_t num_pages); +int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode, + struct page **pages, size_t num_pages, + loff_t pos, size_t write_bytes, + struct extent_state **cached); /* tree-defrag.c */ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, @@ -2542,6 +2602,14 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size); /* super.c */ int btrfs_parse_options(struct btrfs_root *root, char *options); int btrfs_sync_fs(struct super_block *sb, int wait); +void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, + unsigned int line, int errno); + +#define btrfs_std_error(fs_info, errno) \ +do { \ + if ((errno)) \ + __btrfs_std_error((fs_info), __func__, __LINE__, (errno));\ +} while (0) /* acl.c */ #ifdef CONFIG_BTRFS_FS_POSIX_ACL diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index e807b143b857..bce28f653899 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -483,6 +483,8 @@ static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans, INIT_LIST_HEAD(&head_ref->cluster); mutex_init(&head_ref->mutex); + trace_btrfs_delayed_ref_head(ref, head_ref, action); + existing = tree_insert(&delayed_refs->root, &ref->rb_node); if (existing) { @@ -537,6 +539,8 @@ static noinline int add_delayed_tree_ref(struct btrfs_trans_handle *trans, } full_ref->level = level; + trace_btrfs_delayed_tree_ref(ref, full_ref, action); + existing = tree_insert(&delayed_refs->root, &ref->rb_node); if (existing) { @@ -591,6 +595,8 @@ static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans, full_ref->objectid = owner; full_ref->offset = offset; + trace_btrfs_delayed_data_ref(ref, full_ref, action); + existing = tree_insert(&delayed_refs->root, &ref->rb_node); if (existing) { diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index f0cad5ae5be7..c62f02f6ae69 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -151,7 +151,7 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root ret = PTR_ERR(dir_item); if (ret == -EEXIST) goto second_insert; - goto out; + goto out_free; } leaf = path->nodes[0]; @@ -170,7 +170,7 @@ second_insert: /* FIXME, use some real flag for selecting the extra index */ if (root == root->fs_info->tree_root) { ret = 0; - goto out; + goto out_free; } btrfs_release_path(root, path); @@ -180,7 +180,7 @@ second_insert: name, name_len); if (IS_ERR(dir_item)) { ret2 = PTR_ERR(dir_item); - goto out; + goto out_free; } leaf = path->nodes[0]; btrfs_cpu_key_to_disk(&disk_key, location); @@ -192,7 +192,9 @@ second_insert: name_ptr = (unsigned long)(dir_item + 1); 
write_extent_buffer(leaf, name, name_ptr, name_len); btrfs_mark_buffer_dirty(leaf); -out: + +out_free: + btrfs_free_path(path); if (ret) return ret; @@ -377,6 +379,9 @@ struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root, leaf = path->nodes[0]; dir_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item); + if (verify_dir_item(root, leaf, dir_item)) + return NULL; + total_len = btrfs_item_size_nr(leaf, path->slots[0]); while (cur < total_len) { this_len = sizeof(*dir_item) + @@ -429,3 +434,35 @@ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans, } return ret; } + +int verify_dir_item(struct btrfs_root *root, + struct extent_buffer *leaf, + struct btrfs_dir_item *dir_item) +{ + u16 namelen = BTRFS_NAME_LEN; + u8 type = btrfs_dir_type(leaf, dir_item); + + if (type >= BTRFS_FT_MAX) { + printk(KERN_CRIT "btrfs: invalid dir item type: %d\n", + (int)type); + return 1; + } + + if (type == BTRFS_FT_XATTR) + namelen = XATTR_NAME_MAX; + + if (btrfs_dir_name_len(leaf, dir_item) > namelen) { + printk(KERN_CRIT "btrfS: invalid dir item name len: %u\n", + (unsigned)btrfs_dir_data_len(leaf, dir_item)); + return 1; + } + + /* BTRFS_MAX_XATTR_SIZE is the same for all dir items */ + if (btrfs_dir_data_len(leaf, dir_item) > BTRFS_MAX_XATTR_SIZE(root)) { + printk(KERN_CRIT "btrfs: invalid dir item data len: %u\n", + (unsigned)btrfs_dir_data_len(leaf, dir_item)); + return 1; + } + + return 0; +} diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 51d2e4de34eb..68c84c8c24bd 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -29,6 +29,7 @@ #include <linux/crc32c.h> #include <linux/slab.h> #include <linux/migrate.h> +#include <asm/unaligned.h> #include "compat.h" #include "ctree.h" #include "disk-io.h" @@ -44,6 +45,20 @@ static struct extent_io_ops btree_extent_io_ops; static void end_workqueue_fn(struct btrfs_work *work); static void free_fs_root(struct btrfs_root *root); +static void btrfs_check_super_valid(struct btrfs_fs_info *fs_info, + int read_only); +static int btrfs_destroy_ordered_operations(struct btrfs_root *root); +static int btrfs_destroy_ordered_extents(struct btrfs_root *root); +static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, + struct btrfs_root *root); +static int btrfs_destroy_pending_snapshots(struct btrfs_transaction *t); +static int btrfs_destroy_delalloc_inodes(struct btrfs_root *root); +static int btrfs_destroy_marked_extents(struct btrfs_root *root, + struct extent_io_tree *dirty_pages, + int mark); +static int btrfs_destroy_pinned_extent(struct btrfs_root *root, + struct extent_io_tree *pinned_extents); +static int btrfs_cleanup_transaction(struct btrfs_root *root); /* * end_io_wq structs are used to do processing in task context when an IO is @@ -184,7 +199,7 @@ u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len) void btrfs_csum_final(u32 crc, char *result) { - *(__le32 *)result = ~cpu_to_le32(crc); + put_unaligned_le32(~crc, result); } /* @@ -309,6 +324,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root, int num_copies = 0; int mirror_num = 0; + clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree; while (1) { ret = read_extent_buffer_pages(io_tree, eb, start, 1, @@ -317,6 +333,14 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root, !verify_parent_transid(io_tree, eb, parent_transid)) return ret; + /* + * This buffer's crc is fine, but its contents are corrupted, so + * there is no reason to 
read the other copies, they won't be + * any less wrong. + */ + if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags)) + return ret; + num_copies = btrfs_num_copies(&root->fs_info->mapping_tree, eb->start, eb->len); if (num_copies == 1) @@ -345,14 +369,22 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page) tree = &BTRFS_I(page->mapping->host)->io_tree; - if (page->private == EXTENT_PAGE_PRIVATE) + if (page->private == EXTENT_PAGE_PRIVATE) { + WARN_ON(1); goto out; - if (!page->private) + } + if (!page->private) { + WARN_ON(1); goto out; + } len = page->private >> 2; WARN_ON(len == 0); eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS); + if (eb == NULL) { + WARN_ON(1); + goto out; + } ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE, btrfs_header_generation(eb)); BUG_ON(ret); @@ -397,6 +429,73 @@ static int check_tree_block_fsid(struct btrfs_root *root, return ret; } +#define CORRUPT(reason, eb, root, slot) \ + printk(KERN_CRIT "btrfs: corrupt leaf, %s: block=%llu," \ + "root=%llu, slot=%d\n", reason, \ + (unsigned long long)btrfs_header_bytenr(eb), \ + (unsigned long long)root->objectid, slot) + +static noinline int check_leaf(struct btrfs_root *root, + struct extent_buffer *leaf) +{ + struct btrfs_key key; + struct btrfs_key leaf_key; + u32 nritems = btrfs_header_nritems(leaf); + int slot; + + if (nritems == 0) + return 0; + + /* Check the 0 item */ + if (btrfs_item_offset_nr(leaf, 0) + btrfs_item_size_nr(leaf, 0) != + BTRFS_LEAF_DATA_SIZE(root)) { + CORRUPT("invalid item offset size pair", leaf, root, 0); + return -EIO; + } + + /* + * Check to make sure each items keys are in the correct order and their + * offsets make sense. We only have to loop through nritems-1 because + * we check the current slot against the next slot, which verifies the + * next slot's offset+size makes sense and that the current's slot + * offset is correct. + */ + for (slot = 0; slot < nritems - 1; slot++) { + btrfs_item_key_to_cpu(leaf, &leaf_key, slot); + btrfs_item_key_to_cpu(leaf, &key, slot + 1); + + /* Make sure the keys are in the right order */ + if (btrfs_comp_cpu_keys(&leaf_key, &key) >= 0) { + CORRUPT("bad key order", leaf, root, slot); + return -EIO; + } + + /* + * Make sure the offset and ends are right, remember that the + * item data starts at the end of the leaf and grows towards the + * front. + */ + if (btrfs_item_offset_nr(leaf, slot) != + btrfs_item_end_nr(leaf, slot + 1)) { + CORRUPT("slot offset bad", leaf, root, slot); + return -EIO; + } + + /* + * Check to make sure that we don't point outside of the leaf, + * just incase all the items are consistent to eachother, but + * all point outside of the leaf. 
+ */ + if (btrfs_item_end_nr(leaf, slot) > + BTRFS_LEAF_DATA_SIZE(root)) { + CORRUPT("slot end outside of leaf", leaf, root, slot); + return -EIO; + } + } + + return 0; +} + #ifdef CONFIG_DEBUG_LOCK_ALLOC void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level) { @@ -427,6 +526,10 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, WARN_ON(len == 0); eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS); + if (eb == NULL) { + ret = -EIO; + goto out; + } found_start = btrfs_header_bytenr(eb); if (found_start != start) { @@ -459,8 +562,20 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, btrfs_set_buffer_lockdep_class(eb, found_level); ret = csum_tree_block(root, eb, 1); - if (ret) + if (ret) { + ret = -EIO; + goto err; + } + + /* + * If this is a leaf block and it is corrupt, set the corrupt bit so + * that we don't try and read the other copies of this block, just + * return -EIO. + */ + if (found_level == 0 && check_leaf(root, eb)) { + set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); ret = -EIO; + } end = min_t(u64, eb->len, PAGE_CACHE_SIZE); end = eb->start + end - 1; @@ -821,7 +936,6 @@ static const struct address_space_operations btree_aops = { .writepages = btree_writepages, .releasepage = btree_releasepage, .invalidatepage = btree_invalidatepage, - .sync_page = block_sync_page, #ifdef CONFIG_MIGRATION .migratepage = btree_migratepage, #endif @@ -1134,7 +1248,10 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, root, fs_info, location->objectid); path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) { + kfree(root); + return ERR_PTR(-ENOMEM); + } ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0); if (ret == 0) { l = path->nodes[0]; @@ -1145,6 +1262,7 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, } btrfs_free_path(path); if (ret) { + kfree(root); if (ret > 0) ret = -ENOENT; return ERR_PTR(ret); @@ -1157,8 +1275,10 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, root->commit_root = btrfs_root_node(root); BUG_ON(!root->node); out: - if (location->objectid != BTRFS_TREE_LOG_OBJECTID) + if (location->objectid != BTRFS_TREE_LOG_OBJECTID) { root->ref_cows = 1; + btrfs_check_and_init_root_item(&root->root_item); + } return root; } @@ -1304,82 +1424,6 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits) } /* - * this unplugs every device on the box, and it is only used when page - * is null - */ -static void __unplug_io_fn(struct backing_dev_info *bdi, struct page *page) -{ - struct btrfs_device *device; - struct btrfs_fs_info *info; - - info = (struct btrfs_fs_info *)bdi->unplug_io_data; - list_for_each_entry(device, &info->fs_devices->devices, dev_list) { - if (!device->bdev) - continue; - - bdi = blk_get_backing_dev_info(device->bdev); - if (bdi->unplug_io_fn) - bdi->unplug_io_fn(bdi, page); - } -} - -static void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) -{ - struct inode *inode; - struct extent_map_tree *em_tree; - struct extent_map *em; - struct address_space *mapping; - u64 offset; - - /* the generic O_DIRECT read code does this */ - if (1 || !page) { - __unplug_io_fn(bdi, page); - return; - } - - /* - * page->mapping may change at any time. 
Get a consistent copy - * and use that for everything below - */ - smp_mb(); - mapping = page->mapping; - if (!mapping) - return; - - inode = mapping->host; - - /* - * don't do the expensive searching for a small number of - * devices - */ - if (BTRFS_I(inode)->root->fs_info->fs_devices->open_devices <= 2) { - __unplug_io_fn(bdi, page); - return; - } - - offset = page_offset(page); - - em_tree = &BTRFS_I(inode)->extent_tree; - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE); - read_unlock(&em_tree->lock); - if (!em) { - __unplug_io_fn(bdi, page); - return; - } - - if (em->block_start >= EXTENT_MAP_LAST_BYTE) { - free_extent_map(em); - __unplug_io_fn(bdi, page); - return; - } - offset = offset - em->start; - btrfs_unplug_page(&BTRFS_I(inode)->root->fs_info->mapping_tree, - em->block_start + offset, page); - free_extent_map(em); -} - -/* * If this fails, caller must call bdi_destroy() to get rid of the * bdi again. */ @@ -1393,8 +1437,6 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi) return err; bdi->ra_pages = default_backing_dev_info.ra_pages; - bdi->unplug_io_fn = btrfs_unplug_io_fn; - bdi->unplug_io_data = info; bdi->congested_fn = btrfs_congested_fn; bdi->congested_data = info; return 0; @@ -1527,6 +1569,7 @@ static int transaction_kthread(void *arg) spin_unlock(&root->fs_info->new_trans_lock); trans = btrfs_join_transaction(root, 1); + BUG_ON(IS_ERR(trans)); if (transid == trans->transid) { ret = btrfs_commit_transaction(trans, root); BUG_ON(ret); @@ -1604,6 +1647,8 @@ struct btrfs_root *open_ctree(struct super_block *sb, goto fail_bdi; } + fs_info->btree_inode->i_mapping->flags &= ~__GFP_FS; + INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); INIT_LIST_HEAD(&fs_info->trans_list); INIT_LIST_HEAD(&fs_info->dead_roots); @@ -1713,8 +1758,10 @@ struct btrfs_root *open_ctree(struct super_block *sb, fs_info, BTRFS_ROOT_TREE_OBJECTID); bh = btrfs_read_dev_super(fs_devices->latest_bdev); - if (!bh) + if (!bh) { + err = -EINVAL; goto fail_iput; + } memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy)); memcpy(&fs_info->super_for_commit, &fs_info->super_copy, @@ -1727,6 +1774,17 @@ struct btrfs_root *open_ctree(struct super_block *sb, if (!btrfs_super_root(disk_super)) goto fail_iput; + /* check FS state, whether FS is broken. */ + fs_info->fs_state |= btrfs_super_flags(disk_super); + + btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY); + + /* + * In the long term, we'll store the compression type in the super + * block, and it'll be used for per file compression control. 
+ */ + fs_info->compress_type = BTRFS_COMPRESS_ZLIB; + ret = btrfs_parse_options(tree_root, options); if (ret) { err = ret; @@ -1744,10 +1802,10 @@ struct btrfs_root *open_ctree(struct super_block *sb, } features = btrfs_super_incompat_flags(disk_super); - if (!(features & BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF)) { - features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF; - btrfs_set_super_incompat_flags(disk_super, features); - } + features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF; + if (tree_root->fs_info->compress_type & BTRFS_COMPRESS_LZO) + features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO; + btrfs_set_super_incompat_flags(disk_super, features); features = btrfs_super_compat_ro_flags(disk_super) & ~BTRFS_FEATURE_COMPAT_RO_SUPP; @@ -1932,6 +1990,12 @@ struct btrfs_root *open_ctree(struct super_block *sb, fs_info->metadata_alloc_profile = (u64)-1; fs_info->system_alloc_profile = fs_info->metadata_alloc_profile; + ret = btrfs_init_space_info(fs_info); + if (ret) { + printk(KERN_ERR "Failed to initial space info: %d\n", ret); + goto fail_block_groups; + } + ret = btrfs_read_block_groups(extent_root); if (ret) { printk(KERN_ERR "Failed to read block groups: %d\n", ret); @@ -1957,7 +2021,9 @@ struct btrfs_root *open_ctree(struct super_block *sb, btrfs_set_opt(fs_info->mount_opt, SSD); } - if (btrfs_super_log_root(disk_super) != 0) { + /* do not make disk changes in broken FS */ + if (btrfs_super_log_root(disk_super) != 0 && + !(fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)) { u64 bytenr = btrfs_super_log_root(disk_super); if (fs_devices->rw_devices == 0) { @@ -2021,9 +2087,14 @@ struct btrfs_root *open_ctree(struct super_block *sb, if (!(sb->s_flags & MS_RDONLY)) { down_read(&fs_info->cleanup_work_sem); - btrfs_orphan_cleanup(fs_info->fs_root); - btrfs_orphan_cleanup(fs_info->tree_root); + err = btrfs_orphan_cleanup(fs_info->fs_root); + if (!err) + err = btrfs_orphan_cleanup(fs_info->tree_root); up_read(&fs_info->cleanup_work_sem); + if (err) { + close_ctree(tree_root); + return ERR_PTR(err); + } } return tree_root; @@ -2398,8 +2469,12 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) root_objectid = gang[ret - 1]->root_key.objectid + 1; for (i = 0; i < ret; i++) { + int err; + root_objectid = gang[i]->root_key.objectid; - btrfs_orphan_cleanup(gang[i]); + err = btrfs_orphan_cleanup(gang[i]); + if (err) + return err; } root_objectid++; } @@ -2421,10 +2496,14 @@ int btrfs_commit_super(struct btrfs_root *root) up_write(&root->fs_info->cleanup_work_sem); trans = btrfs_join_transaction(root, 1); + if (IS_ERR(trans)) + return PTR_ERR(trans); ret = btrfs_commit_transaction(trans, root); BUG_ON(ret); /* run commit again to drop the original snapshot */ trans = btrfs_join_transaction(root, 1); + if (IS_ERR(trans)) + return PTR_ERR(trans); btrfs_commit_transaction(trans, root); ret = btrfs_write_and_wait_transaction(NULL, root); BUG_ON(ret); @@ -2442,8 +2521,28 @@ int close_ctree(struct btrfs_root *root) smp_mb(); btrfs_put_block_group_cache(fs_info); + + /* + * Here come 2 situations when btrfs is broken to flip readonly: + * + * 1. when btrfs flips readonly somewhere else before + * btrfs_commit_super, sb->s_flags has MS_RDONLY flag, + * and btrfs will skip to write sb directly to keep + * ERROR state on disk. + * + * 2. when btrfs flips readonly just in btrfs_commit_super, + * and in such case, btrfs cannot write sb via btrfs_commit_super, + * and since fs_state has been set BTRFS_SUPER_FLAG_ERROR flag, + * btrfs will cleanup all FS resources first and write sb then. 
+ */ if (!(fs_info->sb->s_flags & MS_RDONLY)) { - ret = btrfs_commit_super(root); + ret = btrfs_commit_super(root); + if (ret) + printk(KERN_ERR "btrfs: commit super ret %d\n", ret); + } + + if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { + ret = btrfs_error_commit_super(root); if (ret) printk(KERN_ERR "btrfs: commit super ret %d\n", ret); } @@ -2502,6 +2601,8 @@ int close_ctree(struct btrfs_root *root) kfree(fs_info->chunk_root); kfree(fs_info->dev_root); kfree(fs_info->csum_root); + kfree(fs_info); + return 0; } @@ -2619,6 +2720,355 @@ out: return 0; } +static void btrfs_check_super_valid(struct btrfs_fs_info *fs_info, + int read_only) +{ + if (read_only) + return; + + if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) + printk(KERN_WARNING "warning: mount fs with errors, " + "running btrfsck is recommended\n"); +} + +int btrfs_error_commit_super(struct btrfs_root *root) +{ + int ret; + + mutex_lock(&root->fs_info->cleaner_mutex); + btrfs_run_delayed_iputs(root); + mutex_unlock(&root->fs_info->cleaner_mutex); + + down_write(&root->fs_info->cleanup_work_sem); + up_write(&root->fs_info->cleanup_work_sem); + + /* cleanup FS via transaction */ + btrfs_cleanup_transaction(root); + + ret = write_ctree_super(NULL, root, 0); + + return ret; +} + +static int btrfs_destroy_ordered_operations(struct btrfs_root *root) +{ + struct btrfs_inode *btrfs_inode; + struct list_head splice; + + INIT_LIST_HEAD(&splice); + + mutex_lock(&root->fs_info->ordered_operations_mutex); + spin_lock(&root->fs_info->ordered_extent_lock); + + list_splice_init(&root->fs_info->ordered_operations, &splice); + while (!list_empty(&splice)) { + btrfs_inode = list_entry(splice.next, struct btrfs_inode, + ordered_operations); + + list_del_init(&btrfs_inode->ordered_operations); + + btrfs_invalidate_inodes(btrfs_inode->root); + } + + spin_unlock(&root->fs_info->ordered_extent_lock); + mutex_unlock(&root->fs_info->ordered_operations_mutex); + + return 0; +} + +static int btrfs_destroy_ordered_extents(struct btrfs_root *root) +{ + struct list_head splice; + struct btrfs_ordered_extent *ordered; + struct inode *inode; + + INIT_LIST_HEAD(&splice); + + spin_lock(&root->fs_info->ordered_extent_lock); + + list_splice_init(&root->fs_info->ordered_extents, &splice); + while (!list_empty(&splice)) { + ordered = list_entry(splice.next, struct btrfs_ordered_extent, + root_extent_list); + + list_del_init(&ordered->root_extent_list); + atomic_inc(&ordered->refs); + + /* the inode may be getting freed (in sys_unlink path). 
*/ + inode = igrab(ordered->inode); + + spin_unlock(&root->fs_info->ordered_extent_lock); + if (inode) + iput(inode); + + atomic_set(&ordered->refs, 1); + btrfs_put_ordered_extent(ordered); + + spin_lock(&root->fs_info->ordered_extent_lock); + } + + spin_unlock(&root->fs_info->ordered_extent_lock); + + return 0; +} + +static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, + struct btrfs_root *root) +{ + struct rb_node *node; + struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_delayed_ref_node *ref; + int ret = 0; + + delayed_refs = &trans->delayed_refs; + + spin_lock(&delayed_refs->lock); + if (delayed_refs->num_entries == 0) { + printk(KERN_INFO "delayed_refs has NO entry\n"); + return ret; + } + + node = rb_first(&delayed_refs->root); + while (node) { + ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); + node = rb_next(node); + + ref->in_tree = 0; + rb_erase(&ref->rb_node, &delayed_refs->root); + delayed_refs->num_entries--; + + atomic_set(&ref->refs, 1); + if (btrfs_delayed_ref_is_head(ref)) { + struct btrfs_delayed_ref_head *head; + + head = btrfs_delayed_node_to_head(ref); + mutex_lock(&head->mutex); + kfree(head->extent_op); + delayed_refs->num_heads--; + if (list_empty(&head->cluster)) + delayed_refs->num_heads_ready--; + list_del_init(&head->cluster); + mutex_unlock(&head->mutex); + } + + spin_unlock(&delayed_refs->lock); + btrfs_put_delayed_ref(ref); + + cond_resched(); + spin_lock(&delayed_refs->lock); + } + + spin_unlock(&delayed_refs->lock); + + return ret; +} + +static int btrfs_destroy_pending_snapshots(struct btrfs_transaction *t) +{ + struct btrfs_pending_snapshot *snapshot; + struct list_head splice; + + INIT_LIST_HEAD(&splice); + + list_splice_init(&t->pending_snapshots, &splice); + + while (!list_empty(&splice)) { + snapshot = list_entry(splice.next, + struct btrfs_pending_snapshot, + list); + + list_del_init(&snapshot->list); + + kfree(snapshot); + } + + return 0; +} + +static int btrfs_destroy_delalloc_inodes(struct btrfs_root *root) +{ + struct btrfs_inode *btrfs_inode; + struct list_head splice; + + INIT_LIST_HEAD(&splice); + + list_splice_init(&root->fs_info->delalloc_inodes, &splice); + + spin_lock(&root->fs_info->delalloc_lock); + + while (!list_empty(&splice)) { + btrfs_inode = list_entry(splice.next, struct btrfs_inode, + delalloc_inodes); + + list_del_init(&btrfs_inode->delalloc_inodes); + + btrfs_invalidate_inodes(btrfs_inode->root); + } + + spin_unlock(&root->fs_info->delalloc_lock); + + return 0; +} + +static int btrfs_destroy_marked_extents(struct btrfs_root *root, + struct extent_io_tree *dirty_pages, + int mark) +{ + int ret; + struct page *page; + struct inode *btree_inode = root->fs_info->btree_inode; + struct extent_buffer *eb; + u64 start = 0; + u64 end; + u64 offset; + unsigned long index; + + while (1) { + ret = find_first_extent_bit(dirty_pages, start, &start, &end, + mark); + if (ret) + break; + + clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS); + while (start <= end) { + index = start >> PAGE_CACHE_SHIFT; + start = (u64)(index + 1) << PAGE_CACHE_SHIFT; + page = find_get_page(btree_inode->i_mapping, index); + if (!page) + continue; + offset = page_offset(page); + + spin_lock(&dirty_pages->buffer_lock); + eb = radix_tree_lookup( + &(&BTRFS_I(page->mapping->host)->io_tree)->buffer, + offset >> PAGE_CACHE_SHIFT); + spin_unlock(&dirty_pages->buffer_lock); + if (eb) { + ret = test_and_clear_bit(EXTENT_BUFFER_DIRTY, + &eb->bflags); + atomic_set(&eb->refs, 1); + } + if (PageWriteback(page)) + 
end_page_writeback(page); + + lock_page(page); + if (PageDirty(page)) { + clear_page_dirty_for_io(page); + spin_lock_irq(&page->mapping->tree_lock); + radix_tree_tag_clear(&page->mapping->page_tree, + page_index(page), + PAGECACHE_TAG_DIRTY); + spin_unlock_irq(&page->mapping->tree_lock); + } + + page->mapping->a_ops->invalidatepage(page, 0); + unlock_page(page); + } + } + + return ret; +} + +static int btrfs_destroy_pinned_extent(struct btrfs_root *root, + struct extent_io_tree *pinned_extents) +{ + struct extent_io_tree *unpin; + u64 start; + u64 end; + int ret; + + unpin = pinned_extents; + while (1) { + ret = find_first_extent_bit(unpin, 0, &start, &end, + EXTENT_DIRTY); + if (ret) + break; + + /* opt_discard */ + if (btrfs_test_opt(root, DISCARD)) + ret = btrfs_error_discard_extent(root, start, + end + 1 - start, + NULL); + + clear_extent_dirty(unpin, start, end, GFP_NOFS); + btrfs_error_unpin_extent_range(root, start, end); + cond_resched(); + } + + return 0; +} + +static int btrfs_cleanup_transaction(struct btrfs_root *root) +{ + struct btrfs_transaction *t; + LIST_HEAD(list); + + WARN_ON(1); + + mutex_lock(&root->fs_info->trans_mutex); + mutex_lock(&root->fs_info->transaction_kthread_mutex); + + list_splice_init(&root->fs_info->trans_list, &list); + while (!list_empty(&list)) { + t = list_entry(list.next, struct btrfs_transaction, list); + if (!t) + break; + + btrfs_destroy_ordered_operations(root); + + btrfs_destroy_ordered_extents(root); + + btrfs_destroy_delayed_refs(t, root); + + btrfs_block_rsv_release(root, + &root->fs_info->trans_block_rsv, + t->dirty_pages.dirty_bytes); + + /* FIXME: cleanup wait for commit */ + t->in_commit = 1; + t->blocked = 1; + if (waitqueue_active(&root->fs_info->transaction_blocked_wait)) + wake_up(&root->fs_info->transaction_blocked_wait); + + t->blocked = 0; + if (waitqueue_active(&root->fs_info->transaction_wait)) + wake_up(&root->fs_info->transaction_wait); + mutex_unlock(&root->fs_info->trans_mutex); + + mutex_lock(&root->fs_info->trans_mutex); + t->commit_done = 1; + if (waitqueue_active(&t->commit_wait)) + wake_up(&t->commit_wait); + mutex_unlock(&root->fs_info->trans_mutex); + + mutex_lock(&root->fs_info->trans_mutex); + + btrfs_destroy_pending_snapshots(t); + + btrfs_destroy_delalloc_inodes(root); + + spin_lock(&root->fs_info->new_trans_lock); + root->fs_info->running_transaction = NULL; + spin_unlock(&root->fs_info->new_trans_lock); + + btrfs_destroy_marked_extents(root, &t->dirty_pages, + EXTENT_DIRTY); + + btrfs_destroy_pinned_extent(root, + root->fs_info->pinned_extents); + + atomic_set(&t->use_count, 0); + list_del_init(&t->list); + memset(t, 0, sizeof(*t)); + kmem_cache_free(btrfs_transaction_cachep, t); + } + + mutex_unlock(&root->fs_info->transaction_kthread_mutex); + mutex_unlock(&root->fs_info->trans_mutex); + + return 0; +} + static struct extent_io_ops btree_extent_io_ops = { .write_cache_pages_lock_hook = btree_lock_page_hook, .readpage_end_io_hook = btree_readpage_end_io_hook, diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 88e825a0bf21..07b20dc2fd95 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -52,6 +52,7 @@ int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root *root, int max_mirrors); struct buffer_head *btrfs_read_dev_super(struct block_device *bdev); int btrfs_commit_super(struct btrfs_root *root); +int btrfs_error_commit_super(struct btrfs_root *root); struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize); struct btrfs_root 
*btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index 9786963b07e5..b4ffad859adb 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -21,9 +21,13 @@ static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len, int len = *max_len; int type; - if ((len < BTRFS_FID_SIZE_NON_CONNECTABLE) || - (connectable && len < BTRFS_FID_SIZE_CONNECTABLE)) + if (connectable && (len < BTRFS_FID_SIZE_CONNECTABLE)) { + *max_len = BTRFS_FID_SIZE_CONNECTABLE; return 255; + } else if (len < BTRFS_FID_SIZE_NON_CONNECTABLE) { + *max_len = BTRFS_FID_SIZE_NON_CONNECTABLE; + return 255; + } len = BTRFS_FID_SIZE_NON_CONNECTABLE; type = FILEID_BTRFS_WITHOUT_PARENT; @@ -171,6 +175,8 @@ static struct dentry *btrfs_get_parent(struct dentry *child) int ret; path = btrfs_alloc_path(); + if (!path) + return ERR_PTR(-ENOMEM); if (dir->i_ino == BTRFS_FIRST_FREE_OBJECTID) { key.objectid = root->root_key.objectid; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 227e5815d838..31f33ba56fe8 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -33,11 +33,28 @@ #include "locking.h" #include "free-space-cache.h" +/* control flags for do_chunk_alloc's force field + * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk + * if we really need one. + * + * CHUNK_ALLOC_FORCE means it must try to allocate one + * + * CHUNK_ALLOC_LIMITED means to only try and allocate one + * if we have very few chunks already allocated. This is + * used as part of the clustering code to help make sure + * we have a good pool of storage to cluster in, without + * filling the FS with empty chunks + * + */ +enum { + CHUNK_ALLOC_NO_FORCE = 0, + CHUNK_ALLOC_FORCE = 1, + CHUNK_ALLOC_LIMITED = 2, +}; + static int update_block_group(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, int alloc); -static int update_reserved_bytes(struct btrfs_block_group_cache *cache, - u64 num_bytes, int reserve, int sinfo); static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, @@ -320,11 +337,6 @@ static int caching_kthread(void *data) if (!path) return -ENOMEM; - exclude_super_stripes(extent_root, block_group); - spin_lock(&block_group->space_info->lock); - block_group->space_info->bytes_readonly += block_group->bytes_super; - spin_unlock(&block_group->space_info->lock); - last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); /* @@ -447,7 +459,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, * allocate blocks for the tree root we can't do the fast caching since * we likely hold important locks. 
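The export.c change above follows the usual exportfs contract: when the caller's file-handle buffer is too small, write back the size that would have been needed and return 255 so the caller can retry with a bigger buffer. A stripped-down sketch of that convention (the size constants and handle payload are placeholders, not the btrfs values):

#include <stdio.h>

#define FID_SIZE_NON_CONNECTABLE 5	/* placeholder sizes, in 32-bit words */
#define FID_SIZE_CONNECTABLE     10

/* returns a handle type, or 255 if *max_len was too small */
static int encode_fh(unsigned int *fh, int *max_len, int connectable)
{
	int len = *max_len;

	if (connectable && len < FID_SIZE_CONNECTABLE) {
		*max_len = FID_SIZE_CONNECTABLE;	/* tell the caller what we need */
		return 255;
	} else if (len < FID_SIZE_NON_CONNECTABLE) {
		*max_len = FID_SIZE_NON_CONNECTABLE;
		return 255;
	}

	fh[0] = 0xb7f5;					/* fill in the real handle here */
	*max_len = connectable ? FID_SIZE_CONNECTABLE : FID_SIZE_NON_CONNECTABLE;
	return 1;					/* handle type */
}

int main(void)
{
	unsigned int fh[16];
	int len = 2;

	if (encode_fh(fh, &len, 1) == 255)
		printf("buffer too small, need %d words\n", len);
	return 0;
}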
*/ - if (!trans->transaction->in_commit && + if (trans && (!trans->transaction->in_commit) && (root && root != root->fs_info->tree_root)) { spin_lock(&cache->lock); if (cache->cached != BTRFS_CACHE_NO) { @@ -467,14 +479,16 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, cache->cached = BTRFS_CACHE_NO; } spin_unlock(&cache->lock); - if (ret == 1) + if (ret == 1) { + free_excluded_extents(fs_info->extent_root, cache); return 0; + } } if (load_cache_only) return 0; - caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_KERNEL); + caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS); BUG_ON(!caching_ctl); INIT_LIST_HEAD(&caching_ctl->list); @@ -1743,39 +1757,45 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans, return ret; } -static void btrfs_issue_discard(struct block_device *bdev, +static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len) { - blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL, 0); + return blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_NOFS, 0); } static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, - u64 num_bytes) + u64 num_bytes, u64 *actual_bytes) { int ret; - u64 map_length = num_bytes; + u64 discarded_bytes = 0; struct btrfs_multi_bio *multi = NULL; - if (!btrfs_test_opt(root, DISCARD)) - return 0; /* Tell the block device(s) that the sectors can be discarded */ - ret = btrfs_map_block(&root->fs_info->mapping_tree, READ, - bytenr, &map_length, &multi, 0); + ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD, + bytenr, &num_bytes, &multi, 0); if (!ret) { struct btrfs_bio_stripe *stripe = multi->stripes; int i; - if (map_length > num_bytes) - map_length = num_bytes; for (i = 0; i < multi->num_stripes; i++, stripe++) { - btrfs_issue_discard(stripe->dev->bdev, - stripe->physical, - map_length); + ret = btrfs_issue_discard(stripe->dev->bdev, + stripe->physical, + stripe->length); + if (!ret) + discarded_bytes += stripe->length; + else if (ret != -EOPNOTSUPP) + break; } kfree(multi); } + if (discarded_bytes && ret == -EOPNOTSUPP) + ret = 0; + + if (actual_bytes) + *actual_bytes = discarded_bytes; + return ret; } @@ -3018,7 +3038,8 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, found->bytes_readonly = 0; found->bytes_may_use = 0; found->full = 0; - found->force_alloc = 0; + found->force_alloc = CHUNK_ALLOC_NO_FORCE; + found->chunk_alloc = 0; *space_info = found; list_add_rcu(&found->list, &info->space_info); atomic_set(&found->caching_threads, 0); @@ -3089,7 +3110,7 @@ static u64 get_alloc_profile(struct btrfs_root *root, u64 flags) return btrfs_reduce_alloc_profile(root, flags); } -static u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) +u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) { u64 flags; @@ -3149,7 +3170,7 @@ again: if (!data_sinfo->full && alloc_chunk) { u64 alloc_target; - data_sinfo->force_alloc = 1; + data_sinfo->force_alloc = CHUNK_ALLOC_FORCE; spin_unlock(&data_sinfo->lock); alloc: alloc_target = btrfs_get_alloc_profile(root, 1); @@ -3159,10 +3180,15 @@ alloc: ret = do_chunk_alloc(trans, root->fs_info->extent_root, bytes + 2 * 1024 * 1024, - alloc_target, 0); + alloc_target, + CHUNK_ALLOC_NO_FORCE); btrfs_end_transaction(trans, root); - if (ret < 0) - return ret; + if (ret < 0) { + if (ret != -ENOSPC) + return ret; + else + goto commit_trans; + } if (!data_sinfo) { btrfs_set_inode_space_info(root, inode); @@ -3173,6 +3199,7 @@ alloc: spin_unlock(&data_sinfo->lock); /* commit the current transaction and try 
again */ +commit_trans: if (!committed && !root->fs_info->open_ioctl_trans) { committed = 1; trans = btrfs_join_transaction(root, 1); @@ -3233,31 +3260,56 @@ static void force_metadata_allocation(struct btrfs_fs_info *info) rcu_read_lock(); list_for_each_entry_rcu(found, head, list) { if (found->flags & BTRFS_BLOCK_GROUP_METADATA) - found->force_alloc = 1; + found->force_alloc = CHUNK_ALLOC_FORCE; } rcu_read_unlock(); } static int should_alloc_chunk(struct btrfs_root *root, - struct btrfs_space_info *sinfo, u64 alloc_bytes) + struct btrfs_space_info *sinfo, u64 alloc_bytes, + int force) { u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly; + u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved; u64 thresh; - if (sinfo->bytes_used + sinfo->bytes_reserved + - alloc_bytes + 256 * 1024 * 1024 < num_bytes) + if (force == CHUNK_ALLOC_FORCE) + return 1; + + /* + * in limited mode, we want to have some free space up to + * about 1% of the FS size. + */ + if (force == CHUNK_ALLOC_LIMITED) { + thresh = btrfs_super_total_bytes(&root->fs_info->super_copy); + thresh = max_t(u64, 64 * 1024 * 1024, + div_factor_fine(thresh, 1)); + + if (num_bytes - num_allocated < thresh) + return 1; + } + + /* + * we have two similar checks here, one based on percentage + * and once based on a hard number of 256MB. The idea + * is that if we have a good amount of free + * room, don't allocate a chunk. A good mount is + * less than 80% utilized of the chunks we have allocated, + * or more than 256MB free + */ + if (num_allocated + alloc_bytes + 256 * 1024 * 1024 < num_bytes) return 0; - if (sinfo->bytes_used + sinfo->bytes_reserved + - alloc_bytes < div_factor(num_bytes, 8)) + if (num_allocated + alloc_bytes < div_factor(num_bytes, 8)) return 0; thresh = btrfs_super_total_bytes(&root->fs_info->super_copy); + + /* 256MB or 5% of the FS */ thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5)); if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 3)) return 0; - return 1; } @@ -3267,10 +3319,9 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, { struct btrfs_space_info *space_info; struct btrfs_fs_info *fs_info = extent_root->fs_info; + int wait_for_alloc = 0; int ret = 0; - mutex_lock(&fs_info->chunk_mutex); - flags = btrfs_reduce_alloc_profile(extent_root, flags); space_info = __find_space_info(extent_root->fs_info, flags); @@ -3281,21 +3332,40 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, } BUG_ON(!space_info); +again: spin_lock(&space_info->lock); if (space_info->force_alloc) - force = 1; + force = space_info->force_alloc; if (space_info->full) { spin_unlock(&space_info->lock); - goto out; + return 0; } - if (!force && !should_alloc_chunk(extent_root, space_info, - alloc_bytes)) { + if (!should_alloc_chunk(extent_root, space_info, alloc_bytes, force)) { spin_unlock(&space_info->lock); - goto out; + return 0; + } else if (space_info->chunk_alloc) { + wait_for_alloc = 1; + } else { + space_info->chunk_alloc = 1; } + spin_unlock(&space_info->lock); + mutex_lock(&fs_info->chunk_mutex); + + /* + * The chunk_mutex is held throughout the entirety of a chunk + * allocation, so once we've acquired the chunk_mutex we know that the + * other guy is done and we need to recheck and see if we should + * allocate. + */ + if (wait_for_alloc) { + mutex_unlock(&fs_info->chunk_mutex); + wait_for_alloc = 0; + goto again; + } + /* * If we have mixed data/metadata chunks we want to make sure we keep * allocating mixed chunks instead of individual chunks. 
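The rewritten should_alloc_chunk() boils down to a small set of thresholds: always allocate when forced, allocate in limited mode when free space drops below roughly 1% of the filesystem, and otherwise skip allocation while at least 256MB of slack (or less than 80% utilization) remains. A self-contained sketch of that decision, with the percentages hard-coded for clarity (an approximation of the logic in the hunk above, not a literal copy):

#include <stdint.h>
#include <stdio.h>

enum { ALLOC_NO_FORCE, ALLOC_FORCE, ALLOC_LIMITED };

static uint64_t max_u64(uint64_t a, uint64_t b) { return a > b ? a : b; }

static int should_alloc_chunk(uint64_t total, uint64_t allocated,
			      uint64_t alloc_bytes, int force)
{
	uint64_t thresh;

	if (force == ALLOC_FORCE)
		return 1;

	/* limited mode: keep about 1% (at least 64MB) of the fs free */
	if (force == ALLOC_LIMITED) {
		thresh = max_u64(64ULL << 20, total / 100);
		if (total - allocated < thresh)
			return 1;
	}

	/* plenty of room left (256MB of slack): don't allocate */
	if (allocated + alloc_bytes + (256ULL << 20) < total)
		return 0;

	/* less than 80% of the allocated chunks in use: don't allocate */
	if (allocated + alloc_bytes < total * 8 / 10)
		return 0;

	return 1;
}

int main(void)
{
	printf("%d\n", should_alloc_chunk(10ULL << 30, 9ULL << 30,
					  1ULL << 20, ALLOC_LIMITED));
	return 0;
}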
@@ -3321,9 +3391,10 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, space_info->full = 1; else ret = 1; - space_info->force_alloc = 0; + + space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; + space_info->chunk_alloc = 0; spin_unlock(&space_info->lock); -out: mutex_unlock(&extent_root->fs_info->chunk_mutex); return ret; } @@ -3339,21 +3410,24 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, u64 reserved; u64 max_reclaim; u64 reclaimed = 0; - int pause = 1; + long time_left; int nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT; + int loops = 0; + unsigned long progress; block_rsv = &root->fs_info->delalloc_block_rsv; space_info = block_rsv->space_info; smp_mb(); reserved = space_info->bytes_reserved; + progress = space_info->reservation_progress; if (reserved == 0) return 0; max_reclaim = min(reserved, to_reclaim); - while (1) { + while (loops < 1024) { /* have the flusher threads jump in and do some IO */ smp_mb(); nr_pages = min_t(unsigned long, nr_pages, @@ -3366,17 +3440,31 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, reserved = space_info->bytes_reserved; spin_unlock(&space_info->lock); + loops++; + if (reserved == 0 || reclaimed >= max_reclaim) break; if (trans && trans->transaction->blocked) return -EAGAIN; - __set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(pause); - pause <<= 1; - if (pause > HZ / 10) - pause = HZ / 10; + time_left = schedule_timeout_interruptible(1); + + /* We were interrupted, exit */ + if (time_left) + break; + + /* we've kicked the IO a few times, if anything has been freed, + * exit. There is no sense in looping here for a long time + * when we really need to commit the transaction, or there are + * just too many writers without enough free space + */ + + if (loops > 3) { + smp_mb(); + if (progress != space_info->reservation_progress) + break; + } } return reclaimed >= to_reclaim; @@ -3583,10 +3671,23 @@ void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv, if (num_bytes > 0) { if (dest) { - block_rsv_add_bytes(dest, num_bytes, 0); - } else { + spin_lock(&dest->lock); + if (!dest->full) { + u64 bytes_to_add; + + bytes_to_add = dest->size - dest->reserved; + bytes_to_add = min(num_bytes, bytes_to_add); + dest->reserved += bytes_to_add; + if (dest->reserved >= dest->size) + dest->full = 1; + num_bytes -= bytes_to_add; + } + spin_unlock(&dest->lock); + } + if (num_bytes) { spin_lock(&space_info->lock); space_info->bytes_reserved -= num_bytes; + space_info->reservation_progress++; spin_unlock(&space_info->lock); } } @@ -3721,11 +3822,6 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, return 0; } - WARN_ON(1); - printk(KERN_INFO"block_rsv size %llu reserved %llu freed %llu %llu\n", - block_rsv->size, block_rsv->reserved, - block_rsv->freed[0], block_rsv->freed[1]); - return -ENOSPC; } @@ -3824,6 +3920,7 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info) if (block_rsv->reserved >= block_rsv->size) { num_bytes = block_rsv->reserved - block_rsv->size; sinfo->bytes_reserved -= num_bytes; + sinfo->reservation_progress++; block_rsv->reserved = block_rsv->size; block_rsv->full = 1; } @@ -3968,6 +4065,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv; u64 to_reserve; int nr_extents; + int reserved_extents; int ret; if (btrfs_transaction_in_commit(root->fs_info)) @@ -3975,26 +4073,24 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) num_bytes = 
ALIGN(num_bytes, root->sectorsize); - spin_lock(&BTRFS_I(inode)->accounting_lock); nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents) + 1; - if (nr_extents > BTRFS_I(inode)->reserved_extents) { - nr_extents -= BTRFS_I(inode)->reserved_extents; + reserved_extents = atomic_read(&BTRFS_I(inode)->reserved_extents); + + if (nr_extents > reserved_extents) { + nr_extents -= reserved_extents; to_reserve = calc_trans_metadata_size(root, nr_extents); } else { nr_extents = 0; to_reserve = 0; } - spin_unlock(&BTRFS_I(inode)->accounting_lock); to_reserve += calc_csum_metadata_size(inode, num_bytes); ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1); if (ret) return ret; - spin_lock(&BTRFS_I(inode)->accounting_lock); - BTRFS_I(inode)->reserved_extents += nr_extents; + atomic_add(nr_extents, &BTRFS_I(inode)->reserved_extents); atomic_inc(&BTRFS_I(inode)->outstanding_extents); - spin_unlock(&BTRFS_I(inode)->accounting_lock); block_rsv_add_bytes(block_rsv, to_reserve, 1); @@ -4009,19 +4105,30 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) struct btrfs_root *root = BTRFS_I(inode)->root; u64 to_free; int nr_extents; + int reserved_extents; num_bytes = ALIGN(num_bytes, root->sectorsize); atomic_dec(&BTRFS_I(inode)->outstanding_extents); + WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents) < 0); - spin_lock(&BTRFS_I(inode)->accounting_lock); - nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents); - if (nr_extents < BTRFS_I(inode)->reserved_extents) { - nr_extents = BTRFS_I(inode)->reserved_extents - nr_extents; - BTRFS_I(inode)->reserved_extents -= nr_extents; - } else { - nr_extents = 0; - } - spin_unlock(&BTRFS_I(inode)->accounting_lock); + reserved_extents = atomic_read(&BTRFS_I(inode)->reserved_extents); + do { + int old, new; + + nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents); + if (nr_extents >= reserved_extents) { + nr_extents = 0; + break; + } + old = reserved_extents; + nr_extents = reserved_extents - nr_extents; + new = reserved_extents - nr_extents; + old = atomic_cmpxchg(&BTRFS_I(inode)->reserved_extents, + reserved_extents, new); + if (likely(old == reserved_extents)) + break; + reserved_extents = old; + } while (1); to_free = calc_csum_metadata_size(inode, num_bytes); if (nr_extents > 0) @@ -4112,6 +4219,7 @@ static int update_block_group(struct btrfs_trans_handle *trans, btrfs_set_block_group_used(&cache->item, old_val); cache->reserved -= num_bytes; cache->space_info->bytes_reserved -= num_bytes; + cache->space_info->reservation_progress++; cache->space_info->bytes_used += num_bytes; cache->space_info->disk_used += num_bytes * factor; spin_unlock(&cache->lock); @@ -4163,6 +4271,7 @@ static int pin_down_extent(struct btrfs_root *root, if (reserved) { cache->reserved -= num_bytes; cache->space_info->bytes_reserved -= num_bytes; + cache->space_info->reservation_progress++; } spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); @@ -4193,8 +4302,8 @@ int btrfs_pin_extent(struct btrfs_root *root, * update size of reserved extents. this function may return -EAGAIN * if 'reserve' is true or 'sinfo' is false. 
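The accounting_lock removal above relies on a classic compare-and-swap retry loop to shrink reserved_extents without a spinlock: read the current value, compute the new one, and retry if another task changed the counter in the meantime. A minimal C11 sketch of the same pattern (standalone types, not the btrfs inode fields):

#include <stdatomic.h>
#include <stdio.h>

static atomic_int reserved_extents = ATOMIC_VAR_INIT(8);

/* drop reservations down to 'outstanding', returning how many were freed */
static int release_reserved(int outstanding)
{
	int old = atomic_load(&reserved_extents);
	int to_free, new;

	do {
		if (outstanding >= old)
			return 0;		/* nothing extra is reserved */
		to_free = old - outstanding;
		new = old - to_free;
		/* on failure, 'old' is reloaded with the current value */
	} while (!atomic_compare_exchange_weak(&reserved_extents, &old, new));

	return to_free;
}

int main(void)
{
	printf("freed %d, left %d\n", release_reserved(3),
	       atomic_load(&reserved_extents));
	return 0;
}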
*/ -static int update_reserved_bytes(struct btrfs_block_group_cache *cache, - u64 num_bytes, int reserve, int sinfo) +int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, + u64 num_bytes, int reserve, int sinfo) { int ret = 0; if (sinfo) { @@ -4213,6 +4322,7 @@ static int update_reserved_bytes(struct btrfs_block_group_cache *cache, space_info->bytes_readonly += num_bytes; cache->reserved -= num_bytes; space_info->bytes_reserved -= num_bytes; + space_info->reservation_progress++; } spin_unlock(&cache->lock); spin_unlock(&space_info->lock); @@ -4332,7 +4442,9 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, if (ret) break; - ret = btrfs_discard_extent(root, start, end + 1 - start); + if (btrfs_test_opt(root, DISCARD)) + ret = btrfs_discard_extent(root, start, + end + 1 - start, NULL); clear_extent_dirty(unpin, start, end, GFP_NOFS); unpin_extent_range(root, start, end); @@ -4673,10 +4785,10 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); btrfs_add_free_space(cache, buf->start, buf->len); - ret = update_reserved_bytes(cache, buf->len, 0, 0); + ret = btrfs_update_reserved_bytes(cache, buf->len, 0, 0); if (ret == -EAGAIN) { /* block group became read-only */ - update_reserved_bytes(cache, buf->len, 0, 1); + btrfs_update_reserved_bytes(cache, buf->len, 0, 1); goto out; } @@ -4691,6 +4803,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, if (ret) { spin_lock(&cache->space_info->lock); cache->space_info->bytes_reserved -= buf->len; + cache->space_info->reservation_progress++; spin_unlock(&cache->space_info->lock); } goto out; @@ -4712,6 +4825,11 @@ pin: } } out: + /* + * Deleting the buffer, clear the corrupt flag since it doesn't matter + * anymore. 
+ */ + clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags); btrfs_put_block_group(cache); } @@ -5159,7 +5277,7 @@ checks: search_start - offset); BUG_ON(offset > search_start); - ret = update_reserved_bytes(block_group, num_bytes, 1, + ret = btrfs_update_reserved_bytes(block_group, num_bytes, 1, (data & BTRFS_BLOCK_GROUP_DATA)); if (ret == -EAGAIN) { btrfs_add_free_space(block_group, offset, num_bytes); @@ -5250,11 +5368,13 @@ loop: if (allowed_chunk_alloc) { ret = do_chunk_alloc(trans, root, num_bytes + - 2 * 1024 * 1024, data, 1); + 2 * 1024 * 1024, data, + CHUNK_ALLOC_LIMITED); allowed_chunk_alloc = 0; done_chunk_alloc = 1; - } else if (!done_chunk_alloc) { - space_info->force_alloc = 1; + } else if (!done_chunk_alloc && + space_info->force_alloc == CHUNK_ALLOC_NO_FORCE) { + space_info->force_alloc = CHUNK_ALLOC_LIMITED; } if (loop < LOOP_NO_EMPTY_SIZE) { @@ -5340,7 +5460,8 @@ again: */ if (empty_size || root->ref_cows) ret = do_chunk_alloc(trans, root->fs_info->extent_root, - num_bytes + 2 * 1024 * 1024, data, 0); + num_bytes + 2 * 1024 * 1024, data, + CHUNK_ALLOC_NO_FORCE); WARN_ON(num_bytes < root->sectorsize); ret = find_free_extent(trans, root, num_bytes, empty_size, @@ -5352,10 +5473,10 @@ again: num_bytes = num_bytes & ~(root->sectorsize - 1); num_bytes = max(num_bytes, min_alloc_size); do_chunk_alloc(trans, root->fs_info->extent_root, - num_bytes, data, 1); + num_bytes, data, CHUNK_ALLOC_FORCE); goto again; } - if (ret == -ENOSPC) { + if (ret == -ENOSPC && btrfs_test_opt(root, ENOSPC_DEBUG)) { struct btrfs_space_info *sinfo; sinfo = __find_space_info(root->fs_info, data); @@ -5365,6 +5486,8 @@ again: dump_space_info(sinfo, num_bytes, 1); } + trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset); + return ret; } @@ -5380,12 +5503,15 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len) return -ENOSPC; } - ret = btrfs_discard_extent(root, start, len); + if (btrfs_test_opt(root, DISCARD)) + ret = btrfs_discard_extent(root, start, len, NULL); btrfs_add_free_space(cache, start, len); - update_reserved_bytes(cache, len, 0, 1); + btrfs_update_reserved_bytes(cache, len, 0, 1); btrfs_put_block_group(cache); + trace_btrfs_reserved_extent_free(root, start, len); + return ret; } @@ -5412,7 +5538,8 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type); path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) + return -ENOMEM; path->leave_spinning = 1; ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, @@ -5582,7 +5709,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, put_caching_control(caching_ctl); } - ret = update_reserved_bytes(block_group, ins->offset, 1, 1); + ret = btrfs_update_reserved_bytes(block_group, ins->offset, 1, 1); BUG_ON(ret); btrfs_put_block_group(block_group); ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, @@ -5633,6 +5760,7 @@ use_block_rsv(struct btrfs_trans_handle *trans, struct btrfs_root *root, u32 blocksize) { struct btrfs_block_rsv *block_rsv; + struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; int ret; block_rsv = get_block_rsv(trans, root); @@ -5640,14 +5768,39 @@ use_block_rsv(struct btrfs_trans_handle *trans, if (block_rsv->size == 0) { ret = reserve_metadata_bytes(trans, root, block_rsv, blocksize, 0); - if (ret) + /* + * If we couldn't reserve metadata bytes try and use some from + * the global reserve. 
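use_block_rsv() now degrades gracefully: if the transaction's own reserve cannot cover a tree block, it tries the shared global reserve before giving up with -ENOSPC. A simplified sketch of that fallback order (plain structs standing in for btrfs_block_rsv, with locking omitted):

#include <stdint.h>
#include <stdio.h>

struct rsv {
	uint64_t size;
	uint64_t reserved;
};

/* take 'bytes' out of a reserve; 0 on success, -1 if it cannot cover it */
static int rsv_use_bytes(struct rsv *r, uint64_t bytes)
{
	if (r->reserved < bytes)
		return -1;
	r->reserved -= bytes;
	return 0;
}

/* prefer the local reserve, fall back to the global one */
static struct rsv *pick_rsv(struct rsv *local, struct rsv *global,
			    uint64_t blocksize)
{
	if (!rsv_use_bytes(local, blocksize))
		return local;
	if (local != global && !rsv_use_bytes(global, blocksize))
		return global;
	return NULL;				/* -ENOSPC in the real code */
}

int main(void)
{
	struct rsv local = { .size = 4096, .reserved = 0 };
	struct rsv global = { .size = 1 << 20, .reserved = 1 << 20 };

	printf("%s\n", pick_rsv(&local, &global, 4096) == &global ?
	       "fell back to global reserve" : "used local reserve");
	return 0;
}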
+ */ + if (ret && block_rsv != global_rsv) { + ret = block_rsv_use_bytes(global_rsv, blocksize); + if (!ret) + return global_rsv; + return ERR_PTR(ret); + } else if (ret) { return ERR_PTR(ret); + } return block_rsv; } ret = block_rsv_use_bytes(block_rsv, blocksize); if (!ret) return block_rsv; + if (ret) { + WARN_ON(1); + ret = reserve_metadata_bytes(trans, root, block_rsv, blocksize, + 0); + if (!ret) { + spin_lock(&block_rsv->lock); + block_rsv->size += blocksize; + spin_unlock(&block_rsv->lock); + return block_rsv; + } else if (ret && block_rsv != global_rsv) { + ret = block_rsv_use_bytes(global_rsv, blocksize); + if (!ret) + return global_rsv; + } + } return ERR_PTR(-ENOSPC); } @@ -5989,6 +6142,8 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, if (reada && level == 1) reada_walk_down(trans, root, wc, path); next = read_tree_block(root, bytenr, blocksize, generation); + if (!next) + return -EIO; btrfs_tree_lock(next); btrfs_set_lock_blocking(next); } @@ -6221,6 +6376,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root, BUG_ON(!wc); trans = btrfs_start_transaction(tree_root, 0); + BUG_ON(IS_ERR(trans)); + if (block_rsv) trans->block_rsv = block_rsv; @@ -6318,6 +6475,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, btrfs_end_transaction_throttle(trans, tree_root); trans = btrfs_start_transaction(tree_root, 0); + BUG_ON(IS_ERR(trans)); if (block_rsv) trans->block_rsv = block_rsv; } @@ -6377,10 +6535,14 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) + return -ENOMEM; wc = kzalloc(sizeof(*wc), GFP_NOFS); - BUG_ON(!wc); + if (!wc) { + btrfs_free_path(path); + return -ENOMEM; + } btrfs_assert_tree_locked(parent); parent_level = btrfs_header_level(parent); @@ -6446,6 +6608,8 @@ static noinline int relocate_inode_pages(struct inode *inode, u64 start, int ret = 0; ra = kzalloc(sizeof(*ra), GFP_NOFS); + if (!ra) + return -ENOMEM; mutex_lock(&inode->i_mutex); first_index = start >> PAGE_CACHE_SHIFT; @@ -6531,7 +6695,7 @@ static noinline int relocate_data_extent(struct inode *reloc_inode, u64 end = start + extent_key->offset - 1; em = alloc_extent_map(GFP_NOFS); - BUG_ON(!em || IS_ERR(em)); + BUG_ON(!em); em->start = start; em->len = extent_key->offset; @@ -6836,7 +7000,11 @@ static noinline int get_new_locations(struct inode *reloc_inode, } path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) { + if (exts != *extents) + kfree(exts); + return -ENOMEM; + } cur_pos = extent_key->objectid - offset; last_byte = extent_key->objectid + extent_key->offset; @@ -6878,6 +7046,10 @@ static noinline int get_new_locations(struct inode *reloc_inode, struct disk_extent *old = exts; max *= 2; exts = kzalloc(sizeof(*exts) * max, GFP_NOFS); + if (!exts) { + ret = -ENOMEM; + goto out; + } memcpy(exts, old, sizeof(*exts) * nr); if (old != *extents) kfree(old); @@ -7360,7 +7532,8 @@ static noinline int replace_extents_in_leaf(struct btrfs_trans_handle *trans, int ret; new_extent = kmalloc(sizeof(*new_extent), GFP_NOFS); - BUG_ON(!new_extent); + if (!new_extent) + return -ENOMEM; ref = btrfs_lookup_leaf_ref(root, leaf->start); BUG_ON(!ref); @@ -7477,7 +7650,7 @@ int btrfs_drop_dead_reloc_roots(struct btrfs_root *root) BUG_ON(reloc_root->commit_root != NULL); while (1) { trans = btrfs_join_transaction(root, 1); - BUG_ON(!trans); + BUG_ON(IS_ERR(trans)); mutex_lock(&root->fs_info->drop_mutex); ret = btrfs_drop_snapshot(trans, reloc_root); @@ -7535,7 +7708,7 @@ 
int btrfs_cleanup_reloc_trees(struct btrfs_root *root) if (found) { trans = btrfs_start_transaction(root, 1); - BUG_ON(!trans); + BUG_ON(IS_ERR(trans)); ret = btrfs_commit_transaction(trans, root); BUG_ON(ret); } @@ -7546,7 +7719,8 @@ int btrfs_cleanup_reloc_trees(struct btrfs_root *root) reloc_root = btrfs_read_fs_root_no_name(root->fs_info, &location); BUG_ON(!reloc_root); - btrfs_orphan_cleanup(reloc_root); + ret = btrfs_orphan_cleanup(reloc_root); + BUG_ON(ret); return 0; } @@ -7564,7 +7738,8 @@ static noinline int init_reloc_tree(struct btrfs_trans_handle *trans, return 0; root_item = kmalloc(sizeof(*root_item), GFP_NOFS); - BUG_ON(!root_item); + if (!root_item) + return -ENOMEM; ret = btrfs_copy_root(trans, root, root->commit_root, &eb, BTRFS_TREE_RELOC_OBJECTID); @@ -7590,7 +7765,7 @@ static noinline int init_reloc_tree(struct btrfs_trans_handle *trans, reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root, &root_key); - BUG_ON(!reloc_root); + BUG_ON(IS_ERR(reloc_root)); reloc_root->last_trans = trans->transid; reloc_root->commit_root = NULL; reloc_root->ref_tree = &root->fs_info->reloc_ref_tree; @@ -7779,7 +7954,7 @@ static noinline int relocate_one_extent(struct btrfs_root *extent_root, trans = btrfs_start_transaction(extent_root, 1); - BUG_ON(!trans); + BUG_ON(IS_ERR(trans)); if (extent_key->objectid == 0) { ret = del_extent_zero(trans, extent_root, path, extent_key); @@ -7843,6 +8018,10 @@ static noinline int relocate_one_extent(struct btrfs_root *extent_root, eb = read_tree_block(found_root, block_start, block_size, 0); + if (!eb) { + ret = -EIO; + goto out; + } btrfs_tree_lock(eb); BUG_ON(level != btrfs_header_level(eb)); @@ -7970,13 +8149,14 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache) if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned + sinfo->bytes_may_use + sinfo->bytes_readonly + - cache->reserved_pinned + num_bytes < sinfo->total_bytes) { + cache->reserved_pinned + num_bytes <= sinfo->total_bytes) { sinfo->bytes_readonly += num_bytes; sinfo->bytes_reserved += cache->reserved_pinned; cache->reserved_pinned = 0; cache->ro = 1; ret = 0; } + spin_unlock(&cache->lock); spin_unlock(&sinfo->lock); return ret; @@ -7997,13 +8177,15 @@ int btrfs_set_block_group_ro(struct btrfs_root *root, alloc_flags = update_block_group_flags(root, cache->flags); if (alloc_flags != cache->flags) - do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1); + do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, + CHUNK_ALLOC_FORCE); ret = set_block_group_ro(cache); if (!ret) goto out; alloc_flags = get_alloc_profile(root, cache->space_info->flags); - ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1); + ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, + CHUNK_ALLOC_FORCE); if (ret < 0) goto out; ret = set_block_group_ro(cache); @@ -8012,6 +8194,70 @@ out: return ret; } +int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 type) +{ + u64 alloc_flags = get_alloc_profile(root, type); + return do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, + CHUNK_ALLOC_FORCE); +} + +/* + * helper to account the unused space of all the readonly block group in the + * list. takes mirrors into account. 
+ */ +static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list) +{ + struct btrfs_block_group_cache *block_group; + u64 free_bytes = 0; + int factor; + + list_for_each_entry(block_group, groups_list, list) { + spin_lock(&block_group->lock); + + if (!block_group->ro) { + spin_unlock(&block_group->lock); + continue; + } + + if (block_group->flags & (BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10 | + BTRFS_BLOCK_GROUP_DUP)) + factor = 2; + else + factor = 1; + + free_bytes += (block_group->key.offset - + btrfs_block_group_used(&block_group->item)) * + factor; + + spin_unlock(&block_group->lock); + } + + return free_bytes; +} + +/* + * helper to account the unused space of all the readonly block group in the + * space_info. takes mirrors into account. + */ +u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) +{ + int i; + u64 free_bytes = 0; + + spin_lock(&sinfo->lock); + + for(i = 0; i < BTRFS_NR_RAID_TYPES; i++) + if (!list_empty(&sinfo->block_groups[i])) + free_bytes += __btrfs_get_ro_block_group_free_space( + &sinfo->block_groups[i]); + + spin_unlock(&sinfo->lock); + + return free_bytes; +} + int btrfs_set_block_group_rw(struct btrfs_root *root, struct btrfs_block_group_cache *cache) { @@ -8092,7 +8338,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) mutex_lock(&root->fs_info->chunk_mutex); list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { u64 min_free = btrfs_block_group_used(&block_group->item); - u64 dev_offset, max_avail; + u64 dev_offset; /* * check to make sure we can actually find a chunk with enough @@ -8100,7 +8346,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) */ if (device->total_bytes > device->bytes_used + min_free) { ret = find_free_dev_extent(NULL, device, min_free, - &dev_offset, &max_avail); + &dev_offset, NULL); if (!ret) break; ret = -1; @@ -8213,6 +8459,13 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) if (block_group->cached == BTRFS_CACHE_STARTED) wait_block_group_cache_done(block_group); + /* + * We haven't cached this block group, which means we could + * possibly have excluded extents on this block group. + */ + if (block_group->cached == BTRFS_CACHE_NO) + free_excluded_extents(info->extent_root, block_group); + btrfs_remove_free_space_cache(block_group); btrfs_put_block_group(block_group); @@ -8328,6 +8581,13 @@ int btrfs_read_block_groups(struct btrfs_root *root) cache->sectorsize = root->sectorsize; /* + * We need to exclude the super stripes now so that the space + * info has super bytes accounted for, otherwise we'll think + * we have more space than we actually do. + */ + exclude_super_stripes(root, cache); + + /* * check for two cases, either we are full, and therefore * don't need to bother with the caching work since we won't * find any space, or we are empty, and we can just add all @@ -8335,12 +8595,10 @@ int btrfs_read_block_groups(struct btrfs_root *root) * time, particularly in the full case. 
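The new read-only accounting helper scales the unused space of each read-only block group by the number of on-disk copies, so mirrored profiles (RAID1, RAID10, DUP) are reported as twice the raw bytes. A small sketch of that accumulation (placeholder flag bits and struct, not the btrfs block group cache):

#include <stdint.h>
#include <stdio.h>

#define BG_RAID1  (1u << 0)		/* placeholder profile flag bits */
#define BG_RAID10 (1u << 1)
#define BG_DUP    (1u << 2)

struct block_group {
	uint64_t size;			/* logical size of the group */
	uint64_t used;			/* logical bytes in use */
	unsigned int flags;
	int ro;
};

/* raw free bytes held by read-only groups, counting each mirror copy */
static uint64_t ro_free_space(const struct block_group *bg, int nr)
{
	uint64_t free_bytes = 0;
	int factor, i;

	for (i = 0; i < nr; i++) {
		if (!bg[i].ro)
			continue;
		factor = (bg[i].flags & (BG_RAID1 | BG_RAID10 | BG_DUP)) ? 2 : 1;
		free_bytes += (bg[i].size - bg[i].used) * factor;
	}
	return free_bytes;
}

int main(void)
{
	struct block_group groups[] = {
		{ .size = 1 << 30, .used = 1 << 29, .flags = BG_RAID1, .ro = 1 },
		{ .size = 1 << 30, .used = 0,       .flags = 0,        .ro = 0 },
	};
	printf("%llu bytes free in RO groups\n",
	       (unsigned long long)ro_free_space(groups, 2));
	return 0;
}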
*/ if (found_key.offset == btrfs_block_group_used(&cache->item)) { - exclude_super_stripes(root, cache); cache->last_byte_to_unpin = (u64)-1; cache->cached = BTRFS_CACHE_FINISHED; free_excluded_extents(root, cache); } else if (btrfs_block_group_used(&cache->item) == 0) { - exclude_super_stripes(root, cache); cache->last_byte_to_unpin = (u64)-1; cache->cached = BTRFS_CACHE_FINISHED; add_new_free_space(cache, root->fs_info, @@ -8482,6 +8740,12 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, BUG_ON(!block_group); BUG_ON(!block_group->ro); + /* + * Free the reserved super bytes from this block group before + * remove it. + */ + free_excluded_extents(root, block_group); + memcpy(&key, &block_group->key, sizeof(key)); if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | @@ -8584,3 +8848,85 @@ out: btrfs_free_path(path); return ret; } + +int btrfs_init_space_info(struct btrfs_fs_info *fs_info) +{ + struct btrfs_space_info *space_info; + int ret; + + ret = update_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM, 0, 0, + &space_info); + if (ret) + return ret; + + ret = update_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA, 0, 0, + &space_info); + if (ret) + return ret; + + ret = update_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA, 0, 0, + &space_info); + if (ret) + return ret; + + return ret; +} + +int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) +{ + return unpin_extent_range(root, start, end); +} + +int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr, + u64 num_bytes, u64 *actual_bytes) +{ + return btrfs_discard_extent(root, bytenr, num_bytes, actual_bytes); +} + +int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_block_group_cache *cache = NULL; + u64 group_trimmed; + u64 start; + u64 end; + u64 trimmed = 0; + int ret = 0; + + cache = btrfs_lookup_block_group(fs_info, range->start); + + while (cache) { + if (cache->key.objectid >= (range->start + range->len)) { + btrfs_put_block_group(cache); + break; + } + + start = max(range->start, cache->key.objectid); + end = min(range->start + range->len, + cache->key.objectid + cache->key.offset); + + if (end - start >= range->minlen) { + if (!block_group_cache_done(cache)) { + ret = cache_block_group(cache, NULL, root, 0); + if (!ret) + wait_block_group_cache_done(cache); + } + ret = btrfs_trim_block_group(cache, + &group_trimmed, + start, + end, + range->minlen); + + trimmed += group_trimmed; + if (ret) { + btrfs_put_block_group(cache); + break; + } + } + + cache = next_block_group(fs_info->tree_root, cache); + } + + range->len = trimmed; + return ret; +} diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 3e86b9f36507..315138605088 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -690,6 +690,15 @@ static void cache_state(struct extent_state *state, } } +static void uncache_state(struct extent_state **cached_ptr) +{ + if (cached_ptr && (*cached_ptr)) { + struct extent_state *state = *cached_ptr; + *cached_ptr = NULL; + free_extent_state(state); + } +} + /* * set some bits on a range in the tree. This may require allocations or * sleeping, so the gfp mask is used to indicate what is allowed. 
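btrfs_trim_fs() above walks the block groups that intersect the user-supplied fstrim range and clamps the trim to each group's boundaries, only issuing a trim when the clamped span is at least range->minlen. The clamping itself is just a max/min pair, sketched here outside of any btrfs types:

#include <stdint.h>
#include <stdio.h>

/*
 * clamp [range_start, range_start + range_len) to one block group and
 * report whether the overlap is worth trimming.
 */
static int clamp_trim(uint64_t range_start, uint64_t range_len, uint64_t minlen,
		      uint64_t group_start, uint64_t group_len,
		      uint64_t *start, uint64_t *end)
{
	uint64_t s = range_start > group_start ? range_start : group_start;
	uint64_t e = range_start + range_len < group_start + group_len ?
		     range_start + range_len : group_start + group_len;

	if (e <= s || e - s < minlen)
		return 0;		/* nothing (or too little) to trim here */
	*start = s;
	*end = e;
	return 1;
}

int main(void)
{
	uint64_t s, e;

	if (clamp_trim(0, 1 << 20, 4096, 4096, 1 << 16, &s, &e))
		printf("trim [%llu, %llu)\n",
		       (unsigned long long)s, (unsigned long long)e);
	return 0;
}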
@@ -940,10 +949,10 @@ static int clear_extent_new(struct extent_io_tree *tree, u64 start, u64 end, } int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, - gfp_t mask) + struct extent_state **cached_state, gfp_t mask) { - return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL, - NULL, mask); + return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, + NULL, cached_state, mask); } static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, @@ -1012,8 +1021,7 @@ int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end, mask); } -int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, - gfp_t mask) +int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) { return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL, mask); @@ -1433,12 +1441,13 @@ int extent_clear_unlock_delalloc(struct inode *inode, */ u64 count_range_bits(struct extent_io_tree *tree, u64 *start, u64 search_end, u64 max_bytes, - unsigned long bits) + unsigned long bits, int contig) { struct rb_node *node; struct extent_state *state; u64 cur_start = *start; u64 total_bytes = 0; + u64 last = 0; int found = 0; if (search_end <= cur_start) { @@ -1463,7 +1472,9 @@ u64 count_range_bits(struct extent_io_tree *tree, state = rb_entry(node, struct extent_state, rb_node); if (state->start > search_end) break; - if (state->end >= cur_start && (state->state & bits)) { + if (contig && found && state->start > last + 1) + break; + if (state->end >= cur_start && (state->state & bits) == bits) { total_bytes += min(search_end, state->end) + 1 - max(cur_start, state->start); if (total_bytes >= max_bytes) @@ -1472,6 +1483,9 @@ u64 count_range_bits(struct extent_io_tree *tree, *start = state->start; found = 1; } + last = state->end; + } else if (contig && found) { + break; } node = rb_next(node); if (!node) @@ -1729,6 +1743,9 @@ static void end_bio_extent_readpage(struct bio *bio, int err) do { struct page *page = bvec->bv_page; + struct extent_state *cached = NULL; + struct extent_state *state; + tree = &BTRFS_I(page->mapping->host)->io_tree; start = ((u64)page->index << PAGE_CACHE_SHIFT) + @@ -1743,9 +1760,20 @@ static void end_bio_extent_readpage(struct bio *bio, int err) if (++bvec <= bvec_end) prefetchw(&bvec->bv_page->flags); + spin_lock(&tree->lock); + state = find_first_extent_bit_state(tree, start, EXTENT_LOCKED); + if (state && state->start == start) { + /* + * take a reference on the state, unlock will drop + * the ref + */ + cache_state(state, &cached); + } + spin_unlock(&tree->lock); + if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) { ret = tree->ops->readpage_end_io_hook(page, start, end, - NULL); + state); if (ret) uptodate = 0; } @@ -1758,15 +1786,16 @@ static void end_bio_extent_readpage(struct bio *bio, int err) test_bit(BIO_UPTODATE, &bio->bi_flags); if (err) uptodate = 0; + uncache_state(&cached); continue; } } if (uptodate) { - set_extent_uptodate(tree, start, end, + set_extent_uptodate(tree, start, end, &cached, GFP_ATOMIC); } - unlock_extent(tree, start, end, GFP_ATOMIC); + unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC); if (whole_page) { if (uptodate) { @@ -1805,6 +1834,7 @@ static void end_bio_extent_preparewrite(struct bio *bio, int err) do { struct page *page = bvec->bv_page; + struct extent_state *cached = NULL; tree = &BTRFS_I(page->mapping->host)->io_tree; start = ((u64)page->index << PAGE_CACHE_SHIFT) + @@ -1815,13 +1845,14 @@ static void end_bio_extent_preparewrite(struct bio *bio, int 
err) prefetchw(&bvec->bv_page->flags); if (uptodate) { - set_extent_uptodate(tree, start, end, GFP_ATOMIC); + set_extent_uptodate(tree, start, end, &cached, + GFP_ATOMIC); } else { ClearPageUptodate(page); SetPageError(page); } - unlock_extent(tree, start, end, GFP_ATOMIC); + unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC); } while (bvec >= bio->bi_io_vec); @@ -1865,7 +1896,7 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num, bio_get(bio); if (tree->ops && tree->ops->submit_bio_hook) - tree->ops->submit_bio_hook(page->mapping->host, rw, bio, + ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio, mirror_num, bio_flags, start); else submit_bio(rw, bio); @@ -1920,6 +1951,8 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree, nr = bio_get_nr_vecs(bdev); bio = btrfs_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH); + if (!bio) + return -ENOMEM; bio_add_page(bio, page, page_size, offset); bio->bi_end_io = end_io_func; @@ -1944,6 +1977,7 @@ void set_page_extent_mapped(struct page *page) static void set_page_extent_head(struct page *page, unsigned long len) { + WARN_ON(!PagePrivate(page)); set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2); } @@ -2007,14 +2041,17 @@ static int __extent_read_full_page(struct extent_io_tree *tree, while (cur <= end) { if (cur >= last_byte) { char *userpage; + struct extent_state *cached = NULL; + iosize = PAGE_CACHE_SIZE - page_offset; userpage = kmap_atomic(page, KM_USER0); memset(userpage + page_offset, 0, iosize); flush_dcache_page(page); kunmap_atomic(userpage, KM_USER0); set_extent_uptodate(tree, cur, cur + iosize - 1, - GFP_NOFS); - unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); + &cached, GFP_NOFS); + unlock_extent_cached(tree, cur, cur + iosize - 1, + &cached, GFP_NOFS); break; } em = get_extent(inode, page, page_offset, cur, @@ -2028,8 +2065,11 @@ static int __extent_read_full_page(struct extent_io_tree *tree, BUG_ON(extent_map_end(em) <= cur); BUG_ON(end < cur); - if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { this_bio_flag = EXTENT_BIO_COMPRESSED; + extent_set_compress_type(&this_bio_flag, + em->compress_type); + } iosize = min(extent_map_end(em) - cur, end - cur + 1); cur_end = min(extent_map_end(em) - 1, end); @@ -2051,14 +2091,17 @@ static int __extent_read_full_page(struct extent_io_tree *tree, /* we've found a hole, just zero and go on */ if (block_start == EXTENT_MAP_HOLE) { char *userpage; + struct extent_state *cached = NULL; + userpage = kmap_atomic(page, KM_USER0); memset(userpage + page_offset, 0, iosize); flush_dcache_page(page); kunmap_atomic(userpage, KM_USER0); set_extent_uptodate(tree, cur, cur + iosize - 1, - GFP_NOFS); - unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); + &cached, GFP_NOFS); + unlock_extent_cached(tree, cur, cur + iosize - 1, + &cached, GFP_NOFS); cur = cur + iosize; page_offset += iosize; continue; @@ -2123,7 +2166,7 @@ int extent_read_full_page(struct extent_io_tree *tree, struct page *page, ret = __extent_read_full_page(tree, page, get_extent, &bio, 0, &bio_flags); if (bio) - submit_one_bio(READ, bio, 0, bio_flags); + ret = submit_one_bio(READ, bio, 0, bio_flags); return ret; } @@ -2176,10 +2219,12 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, unsigned long nr_written = 0; if (wbc->sync_mode == WB_SYNC_ALL) - write_flags = WRITE_SYNC_PLUG; + write_flags = WRITE_SYNC; else write_flags = WRITE; + trace___extent_writepage(page, inode, wbc); + 
WARN_ON(!PageLocked(page)); pg_offset = i_size & (PAGE_CACHE_SIZE - 1); if (page->index > end_index || @@ -2775,9 +2820,12 @@ int extent_prepare_write(struct extent_io_tree *tree, iocount++; block_start = block_start + iosize; } else { - set_extent_uptodate(tree, block_start, cur_end, + struct extent_state *cached = NULL; + + set_extent_uptodate(tree, block_start, cur_end, &cached, GFP_NOFS); - unlock_extent(tree, block_start, cur_end, GFP_NOFS); + unlock_extent_cached(tree, block_start, cur_end, + &cached, GFP_NOFS); block_start = cur_end + 1; } page_offset = block_start & (PAGE_CACHE_SIZE - 1); @@ -2816,9 +2864,17 @@ int try_release_extent_state(struct extent_map_tree *map, * at this point we can safely clear everything except the * locked bit and the nodatasum bit */ - clear_extent_bit(tree, start, end, + ret = clear_extent_bit(tree, start, end, ~(EXTENT_LOCKED | EXTENT_NODATASUM), 0, 0, NULL, mask); + + /* if clear_extent_bit failed for enomem reasons, + * we can't allow the release to continue. + */ + if (ret < 0) + ret = 0; + else + ret = 1; } return ret; } @@ -2898,6 +2954,46 @@ out: return sector; } +/* + * helper function for fiemap, which doesn't want to see any holes. + * This maps until we find something past 'last' + */ +static struct extent_map *get_extent_skip_holes(struct inode *inode, + u64 offset, + u64 last, + get_extent_t *get_extent) +{ + u64 sectorsize = BTRFS_I(inode)->root->sectorsize; + struct extent_map *em; + u64 len; + + if (offset >= last) + return NULL; + + while(1) { + len = last - offset; + if (len == 0) + break; + len = (len + sectorsize - 1) & ~(sectorsize - 1); + em = get_extent(inode, NULL, 0, offset, len, 0); + if (!em || IS_ERR(em)) + return em; + + /* if this isn't a hole return it */ + if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags) && + em->block_start != EXTENT_MAP_HOLE) { + return em; + } + + /* this is a hole, advance to the next extent */ + offset = extent_map_end(em); + free_extent_map(em); + if (offset >= last) + break; + } + return NULL; +} + int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len, get_extent_t *get_extent) { @@ -2907,16 +3003,19 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u32 flags = 0; u32 found_type; u64 last; + u64 last_for_get_extent = 0; u64 disko = 0; + u64 isize = i_size_read(inode); struct btrfs_key found_key; struct extent_map *em = NULL; struct extent_state *cached_state = NULL; struct btrfs_path *path; struct btrfs_file_extent_item *item; int end = 0; - u64 em_start = 0, em_len = 0; + u64 em_start = 0; + u64 em_len = 0; + u64 em_end = 0; unsigned long emflags; - int hole = 0; if (len == 0) return -EINVAL; @@ -2926,6 +3025,10 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, return -ENOMEM; path->leave_spinning = 1; + /* + * lookup the last file extent. 
We're not using i_size here + * because there might be preallocation past i_size + */ ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root, path, inode->i_ino, -1, 0); if (ret < 0) { @@ -2939,18 +3042,38 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); found_type = btrfs_key_type(&found_key); - /* No extents, just return */ + /* No extents, but there might be delalloc bits */ if (found_key.objectid != inode->i_ino || found_type != BTRFS_EXTENT_DATA_KEY) { - btrfs_free_path(path); - return 0; + /* have to trust i_size as the end */ + last = (u64)-1; + last_for_get_extent = isize; + } else { + /* + * remember the start of the last extent. There are a + * bunch of different factors that go into the length of the + * extent, so its much less complex to remember where it started + */ + last = found_key.offset; + last_for_get_extent = last + 1; } - last = found_key.offset; btrfs_free_path(path); + /* + * we might have some extents allocated but more delalloc past those + * extents. so, we trust isize unless the start of the last extent is + * beyond isize + */ + if (last < isize) { + last = (u64)-1; + last_for_get_extent = isize; + } + lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0, &cached_state, GFP_NOFS); - em = get_extent(inode, NULL, 0, off, max - off, 0); + + em = get_extent_skip_holes(inode, off, last_for_get_extent, + get_extent); if (!em) goto out; if (IS_ERR(em)) { @@ -2959,22 +3082,38 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, } while (!end) { - hole = 0; - off = em->start + em->len; - if (off >= max) - end = 1; + u64 offset_in_extent; - if (em->block_start == EXTENT_MAP_HOLE) { - hole = 1; - goto next; - } + /* break if the extent we found is outside the range */ + if (em->start >= max || extent_map_end(em) < off) + break; - em_start = em->start; - em_len = em->len; + /* + * get_extent may return an extent that starts before our + * requested range. We have to make sure the ranges + * we return to fiemap always move forward and don't + * overlap, so adjust the offsets here + */ + em_start = max(em->start, off); + /* + * record the offset from the start of the extent + * for adjusting the disk offset below + */ + offset_in_extent = em_start - em->start; + em_end = extent_map_end(em); + em_len = em_end - em_start; + emflags = em->flags; disko = 0; flags = 0; + /* + * bump off for our next call to get_extent + */ + off = extent_map_end(em); + if (off >= max) + end = 1; + if (em->block_start == EXTENT_MAP_LAST_BYTE) { end = 1; flags |= FIEMAP_EXTENT_LAST; @@ -2985,42 +3124,34 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, flags |= (FIEMAP_EXTENT_DELALLOC | FIEMAP_EXTENT_UNKNOWN); } else { - disko = em->block_start; + disko = em->block_start + offset_in_extent; } if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) flags |= FIEMAP_EXTENT_ENCODED; -next: - emflags = em->flags; free_extent_map(em); em = NULL; - if (!end) { - em = get_extent(inode, NULL, 0, off, max - off, 0); - if (!em) - goto out; - if (IS_ERR(em)) { - ret = PTR_ERR(em); - goto out; - } - emflags = em->flags; - } - - if (test_bit(EXTENT_FLAG_VACANCY, &emflags)) { + if ((em_start >= last) || em_len == (u64)-1 || + (last == (u64)-1 && isize <= em_end)) { flags |= FIEMAP_EXTENT_LAST; end = 1; } - if (em_start == last) { + /* now scan forward to see if this is really the last extent. 
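The fiemap rework keeps reported extents moving strictly forward: each mapping is clipped so it starts no earlier than the current offset, and the on-disk address is shifted by the same amount so logical and physical offsets stay in sync. A small sketch of that adjustment (plain integers instead of extent_map):

#include <stdint.h>
#include <stdio.h>

struct mapping {
	uint64_t start;		/* logical start of the extent */
	uint64_t len;
	uint64_t block_start;	/* physical start on disk */
};

/* clip an extent so it begins no earlier than 'off' */
static void clip_for_fiemap(const struct mapping *em, uint64_t off,
			    uint64_t *em_start, uint64_t *em_len, uint64_t *disko)
{
	uint64_t logical = em->start > off ? em->start : off;
	uint64_t offset_in_extent = logical - em->start;

	*em_start = logical;
	*em_len = em->start + em->len - logical;
	*disko = em->block_start + offset_in_extent;
}

int main(void)
{
	struct mapping em = { .start = 0, .len = 8192, .block_start = 1 << 20 };
	uint64_t start, len, disko;

	clip_for_fiemap(&em, 4096, &start, &len, &disko);
	printf("logical %llu len %llu physical %llu\n",
	       (unsigned long long)start, (unsigned long long)len,
	       (unsigned long long)disko);
	return 0;
}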
*/ + em = get_extent_skip_holes(inode, off, last_for_get_extent, + get_extent); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto out; + } + if (!em) { flags |= FIEMAP_EXTENT_LAST; end = 1; } - - if (!hole) { - ret = fiemap_fill_next_extent(fieinfo, em_start, disko, - em_len, flags); - if (ret) - goto out_free; - } + ret = fiemap_fill_next_extent(fieinfo, em_start, disko, + em_len, flags); + if (ret) + goto out_free; } out_free: free_extent_map(em); @@ -3072,6 +3203,8 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, #endif eb = kmem_cache_zalloc(extent_buffer_cache, mask); + if (eb == NULL) + return NULL; eb->start = start; eb->len = len; spin_lock_init(&eb->lock); @@ -3187,7 +3320,13 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, } if (!PageUptodate(p)) uptodate = 0; - unlock_page(p); + + /* + * see below about how we avoid a nasty race with release page + * and why we unlock later + */ + if (i != 0) + unlock_page(p); } if (uptodate) set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); @@ -3211,9 +3350,26 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, atomic_inc(&eb->refs); spin_unlock(&tree->buffer_lock); radix_tree_preload_end(); + + /* + * there is a race where release page may have + * tried to find this extent buffer in the radix + * but failed. It will tell the VM it is safe to + * reclaim the, and it will clear the page private bit. + * We must make sure to set the page private bit properly + * after the extent buffer is in the radix tree so + * it doesn't get lost + */ + set_page_extent_mapped(eb->first_page); + set_page_extent_head(eb->first_page, eb->len); + if (!page0) + unlock_page(eb->first_page); return eb; free_eb: + if (eb->first_page && !page0) + unlock_page(eb->first_page); + if (!atomic_dec_and_test(&eb->refs)) return exists; btrfs_release_extent_buffer(eb); @@ -3264,10 +3420,11 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree, continue; lock_page(page); + WARN_ON(!PagePrivate(page)); + + set_page_extent_mapped(page); if (i == 0) set_page_extent_head(page, eb->len); - else - set_page_private(page, EXTENT_PAGE_PRIVATE); clear_page_dirty_for_io(page); spin_lock_irq(&page->mapping->tree_lock); @@ -3334,7 +3491,7 @@ int set_extent_buffer_uptodate(struct extent_io_tree *tree, num_pages = num_extent_pages(eb->start, eb->len); set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, - GFP_NOFS); + NULL, GFP_NOFS); for (i = 0; i < num_pages; i++) { page = extent_buffer_page(eb, i); if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) || @@ -3457,6 +3614,13 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, for (i = start_i; i < num_pages; i++) { page = extent_buffer_page(eb, i); + + WARN_ON(!PagePrivate(page)); + + set_page_extent_mapped(page); + if (i == 0) + set_page_extent_head(page, eb->len); + if (inc_all_pages) page_cache_get(page); if (!PageUptodate(page)) { @@ -3562,6 +3726,7 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start, "wanted %lu %lu\n", (unsigned long long)eb->start, eb->len, start, min_len); WARN_ON(1); + return -EINVAL; } p = extent_buffer_page(eb, i); @@ -3754,6 +3919,12 @@ static void move_pages(struct page *dst_page, struct page *src_page, kunmap_atomic(dst_kaddr, KM_USER0); } +static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len) +{ + unsigned long distance = (src > dst) ? 
src - dst : dst - src; + return distance < len; +} + static void copy_pages(struct page *dst_page, struct page *src_page, unsigned long dst_off, unsigned long src_off, unsigned long len) @@ -3761,10 +3932,12 @@ static void copy_pages(struct page *dst_page, struct page *src_page, char *dst_kaddr = kmap_atomic(dst_page, KM_USER0); char *src_kaddr; - if (dst_page != src_page) + if (dst_page != src_page) { src_kaddr = kmap_atomic(src_page, KM_USER1); - else + } else { src_kaddr = dst_kaddr; + BUG_ON(areas_overlap(src_off, dst_off, len)); + } memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len); kunmap_atomic(dst_kaddr, KM_USER0); @@ -3839,7 +4012,7 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, "len %lu len %lu\n", dst_offset, len, dst->len); BUG_ON(1); } - if (dst_offset < src_offset) { + if (!areas_overlap(src_offset, dst_offset, len)) { memcpy_extent_buffer(dst, dst_offset, src_offset, len); return; } diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 4183c8178f01..af2d7179c372 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -20,13 +20,18 @@ #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) #define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC) -/* flags for bio submission */ +/* + * flags for bio submission. The high bits indicate the compression + * type for this bio + */ #define EXTENT_BIO_COMPRESSED 1 +#define EXTENT_BIO_FLAG_SHIFT 16 /* these are bit numbers for test/set bit */ #define EXTENT_BUFFER_UPTODATE 0 #define EXTENT_BUFFER_BLOCKING 1 #define EXTENT_BUFFER_DIRTY 2 +#define EXTENT_BUFFER_CORRUPT 3 /* these are flags for extent_clear_unlock_delalloc */ #define EXTENT_CLEAR_UNLOCK_PAGE 0x1 @@ -135,6 +140,17 @@ struct extent_buffer { wait_queue_head_t lock_wq; }; +static inline void extent_set_compress_type(unsigned long *bio_flags, + int compress_type) +{ + *bio_flags |= compress_type << EXTENT_BIO_FLAG_SHIFT; +} + +static inline int extent_compress_type(unsigned long bio_flags) +{ + return bio_flags >> EXTENT_BIO_FLAG_SHIFT; +} + struct extent_map_tree; static inline struct extent_state *extent_state_next(struct extent_state *state) @@ -176,7 +192,7 @@ void extent_io_exit(void); u64 count_range_bits(struct extent_io_tree *tree, u64 *start, u64 search_end, - u64 max_bytes, unsigned long bits); + u64 max_bytes, unsigned long bits, int contig); void free_extent_state(struct extent_state *state); int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, @@ -192,7 +208,7 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits, int exclusive_bits, u64 *failed_start, struct extent_state **cached_state, gfp_t mask); int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, - gfp_t mask); + struct extent_state **cached_state, gfp_t mask); int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 23cb8da3ff66..a24a3f2fa13e 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -3,6 +3,7 @@ #include <linux/module.h> #include <linux/spinlock.h> #include <linux/hardirq.h> +#include "ctree.h" #include "extent_map.h" @@ -50,10 +51,11 @@ struct extent_map *alloc_extent_map(gfp_t mask) { struct extent_map *em; em = kmem_cache_alloc(extent_map_cache, mask); - if (!em || IS_ERR(em)) - return em; + if (!em) + return NULL; em->in_tree = 0; em->flags = 0; + em->compress_type = BTRFS_COMPRESS_NONE; 
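With more than one compression algorithm available, the compression type now rides in the upper bits of the per-bio flags word: the low bits keep flags such as EXTENT_BIO_COMPRESSED, and everything from bit 16 up is the compress type. The packing is plain shifting and masking, as in this standalone sketch (the constants mirror the header above, but the helper names and enum values are illustrative):

#include <assert.h>
#include <stdio.h>

#define BIO_COMPRESSED 1UL	/* low bits: ordinary flags */
#define BIO_FLAG_SHIFT 16	/* high bits: compression type */

enum { COMPRESS_NONE = 0, COMPRESS_ZLIB = 1, COMPRESS_LZO = 2 };

static void set_compress_type(unsigned long *bio_flags, int type)
{
	*bio_flags |= (unsigned long)type << BIO_FLAG_SHIFT;
}

static int get_compress_type(unsigned long bio_flags)
{
	return bio_flags >> BIO_FLAG_SHIFT;
}

int main(void)
{
	unsigned long flags = BIO_COMPRESSED;

	set_compress_type(&flags, COMPRESS_LZO);
	assert(get_compress_type(flags) == COMPRESS_LZO);
	assert(flags & BIO_COMPRESSED);
	printf("flags 0x%lx, type %d\n", flags, get_compress_type(flags));
	return 0;
}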
atomic_set(&em->refs, 1); return em; } @@ -241,7 +243,7 @@ out: * Insert @em into @tree or perform a simple forward/backward merge with * existing mappings. The extent_map struct passed in will be inserted * into the tree directly, with an additional reference taken, or a - * reference dropped if the merge attempt was successfull. + * reference dropped if the merge attempt was successful. */ int add_extent_mapping(struct extent_map_tree *tree, struct extent_map *em) diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index ab6d74b6e647..28b44dbd1e35 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -26,7 +26,8 @@ struct extent_map { unsigned long flags; struct block_device *bdev; atomic_t refs; - int in_tree; + unsigned int in_tree:1; + unsigned int compress_type:4; }; struct extent_map_tree { diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index a562a250ae77..a6a9d4e8b491 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -48,7 +48,8 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) + return -ENOMEM; file_key.objectid = objectid; file_key.offset = pos; btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY); @@ -169,6 +170,8 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; if (bio->bi_size > PAGE_CACHE_SIZE * 8) path->reada = 2; @@ -536,6 +539,8 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, root = root->fs_info->csum_root; path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; while (1) { key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; @@ -548,7 +553,10 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, if (path->slots[0] == 0) goto out; path->slots[0]--; + } else if (ret < 0) { + goto out; } + leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 66836d85763b..75899a01dded 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -24,6 +24,7 @@ #include <linux/string.h> #include <linux/backing-dev.h> #include <linux/mpage.h> +#include <linux/falloc.h> #include <linux/swap.h> #include <linux/writeback.h> #include <linux/statfs.h> @@ -44,14 +45,14 @@ * and be replaced with calls into generic code. */ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages, - int write_bytes, + size_t write_bytes, struct page **prepared_pages, struct iov_iter *i) { size_t copied = 0; + size_t total_copied = 0; int pg = 0; int offset = pos & (PAGE_CACHE_SIZE - 1); - int total_copied = 0; while (write_bytes > 0) { size_t count = min_t(size_t, @@ -69,14 +70,26 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages, /* Flush processor's dcache for this page */ flush_dcache_page(page); + + /* + * if we get a partial write, we can end up with + * partially up to date pages. These add + * a lot of complexity, so make sure they don't + * happen by forcing this copy to be retried. + * + * The rest of the btrfs_file_write code will fall + * back to page at a time copies after we return 0. 
+ */ + if (!PageUptodate(page) && copied < count) + copied = 0; + iov_iter_advance(i, copied); write_bytes -= copied; total_copied += copied; /* Return to btrfs_file_aio_write to fault page */ - if (unlikely(copied == 0)) { + if (unlikely(copied == 0)) break; - } if (unlikely(copied < PAGE_CACHE_SIZE - offset)) { offset += copied; @@ -91,12 +104,10 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages, /* * unlocks pages after btrfs_file_write is done with them */ -static noinline void btrfs_drop_pages(struct page **pages, size_t num_pages) +void btrfs_drop_pages(struct page **pages, size_t num_pages) { size_t i; for (i = 0; i < num_pages; i++) { - if (!pages[i]) - break; /* page checked is some magic around finding pages that * have been modified without going through btrfs_set_page_dirty * clear it here @@ -116,17 +127,13 @@ static noinline void btrfs_drop_pages(struct page **pages, size_t num_pages) * this also makes the decision about creating an inline extent vs * doing real data extents, marking pages dirty and delalloc as required. */ -static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct file *file, - struct page **pages, - size_t num_pages, - loff_t pos, - size_t write_bytes) +int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode, + struct page **pages, size_t num_pages, + loff_t pos, size_t write_bytes, + struct extent_state **cached) { int err = 0; int i; - struct inode *inode = fdentry(file)->d_inode; u64 num_bytes; u64 start_pos; u64 end_of_last_block; @@ -139,8 +146,9 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans, end_of_last_block = start_pos + num_bytes - 1; err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block, - NULL); - BUG_ON(err); + cached); + if (err) + return err; for (i = 0; i < num_pages; i++) { struct page *p = pages[i]; @@ -148,13 +156,14 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans, ClearPageChecked(p); set_page_dirty(p); } - if (end_pos > isize) { + + /* + * we've only changed i_size in ram, and we haven't updated + * the disk i_size. There is no need to log the inode + * at this time. + */ + if (end_pos > isize) i_size_write(inode, end_pos); - /* we've only changed i_size in ram, and we haven't updated - * the disk i_size. There is no need to log the inode - * at this time. 
- */ - } return 0; } @@ -185,6 +194,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, split = alloc_extent_map(GFP_NOFS); if (!split2) split2 = alloc_extent_map(GFP_NOFS); + BUG_ON(!split || !split2); write_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, start, len); @@ -224,6 +234,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, split->bdev = em->bdev; split->flags = flags; + split->compress_type = em->compress_type; ret = add_extent_mapping(em_tree, split); BUG_ON(ret); free_extent_map(split); @@ -238,6 +249,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, split->len = em->start + em->len - (start + len); split->bdev = em->bdev; split->flags = flags; + split->compress_type = em->compress_type; if (compressed) { split->block_len = em->block_len; @@ -593,6 +605,8 @@ again: key.offset = split; ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) + goto out; if (ret > 0 && path->slots[0] > 0) path->slots[0]--; @@ -759,6 +773,27 @@ out: } /* + * on error we return an unlocked page and the error value + * on success we return a locked page and 0 + */ +static int prepare_uptodate_page(struct page *page, u64 pos) +{ + int ret = 0; + + if ((pos & (PAGE_CACHE_SIZE - 1)) && !PageUptodate(page)) { + ret = btrfs_readpage(NULL, page); + if (ret) + return ret; + lock_page(page); + if (!PageUptodate(page)) { + unlock_page(page); + return -EIO; + } + } + return 0; +} + +/* * this gets pages into the page cache and locks them down, it also properly * waits for data=ordered extents to finish before allowing the pages to be * modified. @@ -773,6 +808,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file, unsigned long index = pos >> PAGE_CACHE_SHIFT; struct inode *inode = fdentry(file)->d_inode; int err = 0; + int faili = 0; u64 start_pos; u64 last_pos; @@ -780,21 +816,33 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file, last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT; if (start_pos > inode->i_size) { - err = btrfs_cont_expand(inode, start_pos); + err = btrfs_cont_expand(inode, i_size_read(inode), start_pos); if (err) return err; } - memset(pages, 0, num_pages * sizeof(struct page *)); again: for (i = 0; i < num_pages; i++) { pages[i] = grab_cache_page(inode->i_mapping, index + i); if (!pages[i]) { + faili = i - 1; err = -ENOMEM; - BUG_ON(1); + goto fail; + } + + if (i == 0) + err = prepare_uptodate_page(pages[i], pos); + if (i == num_pages - 1) + err = prepare_uptodate_page(pages[i], + pos + write_bytes); + if (err) { + page_cache_release(pages[i]); + faili = i - 1; + goto fail; } wait_on_page_writeback(pages[i]); } + err = 0; if (start_pos < inode->i_size) { struct btrfs_ordered_extent *ordered; lock_extent_bits(&BTRFS_I(inode)->io_tree, @@ -834,176 +882,103 @@ again: WARN_ON(!PageLocked(pages[i])); } return 0; +fail: + while (faili >= 0) { + unlock_page(pages[faili]); + page_cache_release(pages[faili]); + faili--; + } + return err; + } -static ssize_t btrfs_file_aio_write(struct kiocb *iocb, - const struct iovec *iov, - unsigned long nr_segs, loff_t pos) +static noinline ssize_t __btrfs_buffered_write(struct file *file, + struct iov_iter *i, + loff_t pos) { - struct file *file = iocb->ki_filp; struct inode *inode = fdentry(file)->d_inode; struct btrfs_root *root = BTRFS_I(inode)->root; - struct page *pinned[2]; struct page **pages = NULL; - struct iov_iter i; - loff_t *ppos = &iocb->ki_pos; - loff_t start_pos; - ssize_t num_written = 0; 
- ssize_t err = 0; - size_t count; - size_t ocount; - int ret = 0; - int nrptrs; unsigned long first_index; unsigned long last_index; - int will_write; - int buffered = 0; - int copied = 0; - int dirty_pages = 0; - - will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) || - (file->f_flags & O_DIRECT)); - - pinned[0] = NULL; - pinned[1] = NULL; - - start_pos = pos; - - vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); - - mutex_lock(&inode->i_mutex); - - err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ); - if (err) - goto out; - count = ocount; - - current->backing_dev_info = inode->i_mapping->backing_dev_info; - err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); - if (err) - goto out; - - if (count == 0) - goto out; - - err = file_remove_suid(file); - if (err) - goto out; - - file_update_time(file); - BTRFS_I(inode)->sequence++; - - if (unlikely(file->f_flags & O_DIRECT)) { - num_written = generic_file_direct_write(iocb, iov, &nr_segs, - pos, ppos, count, - ocount); - /* - * the generic O_DIRECT will update in-memory i_size after the - * DIOs are done. But our endio handlers that update the on - * disk i_size never update past the in memory i_size. So we - * need one more update here to catch any additions to the - * file - */ - if (inode->i_size != BTRFS_I(inode)->disk_i_size) { - btrfs_ordered_update_i_size(inode, inode->i_size, NULL); - mark_inode_dirty(inode); - } - - if (num_written < 0) { - ret = num_written; - num_written = 0; - goto out; - } else if (num_written == count) { - /* pick up pos changes done by the generic code */ - pos = *ppos; - goto out; - } - /* - * We are going to do buffered for the rest of the range, so we - * need to make sure to invalidate the buffered pages when we're - * done. - */ - buffered = 1; - pos += num_written; - } + size_t num_written = 0; + int nrptrs; + int ret = 0; - iov_iter_init(&i, iov, nr_segs, count, num_written); - nrptrs = min((iov_iter_count(&i) + PAGE_CACHE_SIZE - 1) / + nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE, PAGE_CACHE_SIZE / (sizeof(struct page *))); pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); - - /* generic_write_checks can change our pos */ - start_pos = pos; + if (!pages) + return -ENOMEM; first_index = pos >> PAGE_CACHE_SHIFT; - last_index = (pos + iov_iter_count(&i)) >> PAGE_CACHE_SHIFT; - - /* - * there are lots of better ways to do this, but this code - * makes sure the first and last page in the file range are - * up to date and ready for cow - */ - if ((pos & (PAGE_CACHE_SIZE - 1))) { - pinned[0] = grab_cache_page(inode->i_mapping, first_index); - if (!PageUptodate(pinned[0])) { - ret = btrfs_readpage(NULL, pinned[0]); - BUG_ON(ret); - wait_on_page_locked(pinned[0]); - } else { - unlock_page(pinned[0]); - } - } - if ((pos + iov_iter_count(&i)) & (PAGE_CACHE_SIZE - 1)) { - pinned[1] = grab_cache_page(inode->i_mapping, last_index); - if (!PageUptodate(pinned[1])) { - ret = btrfs_readpage(NULL, pinned[1]); - BUG_ON(ret); - wait_on_page_locked(pinned[1]); - } else { - unlock_page(pinned[1]); - } - } + last_index = (pos + iov_iter_count(i)) >> PAGE_CACHE_SHIFT; - while (iov_iter_count(&i) > 0) { + while (iov_iter_count(i) > 0) { size_t offset = pos & (PAGE_CACHE_SIZE - 1); - size_t write_bytes = min(iov_iter_count(&i), + size_t write_bytes = min(iov_iter_count(i), nrptrs * (size_t)PAGE_CACHE_SIZE - offset); - size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >> - PAGE_CACHE_SHIFT; + size_t num_pages = (write_bytes + offset + + 
PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + size_t dirty_pages; + size_t copied; WARN_ON(num_pages > nrptrs); - memset(pages, 0, sizeof(struct page *) * nrptrs); /* * Fault pages before locking them in prepare_pages * to avoid recursive lock */ - if (unlikely(iov_iter_fault_in_readable(&i, write_bytes))) { + if (unlikely(iov_iter_fault_in_readable(i, write_bytes))) { ret = -EFAULT; - goto out; + break; } ret = btrfs_delalloc_reserve_space(inode, num_pages << PAGE_CACHE_SHIFT); if (ret) - goto out; + break; + /* + * This is going to setup the pages array with the number of + * pages we want, so we don't really need to worry about the + * contents of pages from loop to loop + */ ret = prepare_pages(root, file, pages, num_pages, pos, first_index, last_index, write_bytes); if (ret) { btrfs_delalloc_release_space(inode, num_pages << PAGE_CACHE_SHIFT); - goto out; + break; } copied = btrfs_copy_from_user(pos, num_pages, - write_bytes, pages, &i); - dirty_pages = (copied + PAGE_CACHE_SIZE - 1) >> - PAGE_CACHE_SHIFT; + write_bytes, pages, i); + /* + * if we have trouble faulting in the pages, fall + * back to one page at a time + */ + if (copied < write_bytes) + nrptrs = 1; + + if (copied == 0) + dirty_pages = 0; + else + dirty_pages = (copied + offset + + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT; + + /* + * If we had a short copy we need to release the excess delaloc + * bytes we reserved. We need to increment outstanding_extents + * because btrfs_delalloc_release_space will decrement it, but + * we still have an outstanding extent for the chunk we actually + * managed to copy. + */ if (num_pages > dirty_pages) { if (copied > 0) atomic_inc( @@ -1014,43 +989,157 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, } if (copied > 0) { - dirty_and_release_pages(NULL, root, file, pages, - dirty_pages, pos, copied); + ret = btrfs_dirty_pages(root, inode, pages, + dirty_pages, pos, copied, + NULL); + if (ret) { + btrfs_delalloc_release_space(inode, + dirty_pages << PAGE_CACHE_SHIFT); + btrfs_drop_pages(pages, num_pages); + break; + } } btrfs_drop_pages(pages, num_pages); - if (copied > 0) { - if (will_write) { - filemap_fdatawrite_range(inode->i_mapping, pos, - pos + copied - 1); - } else { - balance_dirty_pages_ratelimited_nr( - inode->i_mapping, - dirty_pages); - if (dirty_pages < - (root->leafsize >> PAGE_CACHE_SHIFT) + 1) - btrfs_btree_balance_dirty(root, 1); - btrfs_throttle(root); - } - } + cond_resched(); + + balance_dirty_pages_ratelimited_nr(inode->i_mapping, + dirty_pages); + if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1) + btrfs_btree_balance_dirty(root, 1); + btrfs_throttle(root); pos += copied; num_written += copied; + } - cond_resched(); + kfree(pages); + + return num_written ? num_written : ret; +} + +static ssize_t __btrfs_direct_write(struct kiocb *iocb, + const struct iovec *iov, + unsigned long nr_segs, loff_t pos, + loff_t *ppos, size_t count, size_t ocount) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = fdentry(file)->d_inode; + struct iov_iter i; + ssize_t written; + ssize_t written_buffered; + loff_t endbyte; + int err; + + written = generic_file_direct_write(iocb, iov, &nr_segs, pos, ppos, + count, ocount); + + /* + * the generic O_DIRECT will update in-memory i_size after the + * DIOs are done. But our endio handlers that update the on + * disk i_size never update past the in memory i_size. 
So we + * need one more update here to catch any additions to the + * file + */ + if (inode->i_size != BTRFS_I(inode)->disk_i_size) { + btrfs_ordered_update_i_size(inode, inode->i_size, NULL); + mark_inode_dirty(inode); + } + + if (written < 0 || written == count) + return written; + + pos += written; + count -= written; + iov_iter_init(&i, iov, nr_segs, count, written); + written_buffered = __btrfs_buffered_write(file, &i, pos); + if (written_buffered < 0) { + err = written_buffered; + goto out; } + endbyte = pos + written_buffered - 1; + err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte); + if (err) + goto out; + written += written_buffered; + *ppos = pos + written_buffered; + invalidate_mapping_pages(file->f_mapping, pos >> PAGE_CACHE_SHIFT, + endbyte >> PAGE_CACHE_SHIFT); out: - mutex_unlock(&inode->i_mutex); - if (ret) - err = ret; + return written ? written : err; +} - kfree(pages); - if (pinned[0]) - page_cache_release(pinned[0]); - if (pinned[1]) - page_cache_release(pinned[1]); - *ppos = pos; +static ssize_t btrfs_file_aio_write(struct kiocb *iocb, + const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + loff_t *ppos = &iocb->ki_pos; + ssize_t num_written = 0; + ssize_t err = 0; + size_t count, ocount; + + vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); + + mutex_lock(&inode->i_mutex); + + err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ); + if (err) { + mutex_unlock(&inode->i_mutex); + goto out; + } + count = ocount; + + current->backing_dev_info = inode->i_mapping->backing_dev_info; + err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); + if (err) { + mutex_unlock(&inode->i_mutex); + goto out; + } + + if (count == 0) { + mutex_unlock(&inode->i_mutex); + goto out; + } + + err = file_remove_suid(file); + if (err) { + mutex_unlock(&inode->i_mutex); + goto out; + } + + /* + * If BTRFS flips readonly due to some impossible error + * (fs_info->fs_state now has BTRFS_SUPER_FLAG_ERROR), + * although we have opened a file as writable, we have + * to stop this write operation to ensure FS consistency. + */ + if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { + mutex_unlock(&inode->i_mutex); + err = -EROFS; + goto out; + } + + file_update_time(file); + BTRFS_I(inode)->sequence++; + + if (unlikely(file->f_flags & O_DIRECT)) { + num_written = __btrfs_direct_write(iocb, iov, nr_segs, + pos, ppos, count, ocount); + } else { + struct iov_iter i; + + iov_iter_init(&i, iov, nr_segs, count, num_written); + + num_written = __btrfs_buffered_write(file, &i, pos); + if (num_written > 0) + *ppos = pos + num_written; + } + + mutex_unlock(&inode->i_mutex); /* * we want to make sure fsync finds this change @@ -1065,43 +1154,12 @@ out: * one running right now. 
*/ BTRFS_I(inode)->last_trans = root->fs_info->generation + 1; - - if (num_written > 0 && will_write) { - struct btrfs_trans_handle *trans; - - err = btrfs_wait_ordered_range(inode, start_pos, num_written); - if (err) + if (num_written > 0 || num_written == -EIOCBQUEUED) { + err = generic_write_sync(file, pos, num_written); + if (err < 0 && num_written > 0) num_written = err; - - if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) { - trans = btrfs_start_transaction(root, 0); - if (IS_ERR(trans)) { - num_written = PTR_ERR(trans); - goto done; - } - mutex_lock(&inode->i_mutex); - ret = btrfs_log_dentry_safe(trans, root, - file->f_dentry); - mutex_unlock(&inode->i_mutex); - if (ret == 0) { - ret = btrfs_sync_log(trans, root); - if (ret == 0) - btrfs_end_transaction(trans, root); - else - btrfs_commit_transaction(trans, root); - } else if (ret != BTRFS_NO_LOG_SYNC) { - btrfs_commit_transaction(trans, root); - } else { - btrfs_end_transaction(trans, root); - } - } - if (file->f_flags & O_DIRECT && buffered) { - invalidate_mapping_pages(inode->i_mapping, - start_pos >> PAGE_CACHE_SHIFT, - (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT); - } } -done: +out: current->backing_dev_info = NULL; return num_written ? num_written : err; } @@ -1144,6 +1202,7 @@ int btrfs_sync_file(struct file *file, int datasync) int ret = 0; struct btrfs_trans_handle *trans; + trace_btrfs_sync_file(file, datasync); /* we wait first, since the writeback may change the inode */ root->log_batch++; @@ -1237,6 +1296,118 @@ static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma) return 0; } +static long btrfs_fallocate(struct file *file, int mode, + loff_t offset, loff_t len) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct extent_state *cached_state = NULL; + u64 cur_offset; + u64 last_byte; + u64 alloc_start; + u64 alloc_end; + u64 alloc_hint = 0; + u64 locked_end; + u64 mask = BTRFS_I(inode)->root->sectorsize - 1; + struct extent_map *em; + int ret; + + alloc_start = offset & ~mask; + alloc_end = (offset + len + mask) & ~mask; + + /* We only support the FALLOC_FL_KEEP_SIZE mode */ + if (mode & ~FALLOC_FL_KEEP_SIZE) + return -EOPNOTSUPP; + + /* + * wait for ordered IO before we have any locks. We'll loop again + * below with the locks held. 
+ */ + btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start); + + mutex_lock(&inode->i_mutex); + ret = inode_newsize_ok(inode, alloc_end); + if (ret) + goto out; + + if (alloc_start > inode->i_size) { + ret = btrfs_cont_expand(inode, i_size_read(inode), + alloc_start); + if (ret) + goto out; + } + + ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start); + if (ret) + goto out; + + locked_end = alloc_end - 1; + while (1) { + struct btrfs_ordered_extent *ordered; + + /* the extent lock is ordered inside the running + * transaction + */ + lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start, + locked_end, 0, &cached_state, GFP_NOFS); + ordered = btrfs_lookup_first_ordered_extent(inode, + alloc_end - 1); + if (ordered && + ordered->file_offset + ordered->len > alloc_start && + ordered->file_offset < alloc_end) { + btrfs_put_ordered_extent(ordered); + unlock_extent_cached(&BTRFS_I(inode)->io_tree, + alloc_start, locked_end, + &cached_state, GFP_NOFS); + /* + * we can't wait on the range with the transaction + * running or with the extent lock held + */ + btrfs_wait_ordered_range(inode, alloc_start, + alloc_end - alloc_start); + } else { + if (ordered) + btrfs_put_ordered_extent(ordered); + break; + } + } + + cur_offset = alloc_start; + while (1) { + em = btrfs_get_extent(inode, NULL, 0, cur_offset, + alloc_end - cur_offset, 0); + BUG_ON(IS_ERR(em) || !em); + last_byte = min(extent_map_end(em), alloc_end); + last_byte = (last_byte + mask) & ~mask; + if (em->block_start == EXTENT_MAP_HOLE || + (cur_offset >= inode->i_size && + !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { + ret = btrfs_prealloc_file_range(inode, mode, cur_offset, + last_byte - cur_offset, + 1 << inode->i_blkbits, + offset + len, + &alloc_hint); + if (ret < 0) { + free_extent_map(em); + break; + } + } + free_extent_map(em); + + cur_offset = last_byte; + if (cur_offset >= alloc_end) { + ret = 0; + break; + } + } + unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, + &cached_state, GFP_NOFS); + + btrfs_free_reserved_data_space(inode, alloc_end - alloc_start); +out: + mutex_unlock(&inode->i_mutex); + return ret; +} + const struct file_operations btrfs_file_operations = { .llseek = generic_file_llseek, .read = do_sync_read, @@ -1248,6 +1419,7 @@ const struct file_operations btrfs_file_operations = { .open = generic_file_open, .release = btrfs_release_file, .fsync = btrfs_sync_file, + .fallocate = btrfs_fallocate, .unlocked_ioctl = btrfs_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = btrfs_ioctl, diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 60d684266959..11d2e9cea09e 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -24,6 +24,7 @@ #include "free-space-cache.h" #include "transaction.h" #include "disk-io.h" +#include "extent_io.h" #define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8) #define MAX_CACHE_BYTES_PER_GIG (32 * 1024) @@ -81,6 +82,8 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root, return ERR_PTR(-ENOENT); } + inode->i_mapping->flags &= ~__GFP_FS; + spin_lock(&block_group->lock); if (!root->fs_info->closing) { block_group->inode = igrab(inode); @@ -222,6 +225,7 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info, u64 num_entries; u64 num_bitmaps; u64 generation; + u64 used = btrfs_block_group_used(&block_group->item); u32 cur_crc = ~(u32)0; pgoff_t index = 0; unsigned long first_page_offset; @@ -393,7 +397,8 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info, break; need_loop = 1; - e = 
kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS); + e = kmem_cache_zalloc(btrfs_free_space_cachep, + GFP_NOFS); if (!e) { kunmap(page); unlock_page(page); @@ -405,7 +410,7 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info, e->bytes = le64_to_cpu(entry->bytes); if (!e->bytes) { kunmap(page); - kfree(e); + kmem_cache_free(btrfs_free_space_cachep, e); unlock_page(page); page_cache_release(page); goto free_cache; @@ -420,7 +425,8 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info, e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS); if (!e->bitmap) { kunmap(page); - kfree(e); + kmem_cache_free( + btrfs_free_space_cachep, e); unlock_page(page); page_cache_release(page); goto free_cache; @@ -465,6 +471,17 @@ next: index++; } + spin_lock(&block_group->tree_lock); + if (block_group->free_space != (block_group->key.offset - used - + block_group->bytes_super)) { + spin_unlock(&block_group->tree_lock); + printk(KERN_ERR "block group %llu has an wrong amount of free " + "space\n", block_group->key.objectid); + ret = 0; + goto free_cache; + } + spin_unlock(&block_group->tree_lock); + ret = 1; out: kfree(checksums); @@ -491,18 +508,23 @@ int btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode; struct rb_node *node; struct list_head *pos, *n; + struct page **pages; struct page *page; struct extent_state *cached_state = NULL; + struct btrfs_free_cluster *cluster = NULL; + struct extent_io_tree *unpin = NULL; struct list_head bitmap_list; struct btrfs_key key; + u64 start, end, len; u64 bytes = 0; u32 *crc, *checksums; - pgoff_t index = 0, last_index = 0; unsigned long first_page_offset; - int num_checksums; + int index = 0, num_pages = 0; int entries = 0; int bitmaps = 0; int ret = 0; + bool next_page = false; + bool out_of_space = false; root = root->fs_info->tree_root; @@ -530,24 +552,43 @@ int btrfs_write_out_cache(struct btrfs_root *root, return 0; } - last_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT; + num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT; filemap_write_and_wait(inode->i_mapping); btrfs_wait_ordered_range(inode, inode->i_size & ~(root->sectorsize - 1), (u64)-1); /* We need a checksum per page. */ - num_checksums = i_size_read(inode) / PAGE_CACHE_SIZE; - crc = checksums = kzalloc(sizeof(u32) * num_checksums, GFP_NOFS); + crc = checksums = kzalloc(sizeof(u32) * num_pages, GFP_NOFS); if (!crc) { iput(inode); return 0; } + pages = kzalloc(sizeof(struct page *) * num_pages, GFP_NOFS); + if (!pages) { + kfree(crc); + iput(inode); + return 0; + } + /* Since the first page has all of our checksums and our generation we * need to calculate the offset into the page that we can start writing * our entries. */ - first_page_offset = (sizeof(u32) * num_checksums) + sizeof(u64); + first_page_offset = (sizeof(u32) * num_pages) + sizeof(u64); + + /* Get the cluster for this block_group if it exists */ + if (!list_empty(&block_group->cluster_list)) + cluster = list_entry(block_group->cluster_list.next, + struct btrfs_free_cluster, + block_group_list); + + /* + * We shouldn't have switched the pinned extents yet so this is the + * right one + */ + unpin = root->fs_info->pinned_extents; /* * Lock all pages first so we can lock the extent safely. @@ -557,20 +598,18 @@ int btrfs_write_out_cache(struct btrfs_root *root, * after find_get_page at this point. Just putting this here so people * know and don't freak out. 
*/ - while (index <= last_index) { + while (index < num_pages) { page = grab_cache_page(inode->i_mapping, index); if (!page) { - pgoff_t i = 0; + int i; - while (i < index) { - page = find_get_page(inode->i_mapping, i); - unlock_page(page); - page_cache_release(page); - page_cache_release(page); - i++; + for (i = 0; i < num_pages; i++) { + unlock_page(pages[i]); + page_cache_release(pages[i]); } goto out_free; } + pages[index] = page; index++; } @@ -578,6 +617,12 @@ int btrfs_write_out_cache(struct btrfs_root *root, lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, 0, &cached_state, GFP_NOFS); + /* + * When searching for pinned extents, we need to start at our start + * offset. + */ + start = block_group->key.objectid; + /* Write out the extent entries */ do { struct btrfs_free_space_entry *entry; @@ -585,18 +630,25 @@ int btrfs_write_out_cache(struct btrfs_root *root, unsigned long offset = 0; unsigned long start_offset = 0; + next_page = false; + if (index == 0) { start_offset = first_page_offset; offset = start_offset; } - page = find_get_page(inode->i_mapping, index); + if (index >= num_pages) { + out_of_space = true; + break; + } + + page = pages[index]; addr = kmap(page); entry = addr + start_offset; memset(addr, 0, PAGE_CACHE_SIZE); - while (1) { + while (node && !next_page) { struct btrfs_free_space *e; e = rb_entry(node, struct btrfs_free_space, offset_index); @@ -612,12 +664,49 @@ int btrfs_write_out_cache(struct btrfs_root *root, entry->type = BTRFS_FREE_SPACE_EXTENT; } node = rb_next(node); - if (!node) - break; + if (!node && cluster) { + node = rb_first(&cluster->root); + cluster = NULL; + } offset += sizeof(struct btrfs_free_space_entry); if (offset + sizeof(struct btrfs_free_space_entry) >= PAGE_CACHE_SIZE) + next_page = true; + entry++; + } + + /* + * We want to add any pinned extents to our free space cache + * so we don't leak the space + */ + while (!next_page && (start < block_group->key.objectid + + block_group->key.offset)) { + ret = find_first_extent_bit(unpin, start, &start, &end, + EXTENT_DIRTY); + if (ret) { + ret = 0; break; + } + + /* This pinned extent is out of our range */ + if (start >= block_group->key.objectid + + block_group->key.offset) + break; + + len = block_group->key.objectid + + block_group->key.offset - start; + len = min(len, end + 1 - start); + + entries++; + entry->offset = cpu_to_le64(start); + entry->bytes = cpu_to_le64(len); + entry->type = BTRFS_FREE_SPACE_EXTENT; + + start = end + 1; + offset += sizeof(struct btrfs_free_space_entry); + if (offset + sizeof(struct btrfs_free_space_entry) >= + PAGE_CACHE_SIZE) + next_page = true; entry++; } *crc = ~(u32)0; @@ -630,25 +719,8 @@ int btrfs_write_out_cache(struct btrfs_root *root, bytes += PAGE_CACHE_SIZE; - ClearPageChecked(page); - set_page_extent_mapped(page); - SetPageUptodate(page); - set_page_dirty(page); - - /* - * We need to release our reference we got for grab_cache_page, - * except for the first page which will hold our checksums, we - * do that below. 
- */ - if (index != 0) { - unlock_page(page); - page_cache_release(page); - } - - page_cache_release(page); - index++; - } while (node); + } while (node || next_page); /* Write out the bitmaps */ list_for_each_safe(pos, n, &bitmap_list) { @@ -656,7 +728,11 @@ int btrfs_write_out_cache(struct btrfs_root *root, struct btrfs_free_space *entry = list_entry(pos, struct btrfs_free_space, list); - page = find_get_page(inode->i_mapping, index); + if (index >= num_pages) { + out_of_space = true; + break; + } + page = pages[index]; addr = kmap(page); memcpy(addr, entry->bitmap, PAGE_CACHE_SIZE); @@ -667,64 +743,58 @@ int btrfs_write_out_cache(struct btrfs_root *root, crc++; bytes += PAGE_CACHE_SIZE; - ClearPageChecked(page); - set_page_extent_mapped(page); - SetPageUptodate(page); - set_page_dirty(page); - unlock_page(page); - page_cache_release(page); - page_cache_release(page); list_del_init(&entry->list); index++; } + if (out_of_space) { + btrfs_drop_pages(pages, num_pages); + unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, + i_size_read(inode) - 1, &cached_state, + GFP_NOFS); + ret = 0; + goto out_free; + } + /* Zero out the rest of the pages just to make sure */ - while (index <= last_index) { + while (index < num_pages) { void *addr; - page = find_get_page(inode->i_mapping, index); - + page = pages[index]; addr = kmap(page); memset(addr, 0, PAGE_CACHE_SIZE); kunmap(page); - ClearPageChecked(page); - set_page_extent_mapped(page); - SetPageUptodate(page); - set_page_dirty(page); - unlock_page(page); - page_cache_release(page); - page_cache_release(page); bytes += PAGE_CACHE_SIZE; index++; } - btrfs_set_extent_delalloc(inode, 0, bytes - 1, &cached_state); - /* Write the checksums and trans id to the first page */ { void *addr; u64 *gen; - page = find_get_page(inode->i_mapping, 0); + page = pages[0]; addr = kmap(page); - memcpy(addr, checksums, sizeof(u32) * num_checksums); - gen = addr + (sizeof(u32) * num_checksums); + memcpy(addr, checksums, sizeof(u32) * num_pages); + gen = addr + (sizeof(u32) * num_pages); *gen = trans->transid; kunmap(page); - ClearPageChecked(page); - set_page_extent_mapped(page); - SetPageUptodate(page); - set_page_dirty(page); - unlock_page(page); - page_cache_release(page); - page_cache_release(page); } - BTRFS_I(inode)->generation = trans->transid; + ret = btrfs_dirty_pages(root, inode, pages, num_pages, 0, + bytes, &cached_state); + btrfs_drop_pages(pages, num_pages); unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, &cached_state, GFP_NOFS); + if (ret) { + ret = 0; + goto out_free; + } + + BTRFS_I(inode)->generation = trans->transid; + filemap_write_and_wait(inode->i_mapping); key.objectid = BTRFS_FREE_SPACE_OBJECTID; @@ -775,6 +845,7 @@ out_free: BTRFS_I(inode)->generation = 0; } kfree(checksums); + kfree(pages); btrfs_update_inode(trans, root, inode); iput(inode); return ret; @@ -987,11 +1058,18 @@ tree_search_offset(struct btrfs_block_group_cache *block_group, return entry; } -static void unlink_free_space(struct btrfs_block_group_cache *block_group, - struct btrfs_free_space *info) +static inline void +__unlink_free_space(struct btrfs_block_group_cache *block_group, + struct btrfs_free_space *info) { rb_erase(&info->offset_index, &block_group->free_space_offset); block_group->free_extents--; +} + +static void unlink_free_space(struct btrfs_block_group_cache *block_group, + struct btrfs_free_space *info) +{ + __unlink_free_space(block_group, info); block_group->free_space -= info->bytes; } @@ -1016,14 +1094,18 @@ static void 
recalculate_thresholds(struct btrfs_block_group_cache *block_group) u64 max_bytes; u64 bitmap_bytes; u64 extent_bytes; + u64 size = block_group->key.offset; /* * The goal is to keep the total amount of memory used per 1gb of space * at or below 32k, so we need to adjust how much memory we allow to be * used by extent based free space tracking */ - max_bytes = MAX_CACHE_BYTES_PER_GIG * - (div64_u64(block_group->key.offset, 1024 * 1024 * 1024)); + if (size < 1024 * 1024 * 1024) + max_bytes = MAX_CACHE_BYTES_PER_GIG; + else + max_bytes = MAX_CACHE_BYTES_PER_GIG * + div64_u64(size, 1024 * 1024 * 1024); /* * we want to account for 1 more bitmap than what we have so we can make @@ -1171,6 +1253,16 @@ static void add_new_bitmap(struct btrfs_block_group_cache *block_group, recalculate_thresholds(block_group); } +static void free_bitmap(struct btrfs_block_group_cache *block_group, + struct btrfs_free_space *bitmap_info) +{ + unlink_free_space(block_group, bitmap_info); + kfree(bitmap_info->bitmap); + kmem_cache_free(btrfs_free_space_cachep, bitmap_info); + block_group->total_bitmaps--; + recalculate_thresholds(block_group); +} + static noinline int remove_from_bitmap(struct btrfs_block_group_cache *block_group, struct btrfs_free_space *bitmap_info, u64 *offset, u64 *bytes) @@ -1195,6 +1287,7 @@ again: */ search_start = *offset; search_bytes = *bytes; + search_bytes = min(search_bytes, end - search_start + 1); ret = search_bitmap(block_group, bitmap_info, &search_start, &search_bytes); BUG_ON(ret < 0 || search_start != *offset); @@ -1211,13 +1304,8 @@ again: if (*bytes) { struct rb_node *next = rb_next(&bitmap_info->offset_index); - if (!bitmap_info->bytes) { - unlink_free_space(block_group, bitmap_info); - kfree(bitmap_info->bitmap); - kfree(bitmap_info); - block_group->total_bitmaps--; - recalculate_thresholds(block_group); - } + if (!bitmap_info->bytes) + free_bitmap(block_group, bitmap_info); /* * no entry after this bitmap, but we still have bytes to @@ -1250,13 +1338,8 @@ again: return -EAGAIN; goto again; - } else if (!bitmap_info->bytes) { - unlink_free_space(block_group, bitmap_info); - kfree(bitmap_info->bitmap); - kfree(bitmap_info); - block_group->total_bitmaps--; - recalculate_thresholds(block_group); - } + } else if (!bitmap_info->bytes) + free_bitmap(block_group, bitmap_info); return 0; } @@ -1273,9 +1356,22 @@ static int insert_into_bitmap(struct btrfs_block_group_cache *block_group, * If we are below the extents threshold then we can add this as an * extent, and don't have to deal with the bitmap */ - if (block_group->free_extents < block_group->extents_thresh && - info->bytes > block_group->sectorsize * 4) - return 0; + if (block_group->free_extents < block_group->extents_thresh) { + /* + * If this block group has some small extents we don't want to + * use up all of our free slots in the cache with them, we want + * to reserve them to larger extents, however if we have plent + * of cache left then go ahead an dadd them, no sense in adding + * the overhead of a bitmap if we don't have to. 
+ */ + if (info->bytes <= block_group->sectorsize * 4) { + if (block_group->free_extents * 2 <= + block_group->extents_thresh) + return 0; + } else { + return 0; + } + } /* * some block groups are so tiny they can't be enveloped by a bitmap, so @@ -1330,8 +1426,8 @@ new_bitmap: /* no pre-allocated info, allocate a new one */ if (!info) { - info = kzalloc(sizeof(struct btrfs_free_space), - GFP_NOFS); + info = kmem_cache_zalloc(btrfs_free_space_cachep, + GFP_NOFS); if (!info) { spin_lock(&block_group->tree_lock); ret = -ENOMEM; @@ -1353,28 +1449,20 @@ out: if (info) { if (info->bitmap) kfree(info->bitmap); - kfree(info); + kmem_cache_free(btrfs_free_space_cachep, info); } return ret; } -int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, - u64 offset, u64 bytes) +bool try_merge_free_space(struct btrfs_block_group_cache *block_group, + struct btrfs_free_space *info, bool update_stat) { - struct btrfs_free_space *right_info = NULL; - struct btrfs_free_space *left_info = NULL; - struct btrfs_free_space *info = NULL; - int ret = 0; - - info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS); - if (!info) - return -ENOMEM; - - info->offset = offset; - info->bytes = bytes; - - spin_lock(&block_group->tree_lock); + struct btrfs_free_space *left_info; + struct btrfs_free_space *right_info; + bool merged = false; + u64 offset = info->offset; + u64 bytes = info->bytes; /* * first we want to see if there is free space adjacent to the range we @@ -1388,40 +1476,65 @@ int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, else left_info = tree_search_offset(block_group, offset - 1, 0, 0); - /* - * If there was no extent directly to the left or right of this new - * extent then we know we're going to have to allocate a new extent, so - * before we do that see if we need to drop this into a bitmap - */ - if ((!left_info || left_info->bitmap) && - (!right_info || right_info->bitmap)) { - ret = insert_into_bitmap(block_group, info); - - if (ret < 0) { - goto out; - } else if (ret) { - ret = 0; - goto out; - } - } - if (right_info && !right_info->bitmap) { - unlink_free_space(block_group, right_info); + if (update_stat) + unlink_free_space(block_group, right_info); + else + __unlink_free_space(block_group, right_info); info->bytes += right_info->bytes; - kfree(right_info); + kmem_cache_free(btrfs_free_space_cachep, right_info); + merged = true; } if (left_info && !left_info->bitmap && left_info->offset + left_info->bytes == offset) { - unlink_free_space(block_group, left_info); + if (update_stat) + unlink_free_space(block_group, left_info); + else + __unlink_free_space(block_group, left_info); info->offset = left_info->offset; info->bytes += left_info->bytes; - kfree(left_info); + kmem_cache_free(btrfs_free_space_cachep, left_info); + merged = true; } + return merged; +} + +int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, + u64 offset, u64 bytes) +{ + struct btrfs_free_space *info; + int ret = 0; + + info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS); + if (!info) + return -ENOMEM; + + info->offset = offset; + info->bytes = bytes; + + spin_lock(&block_group->tree_lock); + + if (try_merge_free_space(block_group, info, true)) + goto link; + + /* + * There was no extent directly to the left or right of this new + * extent then we know we're going to have to allocate a new extent, so + * before we do that see if we need to drop this into a bitmap + */ + ret = insert_into_bitmap(block_group, info); + if (ret < 0) { + goto out; + } else if (ret) { + 
ret = 0; + goto out; + } +link: ret = link_free_space(block_group, info); if (ret) - kfree(info); + kmem_cache_free(btrfs_free_space_cachep, info); out: spin_unlock(&block_group->tree_lock); @@ -1491,7 +1604,7 @@ again: kfree(info->bitmap); block_group->total_bitmaps--; } - kfree(info); + kmem_cache_free(btrfs_free_space_cachep, info); goto out_lock; } @@ -1527,7 +1640,7 @@ again: /* the hole we're creating ends at the end * of the info struct, just free the info */ - kfree(info); + kmem_cache_free(btrfs_free_space_cachep, info); } spin_unlock(&block_group->tree_lock); @@ -1600,29 +1713,28 @@ __btrfs_return_cluster_to_free_space( { struct btrfs_free_space *entry; struct rb_node *node; - bool bitmap; spin_lock(&cluster->lock); if (cluster->block_group != block_group) goto out; - bitmap = cluster->points_to_bitmap; cluster->block_group = NULL; cluster->window_start = 0; list_del_init(&cluster->block_group_list); - cluster->points_to_bitmap = false; - - if (bitmap) - goto out; node = rb_first(&cluster->root); while (node) { + bool bitmap; + entry = rb_entry(node, struct btrfs_free_space, offset_index); node = rb_next(&entry->offset_index); rb_erase(&entry->offset_index, &cluster->root); - BUG_ON(entry->bitmap); + + bitmap = (entry->bitmap != NULL); + if (!bitmap) + try_merge_free_space(block_group, entry, false); tree_insert_offset(&block_group->free_space_offset, - entry->offset, &entry->offset_index, 0); + entry->offset, &entry->offset_index, bitmap); } cluster->root = RB_ROOT; @@ -1659,7 +1771,7 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group) unlink_free_space(block_group, info); if (info->bitmap) kfree(info->bitmap); - kfree(info); + kmem_cache_free(btrfs_free_space_cachep, info); if (need_resched()) { spin_unlock(&block_group->tree_lock); cond_resched(); @@ -1685,19 +1797,14 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group, ret = offset; if (entry->bitmap) { bitmap_clear_bits(block_group, entry, offset, bytes); - if (!entry->bytes) { - unlink_free_space(block_group, entry); - kfree(entry->bitmap); - kfree(entry); - block_group->total_bitmaps--; - recalculate_thresholds(block_group); - } + if (!entry->bytes) + free_bitmap(block_group, entry); } else { unlink_free_space(block_group, entry); entry->offset += bytes; entry->bytes -= bytes; if (!entry->bytes) - kfree(entry); + kmem_cache_free(btrfs_free_space_cachep, entry); else link_free_space(block_group, entry); } @@ -1750,48 +1857,24 @@ int btrfs_return_cluster_to_free_space( static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group, struct btrfs_free_cluster *cluster, + struct btrfs_free_space *entry, u64 bytes, u64 min_start) { - struct btrfs_free_space *entry; int err; u64 search_start = cluster->window_start; u64 search_bytes = bytes; u64 ret = 0; - spin_lock(&block_group->tree_lock); - spin_lock(&cluster->lock); - - if (!cluster->points_to_bitmap) - goto out; - - if (cluster->block_group != block_group) - goto out; - - /* - * search_start is the beginning of the bitmap, but at some point it may - * be a good idea to point to the actual start of the free area in the - * bitmap, so do the offset_to_bitmap trick anyway, and set bitmap_only - * to 1 to make sure we get the bitmap entry - */ - entry = tree_search_offset(block_group, - offset_to_bitmap(block_group, search_start), - 1, 0); - if (!entry || !entry->bitmap) - goto out; - search_start = min_start; search_bytes = bytes; err = search_bitmap(block_group, entry, &search_start, &search_bytes); if 
(err) - goto out; + return 0; ret = search_start; bitmap_clear_bits(block_group, entry, ret, bytes); -out: - spin_unlock(&cluster->lock); - spin_unlock(&block_group->tree_lock); return ret; } @@ -1809,10 +1892,6 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group, struct rb_node *node; u64 ret = 0; - if (cluster->points_to_bitmap) - return btrfs_alloc_from_bitmap(block_group, cluster, bytes, - min_start); - spin_lock(&cluster->lock); if (bytes > cluster->max_size) goto out; @@ -1825,9 +1904,9 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group, goto out; entry = rb_entry(node, struct btrfs_free_space, offset_index); - while(1) { - if (entry->bytes < bytes || entry->offset < min_start) { + if (entry->bytes < bytes || + (!entry->bitmap && entry->offset < min_start)) { struct rb_node *node; node = rb_next(&entry->offset_index); @@ -1837,20 +1916,53 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group, offset_index); continue; } - ret = entry->offset; - entry->offset += bytes; - entry->bytes -= bytes; + if (entry->bitmap) { + ret = btrfs_alloc_from_bitmap(block_group, + cluster, entry, bytes, + min_start); + if (ret == 0) { + struct rb_node *node; + node = rb_next(&entry->offset_index); + if (!node) + break; + entry = rb_entry(node, struct btrfs_free_space, + offset_index); + continue; + } + } else { - if (entry->bytes == 0) { - rb_erase(&entry->offset_index, &cluster->root); - kfree(entry); + ret = entry->offset; + + entry->offset += bytes; + entry->bytes -= bytes; } + + if (entry->bytes == 0) + rb_erase(&entry->offset_index, &cluster->root); break; } out: spin_unlock(&cluster->lock); + if (!ret) + return 0; + + spin_lock(&block_group->tree_lock); + + block_group->free_space -= bytes; + if (entry->bytes == 0) { + block_group->free_extents--; + if (entry->bitmap) { + kfree(entry->bitmap); + block_group->total_bitmaps--; + recalculate_thresholds(block_group); + } + kmem_cache_free(btrfs_free_space_cachep, entry); + } + + spin_unlock(&block_group->tree_lock); + return ret; } @@ -1866,12 +1978,13 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group, unsigned long found_bits; unsigned long start = 0; unsigned long total_found = 0; + int ret; bool found = false; i = offset_to_bit(entry->offset, block_group->sectorsize, max_t(u64, offset, entry->offset)); - search_bits = bytes_to_bits(min_bytes, block_group->sectorsize); - total_bits = bytes_to_bits(bytes, block_group->sectorsize); + search_bits = bytes_to_bits(bytes, block_group->sectorsize); + total_bits = bytes_to_bits(min_bytes, block_group->sectorsize); again: found_bits = 0; @@ -1888,7 +2001,7 @@ again: } if (!found_bits) - return -1; + return -ENOSPC; if (!found) { start = i; @@ -1912,189 +2025,208 @@ again: cluster->window_start = start * block_group->sectorsize + entry->offset; - cluster->points_to_bitmap = true; + rb_erase(&entry->offset_index, &block_group->free_space_offset); + ret = tree_insert_offset(&cluster->root, entry->offset, + &entry->offset_index, 1); + BUG_ON(ret); return 0; } /* - * here we try to find a cluster of blocks in a block group. The goal - * is to find at least bytes free and up to empty_size + bytes free. - * We might not find them all in one contiguous area. - * - * returns zero and sets up cluster if things worked out, otherwise - * it returns -enospc + * This searches the block group for just extents to fill the cluster with. 
*/ -int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_block_group_cache *block_group, - struct btrfs_free_cluster *cluster, - u64 offset, u64 bytes, u64 empty_size) +static int setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster, + u64 offset, u64 bytes, u64 min_bytes) { + struct btrfs_free_space *first = NULL; struct btrfs_free_space *entry = NULL; + struct btrfs_free_space *prev = NULL; + struct btrfs_free_space *last; struct rb_node *node; - struct btrfs_free_space *next; - struct btrfs_free_space *last = NULL; - u64 min_bytes; u64 window_start; u64 window_free; - u64 max_extent = 0; - bool found_bitmap = false; - int ret; + u64 max_extent; + u64 max_gap = 128 * 1024; - /* for metadata, allow allocates with more holes */ - if (btrfs_test_opt(root, SSD_SPREAD)) { - min_bytes = bytes + empty_size; - } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) { - /* - * we want to do larger allocations when we are - * flushing out the delayed refs, it helps prevent - * making more work as we go along. - */ - if (trans->transaction->delayed_refs.flushing) - min_bytes = max(bytes, (bytes + empty_size) >> 1); - else - min_bytes = max(bytes, (bytes + empty_size) >> 4); - } else - min_bytes = max(bytes, (bytes + empty_size) >> 2); - - spin_lock(&block_group->tree_lock); - spin_lock(&cluster->lock); - - /* someone already found a cluster, hooray */ - if (cluster->block_group) { - ret = 0; - goto out; - } -again: - entry = tree_search_offset(block_group, offset, found_bitmap, 1); - if (!entry) { - ret = -ENOSPC; - goto out; - } + entry = tree_search_offset(block_group, offset, 0, 1); + if (!entry) + return -ENOSPC; /* - * If found_bitmap is true, we exhausted our search for extent entries, - * and we just want to search all of the bitmaps that we can find, and - * ignore any extent entries we find. + * We don't want bitmaps, so just move along until we find a normal + * extent entry. 
*/ - while (entry->bitmap || found_bitmap || - (!entry->bitmap && entry->bytes < min_bytes)) { - struct rb_node *node = rb_next(&entry->offset_index); - - if (entry->bitmap && entry->bytes > bytes + empty_size) { - ret = btrfs_bitmap_cluster(block_group, entry, cluster, - offset, bytes + empty_size, - min_bytes); - if (!ret) - goto got_it; - } - - if (!node) { - ret = -ENOSPC; - goto out; - } + while (entry->bitmap) { + node = rb_next(&entry->offset_index); + if (!node) + return -ENOSPC; entry = rb_entry(node, struct btrfs_free_space, offset_index); } - /* - * We already searched all the extent entries from the passed in offset - * to the end and didn't find enough space for the cluster, and we also - * didn't find any bitmaps that met our criteria, just go ahead and exit - */ - if (found_bitmap) { - ret = -ENOSPC; - goto out; - } - - cluster->points_to_bitmap = false; window_start = entry->offset; window_free = entry->bytes; - last = entry; max_extent = entry->bytes; + first = entry; + last = entry; + prev = entry; - while (1) { - /* out window is just right, lets fill it */ - if (window_free >= bytes + empty_size) - break; - - node = rb_next(&last->offset_index); - if (!node) { - if (found_bitmap) - goto again; - ret = -ENOSPC; - goto out; - } - next = rb_entry(node, struct btrfs_free_space, offset_index); + while (window_free <= min_bytes) { + node = rb_next(&entry->offset_index); + if (!node) + return -ENOSPC; + entry = rb_entry(node, struct btrfs_free_space, offset_index); - /* - * we found a bitmap, so if this search doesn't result in a - * cluster, we know to go and search again for the bitmaps and - * start looking for space there - */ - if (next->bitmap) { - if (!found_bitmap) - offset = next->offset; - found_bitmap = true; - last = next; + if (entry->bitmap) continue; - } - /* * we haven't filled the empty size and the window is * very large. reset and try again */ - if (next->offset - (last->offset + last->bytes) > 128 * 1024 || - next->offset - window_start > (bytes + empty_size) * 2) { - entry = next; + if (entry->offset - (prev->offset + prev->bytes) > max_gap || + entry->offset - window_start > (min_bytes * 2)) { + first = entry; window_start = entry->offset; window_free = entry->bytes; last = entry; max_extent = entry->bytes; } else { - last = next; - window_free += next->bytes; + last = entry; + window_free += entry->bytes; if (entry->bytes > max_extent) max_extent = entry->bytes; } + prev = entry; } - cluster->window_start = entry->offset; + cluster->window_start = first->offset; + + node = &first->offset_index; /* * now we've found our entries, pull them out of the free space * cache and put them into the cluster rbtree - * - * The cluster includes an rbtree, but only uses the offset index - * of each free space cache entry. */ - while (1) { + do { + int ret; + + entry = rb_entry(node, struct btrfs_free_space, offset_index); node = rb_next(&entry->offset_index); - if (entry->bitmap && node) { - entry = rb_entry(node, struct btrfs_free_space, - offset_index); + if (entry->bitmap) continue; - } else if (entry->bitmap && !node) { - break; - } rb_erase(&entry->offset_index, &block_group->free_space_offset); ret = tree_insert_offset(&cluster->root, entry->offset, &entry->offset_index, 0); BUG_ON(ret); + } while (node && entry != last); - if (!node || entry == last) - break; + cluster->max_size = max_extent; + + return 0; +} + +/* + * This specifically looks for bitmaps that may work in the cluster, we assume + * that we have already failed to find extents that will work. 
+ */ +static int setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster, + u64 offset, u64 bytes, u64 min_bytes) +{ + struct btrfs_free_space *entry; + struct rb_node *node; + int ret = -ENOSPC; + + if (block_group->total_bitmaps == 0) + return -ENOSPC; + + entry = tree_search_offset(block_group, + offset_to_bitmap(block_group, offset), + 0, 1); + if (!entry) + return -ENOSPC; + node = &entry->offset_index; + do { entry = rb_entry(node, struct btrfs_free_space, offset_index); + node = rb_next(&entry->offset_index); + if (!entry->bitmap) + continue; + if (entry->bytes < min_bytes) + continue; + ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset, + bytes, min_bytes); + } while (ret && node); + + return ret; +} + +/* + * here we try to find a cluster of blocks in a block group. The goal + * is to find at least bytes free and up to empty_size + bytes free. + * We might not find them all in one contiguous area. + * + * returns zero and sets up cluster if things worked out, otherwise + * it returns -enospc + */ +int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster, + u64 offset, u64 bytes, u64 empty_size) +{ + u64 min_bytes; + int ret; + + /* for metadata, allow allocates with more holes */ + if (btrfs_test_opt(root, SSD_SPREAD)) { + min_bytes = bytes + empty_size; + } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) { + /* + * we want to do larger allocations when we are + * flushing out the delayed refs, it helps prevent + * making more work as we go along. + */ + if (trans->transaction->delayed_refs.flushing) + min_bytes = max(bytes, (bytes + empty_size) >> 1); + else + min_bytes = max(bytes, (bytes + empty_size) >> 4); + } else + min_bytes = max(bytes, (bytes + empty_size) >> 2); + + spin_lock(&block_group->tree_lock); + + /* + * If we know we don't have enough space to make a cluster don't even + * bother doing all the work to try and find one. 
+ */ + if (block_group->free_space < min_bytes) { + spin_unlock(&block_group->tree_lock); + return -ENOSPC; } - cluster->max_size = max_extent; -got_it: - ret = 0; - atomic_inc(&block_group->count); - list_add_tail(&cluster->block_group_list, &block_group->cluster_list); - cluster->block_group = block_group; + spin_lock(&cluster->lock); + + /* someone already found a cluster, hooray */ + if (cluster->block_group) { + ret = 0; + goto out; + } + + ret = setup_cluster_no_bitmap(block_group, cluster, offset, bytes, + min_bytes); + if (ret) + ret = setup_cluster_bitmap(block_group, cluster, offset, + bytes, min_bytes); + + if (!ret) { + atomic_inc(&block_group->count); + list_add_tail(&cluster->block_group_list, + &block_group->cluster_list); + cluster->block_group = block_group; + } out: spin_unlock(&cluster->lock); spin_unlock(&block_group->tree_lock); @@ -2111,8 +2243,99 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster) spin_lock_init(&cluster->refill_lock); cluster->root = RB_ROOT; cluster->max_size = 0; - cluster->points_to_bitmap = false; INIT_LIST_HEAD(&cluster->block_group_list); cluster->block_group = NULL; } +int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, + u64 *trimmed, u64 start, u64 end, u64 minlen) +{ + struct btrfs_free_space *entry = NULL; + struct btrfs_fs_info *fs_info = block_group->fs_info; + u64 bytes = 0; + u64 actually_trimmed; + int ret = 0; + + *trimmed = 0; + + while (start < end) { + spin_lock(&block_group->tree_lock); + + if (block_group->free_space < minlen) { + spin_unlock(&block_group->tree_lock); + break; + } + + entry = tree_search_offset(block_group, start, 0, 1); + if (!entry) + entry = tree_search_offset(block_group, + offset_to_bitmap(block_group, + start), + 1, 1); + + if (!entry || entry->offset >= end) { + spin_unlock(&block_group->tree_lock); + break; + } + + if (entry->bitmap) { + ret = search_bitmap(block_group, entry, &start, &bytes); + if (!ret) { + if (start >= end) { + spin_unlock(&block_group->tree_lock); + break; + } + bytes = min(bytes, end - start); + bitmap_clear_bits(block_group, entry, + start, bytes); + if (entry->bytes == 0) + free_bitmap(block_group, entry); + } else { + start = entry->offset + BITS_PER_BITMAP * + block_group->sectorsize; + spin_unlock(&block_group->tree_lock); + ret = 0; + continue; + } + } else { + start = entry->offset; + bytes = min(entry->bytes, end - start); + unlink_free_space(block_group, entry); + kfree(entry); + } + + spin_unlock(&block_group->tree_lock); + + if (bytes >= minlen) { + int update_ret; + update_ret = btrfs_update_reserved_bytes(block_group, + bytes, 1, 1); + + ret = btrfs_error_discard_extent(fs_info->extent_root, + start, + bytes, + &actually_trimmed); + + btrfs_add_free_space(block_group, + start, bytes); + if (!update_ret) + btrfs_update_reserved_bytes(block_group, + bytes, 0, 1); + + if (ret) + break; + *trimmed += actually_trimmed; + } + start += bytes; + bytes = 0; + + if (fatal_signal_pending(current)) { + ret = -ERESTARTSYS; + break; + } + + cond_resched(); + } + + return ret; +} diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index e49ca5c321b5..65c3b935289f 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -68,4 +68,6 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group, int btrfs_return_cluster_to_free_space( struct btrfs_block_group_cache *block_group, struct btrfs_free_cluster *cluster); +int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, + u64 
*trimmed, u64 start, u64 end, u64 minlen); #endif diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index c56eb5909172..c05a08f4c411 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -30,7 +30,8 @@ int btrfs_find_highest_inode(struct btrfs_root *root, u64 *objectid) int slot; path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) + return -ENOMEM; search_key.objectid = BTRFS_LAST_FREE_OBJECTID; search_key.type = -1; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a3798a3aa0d2..fcd66b6a8086 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -50,6 +50,7 @@ #include "tree-log.h" #include "compression.h" #include "locking.h" +#include "free-space-cache.h" struct btrfs_iget_args { u64 ino; @@ -70,6 +71,7 @@ static struct kmem_cache *btrfs_inode_cachep; struct kmem_cache *btrfs_trans_handle_cachep; struct kmem_cache *btrfs_transaction_cachep; struct kmem_cache *btrfs_path_cachep; +struct kmem_cache *btrfs_free_space_cachep; #define S_SHIFT 12 static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = { @@ -82,7 +84,8 @@ static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = { [S_IFLNK >> S_SHIFT] = BTRFS_FT_SYMLINK, }; -static void btrfs_truncate(struct inode *inode); +static int btrfs_setsize(struct inode *inode, loff_t newsize); +static int btrfs_truncate(struct inode *inode); static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end); static noinline int cow_file_range(struct inode *inode, struct page *locked_page, @@ -90,13 +93,14 @@ static noinline int cow_file_range(struct inode *inode, unsigned long *nr_written, int unlock); static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, - struct inode *inode, struct inode *dir) + struct inode *inode, struct inode *dir, + const struct qstr *qstr) { int err; err = btrfs_init_acl(trans, inode, dir); if (!err) - err = btrfs_xattr_security_init(trans, inode, dir); + err = btrfs_xattr_security_init(trans, inode, dir, qstr); return err; } @@ -108,6 +112,7 @@ static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, u64 start, size_t size, size_t compressed_size, + int compress_type, struct page **compressed_pages) { struct btrfs_key key; @@ -122,12 +127,9 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, size_t cur_size = size; size_t datasize; unsigned long offset; - int use_compress = 0; - if (compressed_size && compressed_pages) { - use_compress = 1; + if (compressed_size && compressed_pages) cur_size = compressed_size; - } path = btrfs_alloc_path(); if (!path) @@ -159,7 +161,7 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, btrfs_set_file_extent_ram_bytes(leaf, ei, size); ptr = btrfs_file_extent_inline_start(ei); - if (use_compress) { + if (compress_type != BTRFS_COMPRESS_NONE) { struct page *cpage; int i = 0; while (compressed_size > 0) { @@ -176,7 +178,7 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, compressed_size -= cur_size; } btrfs_set_file_extent_compression(leaf, ei, - BTRFS_COMPRESS_ZLIB); + compress_type); } else { page = find_get_page(inode->i_mapping, start >> PAGE_CACHE_SHIFT); @@ -217,7 +219,7 @@ fail: static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, u64 start, u64 end, - size_t compressed_size, + size_t compressed_size, int compress_type, struct page 
**compressed_pages) { u64 isize = i_size_read(inode); @@ -250,7 +252,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans, inline_len = min_t(u64, isize, actual_end); ret = insert_inline_extent(trans, root, inode, start, inline_len, compressed_size, - compressed_pages); + compress_type, compressed_pages); BUG_ON(ret); btrfs_delalloc_release_metadata(inode, end + 1 - start); btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0); @@ -263,6 +265,7 @@ struct async_extent { u64 compressed_size; struct page **pages; unsigned long nr_pages; + int compress_type; struct list_head list; }; @@ -280,16 +283,19 @@ static noinline int add_async_extent(struct async_cow *cow, u64 start, u64 ram_size, u64 compressed_size, struct page **pages, - unsigned long nr_pages) + unsigned long nr_pages, + int compress_type) { struct async_extent *async_extent; async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS); + BUG_ON(!async_extent); async_extent->start = start; async_extent->ram_size = ram_size; async_extent->compressed_size = compressed_size; async_extent->pages = pages; async_extent->nr_pages = nr_pages; + async_extent->compress_type = compress_type; list_add_tail(&async_extent->list, &cow->extents); return 0; } @@ -332,6 +338,7 @@ static noinline int compress_file_range(struct inode *inode, unsigned long max_uncompressed = 128 * 1024; int i; int will_compress; + int compress_type = root->fs_info->compress_type; actual_end = min_t(u64, isize, end + 1); again: @@ -377,16 +384,22 @@ again: */ if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) && (btrfs_test_opt(root, COMPRESS) || - (BTRFS_I(inode)->force_compress))) { + (BTRFS_I(inode)->force_compress) || + (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))) { WARN_ON(pages); pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); + BUG_ON(!pages); + + if (BTRFS_I(inode)->force_compress) + compress_type = BTRFS_I(inode)->force_compress; - ret = btrfs_zlib_compress_pages(inode->i_mapping, start, - total_compressed, pages, - nr_pages, &nr_pages_ret, - &total_in, - &total_compressed, - max_compressed); + ret = btrfs_compress_pages(compress_type, + inode->i_mapping, start, + total_compressed, pages, + nr_pages, &nr_pages_ret, + &total_in, + &total_compressed, + max_compressed); if (!ret) { unsigned long offset = total_compressed & @@ -408,7 +421,7 @@ again: } if (start == 0) { trans = btrfs_join_transaction(root, 1); - BUG_ON(!trans); + BUG_ON(IS_ERR(trans)); btrfs_set_trans_block_group(trans, inode); trans->block_rsv = &root->fs_info->delalloc_block_rsv; @@ -418,12 +431,13 @@ again: * to make an uncompressed inline extent. */ ret = cow_file_range_inline(trans, root, inode, - start, end, 0, NULL); + start, end, 0, 0, NULL); } else { /* try making a compressed inline extent */ ret = cow_file_range_inline(trans, root, inode, start, end, - total_compressed, pages); + total_compressed, + compress_type, pages); } if (ret == 0) { /* @@ -493,7 +507,8 @@ again: * and will submit them to the elevator. 
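The compress_file_range() changes above replace the zlib-only path with a per-extent compress_type. A minimal sketch of how that type is effectively chosen (illustrative only, not part of the patch; the enum and function are local stand-ins for BTRFS_COMPRESS_* and for the inode/mount flags named in the hunk):

/* mirrors BTRFS_COMPRESS_NONE / _ZLIB / _LZO, but is a local stand-in */
enum compress_type { COMPRESS_NONE = 0, COMPRESS_ZLIB = 1, COMPRESS_LZO = 2 };

static enum compress_type pick_compress_type(int inode_nocompress,
					     int inode_compress,
					     int mount_compress,
					     enum compress_type fs_default,
					     enum compress_type force_compress)
{
	if (inode_nocompress)
		return COMPRESS_NONE;	/* per-inode NOCOMPRESS wins (set by FS_NOCOMP_FL,
					 * or automatically when compression didn't help) */
	if (!mount_compress && !inode_compress && force_compress == COMPRESS_NONE)
		return COMPRESS_NONE;	/* nobody asked for compression */
	if (force_compress != COMPRESS_NONE)
		return force_compress;	/* per-inode override, e.g. from defrag */
	return fs_default;		/* mount-wide default (zlib or lzo) */
}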
*/ add_async_extent(async_cow, start, num_bytes, - total_compressed, pages, nr_pages_ret); + total_compressed, pages, nr_pages_ret, + compress_type); if (start + num_bytes < end) { start += num_bytes; @@ -515,7 +530,8 @@ cleanup_and_bail_uncompressed: __set_page_dirty_nobuffers(locked_page); /* unlocked later on in the async handlers */ } - add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0); + add_async_extent(async_cow, start, end - start + 1, + 0, NULL, 0, BTRFS_COMPRESS_NONE); *num_added += 1; } @@ -602,6 +618,7 @@ retry: GFP_NOFS); trans = btrfs_join_transaction(root, 1); + BUG_ON(IS_ERR(trans)); ret = btrfs_reserve_extent(trans, root, async_extent->compressed_size, async_extent->compressed_size, @@ -633,6 +650,7 @@ retry: async_extent->ram_size - 1, 0); em = alloc_extent_map(GFP_NOFS); + BUG_ON(!em); em->start = async_extent->start; em->len = async_extent->ram_size; em->orig_start = em->start; @@ -640,6 +658,7 @@ retry: em->block_start = ins.objectid; em->block_len = ins.offset; em->bdev = root->fs_info->fs_devices->latest_bdev; + em->compress_type = async_extent->compress_type; set_bit(EXTENT_FLAG_PINNED, &em->flags); set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); @@ -656,11 +675,13 @@ retry: async_extent->ram_size - 1, 0); } - ret = btrfs_add_ordered_extent(inode, async_extent->start, - ins.objectid, - async_extent->ram_size, - ins.offset, - BTRFS_ORDERED_COMPRESSED); + ret = btrfs_add_ordered_extent_compress(inode, + async_extent->start, + ins.objectid, + async_extent->ram_size, + ins.offset, + BTRFS_ORDERED_COMPRESSED, + async_extent->compress_type); BUG_ON(ret); /* @@ -758,7 +779,7 @@ static noinline int cow_file_range(struct inode *inode, BUG_ON(root == root->fs_info->tree_root); trans = btrfs_join_transaction(root, 1); - BUG_ON(!trans); + BUG_ON(IS_ERR(trans)); btrfs_set_trans_block_group(trans, inode); trans->block_rsv = &root->fs_info->delalloc_block_rsv; @@ -770,7 +791,7 @@ static noinline int cow_file_range(struct inode *inode, if (start == 0) { /* lets try to make an inline extent */ ret = cow_file_range_inline(trans, root, inode, - start, end, 0, NULL); + start, end, 0, 0, NULL); if (ret == 0) { extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, @@ -806,6 +827,7 @@ static noinline int cow_file_range(struct inode *inode, BUG_ON(ret); em = alloc_extent_map(GFP_NOFS); + BUG_ON(!em); em->start = start; em->orig_start = em->start; ram_size = ins.offset; @@ -1036,7 +1058,7 @@ static noinline int run_delalloc_nocow(struct inode *inode, } else { trans = btrfs_join_transaction(root, 1); } - BUG_ON(!trans); + BUG_ON(IS_ERR(trans)); cow_start = (u64)-1; cur_offset = start; @@ -1155,6 +1177,7 @@ out_check: struct extent_map_tree *em_tree; em_tree = &BTRFS_I(inode)->extent_tree; em = alloc_extent_map(GFP_NOFS); + BUG_ON(!em); em->start = cur_offset; em->orig_start = em->start; em->len = num_bytes; @@ -1236,7 +1259,8 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page, ret = run_delalloc_nocow(inode, locked_page, start, end, page_started, 0, nr_written); else if (!btrfs_test_opt(root, COMPRESS) && - !(BTRFS_I(inode)->force_compress)) + !(BTRFS_I(inode)->force_compress) && + !(BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS)) ret = cow_file_range(inode, locked_page, start, end, page_started, nr_written, 1); else @@ -1443,8 +1467,11 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, if (bio_flags & EXTENT_BIO_COMPRESSED) { return btrfs_submit_compressed_read(inode, bio, mirror_num, bio_flags); - } else if 
(!skip_sum) - btrfs_lookup_bio_sums(root, inode, bio, NULL); + } else if (!skip_sum) { + ret = btrfs_lookup_bio_sums(root, inode, bio, NULL); + if (ret) + return ret; + } goto mapit; } else if (!skip_sum) { /* csum items have already been cloned */ @@ -1544,6 +1571,7 @@ out: out_page: unlock_page(page); page_cache_release(page); + kfree(fixup); } /* @@ -1670,7 +1698,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) struct btrfs_ordered_extent *ordered_extent = NULL; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct extent_state *cached_state = NULL; - int compressed = 0; + int compress_type = 0; int ret; bool nolock = false; @@ -1690,7 +1718,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) trans = btrfs_join_transaction_nolock(root, 1); else trans = btrfs_join_transaction(root, 1); - BUG_ON(!trans); + BUG_ON(IS_ERR(trans)); btrfs_set_trans_block_group(trans, inode); trans->block_rsv = &root->fs_info->delalloc_block_rsv; ret = btrfs_update_inode(trans, root, inode); @@ -1707,13 +1735,14 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) trans = btrfs_join_transaction_nolock(root, 1); else trans = btrfs_join_transaction(root, 1); + BUG_ON(IS_ERR(trans)); btrfs_set_trans_block_group(trans, inode); trans->block_rsv = &root->fs_info->delalloc_block_rsv; if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) - compressed = 1; + compress_type = ordered_extent->compress_type; if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { - BUG_ON(compressed); + BUG_ON(compress_type); ret = btrfs_mark_extent_written(trans, inode, ordered_extent->file_offset, ordered_extent->file_offset + @@ -1727,7 +1756,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) ordered_extent->disk_len, ordered_extent->len, ordered_extent->len, - compressed, 0, 0, + compress_type, 0, 0, BTRFS_FILE_EXTENT_REG); unpin_extent_cache(&BTRFS_I(inode)->extent_tree, ordered_extent->file_offset, @@ -1741,9 +1770,12 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) add_pending_csums(trans, inode, ordered_extent->file_offset, &ordered_extent->list); - btrfs_ordered_update_i_size(inode, 0, ordered_extent); - ret = btrfs_update_inode(trans, root, inode); - BUG_ON(ret); + ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); + if (!ret) { + ret = btrfs_update_inode(trans, root, inode); + BUG_ON(ret); + } + ret = 0; out: if (nolock) { if (trans) @@ -1765,6 +1797,8 @@ out: static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, struct extent_state *state, int uptodate) { + trace_btrfs_writepage_end_io_hook(page, start, end, uptodate); + ClearPagePrivate2(page); return btrfs_finish_ordered_io(page->mapping->host, start, end); } @@ -1829,6 +1863,8 @@ static int btrfs_io_failed_hook(struct bio *failed_bio, if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { logical = em->block_start; failrec->bio_flags = EXTENT_BIO_COMPRESSED; + extent_set_compress_type(&failrec->bio_flags, + em->compress_type); } failrec->logical = logical; free_extent_map(em); @@ -1873,10 +1909,10 @@ static int btrfs_io_failed_hook(struct bio *failed_bio, else rw = READ; - BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio, + ret = BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio, failrec->last_mirror, failrec->bio_flags, 0); - return 0; + return ret; } /* @@ -1892,7 +1928,7 @@ static int btrfs_clean_io_failures(struct inode *inode, u64 start) private 
= 0; if (count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private, - (u64)-1, 1, EXTENT_DIRTY)) { + (u64)-1, 1, EXTENT_DIRTY, 0)) { ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, start, &private_failure); if (ret == 0) { @@ -2188,8 +2224,6 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) insert = 1; #endif insert = 1; - } else { - WARN_ON(!BTRFS_I(inode)->orphan_meta_reserved); } if (!BTRFS_I(inode)->orphan_meta_reserved) { @@ -2260,7 +2294,7 @@ int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode) * this cleans up any orphans that may be left on the list from the last use * of this root. */ -void btrfs_orphan_cleanup(struct btrfs_root *root) +int btrfs_orphan_cleanup(struct btrfs_root *root) { struct btrfs_path *path; struct extent_buffer *leaf; @@ -2270,10 +2304,13 @@ void btrfs_orphan_cleanup(struct btrfs_root *root) int ret = 0, nr_unlink = 0, nr_truncate = 0; if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED)) - return; + return 0; path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) { + ret = -ENOMEM; + goto out; + } path->reada = -1; key.objectid = BTRFS_ORPHAN_OBJECTID; @@ -2282,18 +2319,16 @@ void btrfs_orphan_cleanup(struct btrfs_root *root) while (1) { ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) { - printk(KERN_ERR "Error searching slot for orphan: %d" - "\n", ret); - break; - } + if (ret < 0) + goto out; /* * if ret == 0 means we found what we were searching for, which - * is weird, but possible, so only screw with path if we didnt + * is weird, but possible, so only screw with path if we didn't * find the key and see if we have stuff that matches */ if (ret > 0) { + ret = 0; if (path->slots[0] == 0) break; path->slots[0]--; @@ -2321,7 +2356,10 @@ void btrfs_orphan_cleanup(struct btrfs_root *root) found_key.type = BTRFS_INODE_ITEM_KEY; found_key.offset = 0; inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL); - BUG_ON(IS_ERR(inode)); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + goto out; + } /* * add this inode to the orphan list so btrfs_orphan_del does @@ -2339,6 +2377,10 @@ void btrfs_orphan_cleanup(struct btrfs_root *root) */ if (is_bad_inode(inode)) { trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } btrfs_orphan_del(trans, inode); btrfs_end_transaction(trans, root); iput(inode); @@ -2347,17 +2389,22 @@ void btrfs_orphan_cleanup(struct btrfs_root *root) /* if we have links, this was a truncate, lets do that */ if (inode->i_nlink) { + if (!S_ISREG(inode->i_mode)) { + WARN_ON(1); + iput(inode); + continue; + } nr_truncate++; - btrfs_truncate(inode); + ret = btrfs_truncate(inode); } else { nr_unlink++; } /* this will do delete_inode and everything for us */ iput(inode); + if (ret) + goto out; } - btrfs_free_path(path); - root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE; if (root->orphan_block_rsv) @@ -2366,13 +2413,20 @@ void btrfs_orphan_cleanup(struct btrfs_root *root) if (root->orphan_block_rsv || root->orphan_item_inserted) { trans = btrfs_join_transaction(root, 1); - btrfs_end_transaction(trans, root); + if (!IS_ERR(trans)) + btrfs_end_transaction(trans, root); } if (nr_unlink) printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink); if (nr_truncate) printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate); + +out: + if (ret) + printk(KERN_CRIT "btrfs: could not do orphan cleanup %d\n", ret); + btrfs_free_path(path); + return ret; } /* @@ -2539,6 +2593,13 @@ static void 
fill_inode_item(struct btrfs_trans_handle *trans, struct btrfs_inode_item *item, struct inode *inode) { + if (!leaf->map_token) + map_private_extent_buffer(leaf, (unsigned long)item, + sizeof(struct btrfs_inode_item), + &leaf->map_token, &leaf->kaddr, + &leaf->map_start, &leaf->map_len, + KM_USER1); + btrfs_set_inode_uid(leaf, item, inode->i_uid); btrfs_set_inode_gid(leaf, item, inode->i_gid); btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size); @@ -2567,6 +2628,11 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, btrfs_set_inode_rdev(leaf, item, inode->i_rdev); btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags); btrfs_set_inode_block_group(leaf, item, BTRFS_I(inode)->block_group); + + if (leaf->map_token) { + unmap_extent_buffer(leaf, leaf->map_token, KM_USER1); + leaf->map_token = NULL; + } } /* @@ -2611,10 +2677,10 @@ failed: * recovery code. It remove a link in a directory with a given name, and * also drops the back refs in the inode to the directory */ -int btrfs_unlink_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct inode *dir, struct inode *inode, - const char *name, int name_len) +static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *dir, struct inode *inode, + const char *name, int name_len) { struct btrfs_path *path; int ret = 0; @@ -2626,7 +2692,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; - goto err; + goto out; } path->leave_spinning = 1; @@ -2686,12 +2752,25 @@ err: btrfs_i_size_write(dir, dir->i_size - name_len * 2); inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME; btrfs_update_inode(trans, root, dir); - btrfs_drop_nlink(inode); - ret = btrfs_update_inode(trans, root, inode); out: return ret; } +int btrfs_unlink_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *dir, struct inode *inode, + const char *name, int name_len) +{ + int ret; + ret = __btrfs_unlink_inode(trans, root, dir, inode, name, name_len); + if (!ret) { + btrfs_drop_nlink(inode); + ret = btrfs_update_inode(trans, root, inode); + } + return ret; +} + + /* helper to check if there is any shared block in the path */ static int check_path_shared(struct btrfs_root *root, struct btrfs_path *path) @@ -2699,9 +2778,10 @@ static int check_path_shared(struct btrfs_root *root, struct extent_buffer *eb; int level; u64 refs = 1; - int uninitialized_var(ret); for (level = 0; level < BTRFS_MAX_LEVEL; level++) { + int ret; + if (!path->nodes[level]) break; eb = path->nodes[level]; @@ -2712,7 +2792,7 @@ static int check_path_shared(struct btrfs_root *root, if (refs > 1) return 1; } - return ret; /* XXX callers? */ + return 0; } /* @@ -3512,7 +3592,13 @@ out: return ret; } -int btrfs_cont_expand(struct inode *inode, loff_t size) +/* + * This function puts in dummy file extents for the area we're creating a hole + * for. 
So if we are truncating this file to a larger size we need to insert + * these file extents so that btrfs_get_extent will return a EXTENT_MAP_HOLE for + * the range between oldsize and size + */ +int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) { struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(inode)->root; @@ -3520,7 +3606,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t size) struct extent_map *em = NULL; struct extent_state *cached_state = NULL; u64 mask = root->sectorsize - 1; - u64 hole_start = (inode->i_size + mask) & ~mask; + u64 hole_start = (oldsize + mask) & ~mask; u64 block_end = (size + mask) & ~mask; u64 last_byte; u64 cur_offset; @@ -3565,13 +3651,15 @@ int btrfs_cont_expand(struct inode *inode, loff_t size) err = btrfs_drop_extents(trans, inode, cur_offset, cur_offset + hole_size, &hint_byte, 1); - BUG_ON(err); + if (err) + break; err = btrfs_insert_file_extent(trans, root, inode->i_ino, cur_offset, 0, 0, hole_size, 0, hole_size, 0, 0, 0); - BUG_ON(err); + if (err) + break; btrfs_drop_extent_cache(inode, hole_start, last_byte - 1, 0); @@ -3591,94 +3679,58 @@ int btrfs_cont_expand(struct inode *inode, loff_t size) return err; } -static int btrfs_setattr_size(struct inode *inode, struct iattr *attr) +static int btrfs_setsize(struct inode *inode, loff_t newsize) { - struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_trans_handle *trans; - unsigned long nr; + loff_t oldsize = i_size_read(inode); int ret; - if (attr->ia_size == inode->i_size) + if (newsize == oldsize) return 0; - if (attr->ia_size > inode->i_size) { - unsigned long limit; - limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; - if (attr->ia_size > inode->i_sb->s_maxbytes) - return -EFBIG; - if (limit != RLIM_INFINITY && attr->ia_size > limit) { - send_sig(SIGXFSZ, current, 0); - return -EFBIG; - } - } - - trans = btrfs_start_transaction(root, 5); - if (IS_ERR(trans)) - return PTR_ERR(trans); - - btrfs_set_trans_block_group(trans, inode); - - ret = btrfs_orphan_add(trans, inode); - BUG_ON(ret); - - nr = trans->blocks_used; - btrfs_end_transaction(trans, root); - btrfs_btree_balance_dirty(root, nr); - - if (attr->ia_size > inode->i_size) { - ret = btrfs_cont_expand(inode, attr->ia_size); + if (newsize > oldsize) { + i_size_write(inode, newsize); + btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL); + truncate_pagecache(inode, oldsize, newsize); + ret = btrfs_cont_expand(inode, oldsize, newsize); if (ret) { - btrfs_truncate(inode); + btrfs_setsize(inode, oldsize); return ret; } - i_size_write(inode, attr->ia_size); - btrfs_ordered_update_i_size(inode, inode->i_size, NULL); + mark_inode_dirty(inode); + } else { - trans = btrfs_start_transaction(root, 0); - BUG_ON(IS_ERR(trans)); - btrfs_set_trans_block_group(trans, inode); - trans->block_rsv = root->orphan_block_rsv; - BUG_ON(!trans->block_rsv); + /* + * We're truncating a file that used to have good data down to + * zero. Make sure it gets into the ordered flush list so that + * any new writes get down to disk quickly. 
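For the grow path above, btrfs_cont_expand() only has to cover the sector-aligned gap between the old and new size. A small standalone sketch of that rounding (illustrative only, not part of the patch; the 4K sector size and byte counts are made-up examples):

#include <stdint.h>
#include <stdio.h>

/* range of dummy hole extents needed when a file grows from oldsize to newsize,
 * rounded to the sector size the same way as the hunk above */
static void hole_range(uint64_t oldsize, uint64_t newsize, uint64_t sectorsize,
		       uint64_t *hole_start, uint64_t *block_end)
{
	uint64_t mask = sectorsize - 1;

	*hole_start = (oldsize + mask) & ~mask;	/* first byte past the old tail sector */
	*block_end  = (newsize + mask) & ~mask;	/* end of the last sector of the new size */
}

int main(void)
{
	uint64_t start, end;

	/* grow a 5000-byte file to 20000 bytes on a 4K-sector filesystem */
	hole_range(5000, 20000, 4096, &start, &end);
	printf("hole extents cover [%llu, %llu)\n",
	       (unsigned long long)start, (unsigned long long)end);	/* [8192, 20480) */
	return 0;
}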
+ */ + if (newsize == 0) + BTRFS_I(inode)->ordered_data_close = 1; - ret = btrfs_update_inode(trans, root, inode); - BUG_ON(ret); - if (inode->i_nlink > 0) { - ret = btrfs_orphan_del(trans, inode); - BUG_ON(ret); - } - nr = trans->blocks_used; - btrfs_end_transaction(trans, root); - btrfs_btree_balance_dirty(root, nr); - return 0; + /* we don't support swapfiles, so vmtruncate shouldn't fail */ + truncate_setsize(inode, newsize); + ret = btrfs_truncate(inode); } - /* - * We're truncating a file that used to have good data down to - * zero. Make sure it gets into the ordered flush list so that - * any new writes get down to disk quickly. - */ - if (attr->ia_size == 0) - BTRFS_I(inode)->ordered_data_close = 1; - - /* we don't support swapfiles, so vmtruncate shouldn't fail */ - ret = vmtruncate(inode, attr->ia_size); - BUG_ON(ret); - - return 0; + return ret; } static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = dentry->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; int err; + if (btrfs_root_readonly(root)) + return -EROFS; + err = inode_change_ok(inode, attr); if (err) return err; if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { - err = btrfs_setattr_size(inode, attr); + err = btrfs_setsize(inode, attr->ia_size); if (err) return err; } @@ -3701,6 +3753,8 @@ void btrfs_evict_inode(struct inode *inode) unsigned long nr; int ret; + trace_btrfs_inode_evict(inode); + truncate_inode_pages(&inode->i_data, 0); if (inode->i_nlink && (btrfs_root_refs(&root->root_item) != 0 || root == root->fs_info->tree_root)) @@ -4043,7 +4097,6 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, BTRFS_I(inode)->root = root; memcpy(&BTRFS_I(inode)->location, location, sizeof(*location)); btrfs_read_locked_inode(inode); - inode_tree_add(inode); unlock_new_inode(inode); if (new) @@ -4115,11 +4168,13 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) } srcu_read_unlock(&root->fs_info->subvol_srcu, index); - if (root != sub_root) { + if (!IS_ERR(inode) && root != sub_root) { down_read(&root->fs_info->cleanup_work_sem); if (!(inode->i_sb->s_flags & MS_RDONLY)) - btrfs_orphan_cleanup(sub_root); + ret = btrfs_orphan_cleanup(sub_root); up_read(&root->fs_info->cleanup_work_sem); + if (ret) + inode = ERR_PTR(ret); } return inode; @@ -4167,10 +4222,8 @@ static int btrfs_real_readdir(struct file *filp, void *dirent, struct btrfs_key found_key; struct btrfs_path *path; int ret; - u32 nritems; struct extent_buffer *leaf; int slot; - int advance; unsigned char d_type; int over = 0; u32 di_cur; @@ -4213,27 +4266,19 @@ static int btrfs_real_readdir(struct file *filp, void *dirent, ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) goto err; - advance = 0; while (1) { leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); slot = path->slots[0]; - if (advance || slot >= nritems) { - if (slot >= nritems - 1) { - ret = btrfs_next_leaf(root, path); - if (ret) - break; - leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); - slot = path->slots[0]; - } else { - slot++; - path->slots[0]++; - } + if (slot >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto err; + else if (ret > 0) + break; + continue; } - advance = 1; item = btrfs_item_nr(leaf, slot); btrfs_item_key_to_cpu(leaf, &found_key, slot); @@ -4242,7 +4287,7 @@ static int btrfs_real_readdir(struct file *filp, void *dirent, if (btrfs_key_type(&found_key) != key_type) break; if (found_key.offset 
< filp->f_pos) - continue; + goto next; filp->f_pos = found_key.offset; @@ -4253,6 +4298,9 @@ static int btrfs_real_readdir(struct file *filp, void *dirent, while (di_cur < di_total) { struct btrfs_key location; + if (verify_dir_item(root, leaf, di)) + break; + name_len = btrfs_dir_name_len(leaf, di); if (name_len <= sizeof(tmp_name)) { name_ptr = tmp_name; @@ -4292,6 +4340,8 @@ skip: di_cur += di_len; di = (struct btrfs_dir_item *)((char *)di + di_len); } +next: + path->slots[0]++; } /* Reached end of directory/root. Bump pos past the last item. */ @@ -4328,6 +4378,8 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc) trans = btrfs_join_transaction_nolock(root, 1); else trans = btrfs_join_transaction(root, 1); + if (IS_ERR(trans)) + return PTR_ERR(trans); btrfs_set_trans_block_group(trans, inode); if (nolock) ret = btrfs_end_transaction_nolock(trans, root); @@ -4353,6 +4405,7 @@ void btrfs_dirty_inode(struct inode *inode) return; trans = btrfs_join_transaction(root, 1); + BUG_ON(IS_ERR(trans)); btrfs_set_trans_block_group(trans, inode); ret = btrfs_update_inode(trans, root, inode); @@ -4481,12 +4534,17 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, BUG_ON(!path); inode = new_inode(root->fs_info->sb); - if (!inode) + if (!inode) { + btrfs_free_path(path); return ERR_PTR(-ENOMEM); + } if (dir) { + trace_btrfs_inode_request(dir); + ret = btrfs_set_inode_index(dir, index); if (ret) { + btrfs_free_path(path); iput(inode); return ERR_PTR(ret); } @@ -4553,12 +4611,16 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, if ((mode & S_IFREG)) { if (btrfs_test_opt(root, NODATASUM)) BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; - if (btrfs_test_opt(root, NODATACOW)) + if (btrfs_test_opt(root, NODATACOW) || + (BTRFS_I(dir)->flags & BTRFS_INODE_NODATACOW)) BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; } insert_inode_hash(inode); inode_tree_add(inode); + + trace_btrfs_inode_new(inode); + return inode; fail: if (dir) @@ -4673,7 +4735,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, if (IS_ERR(inode)) goto out_unlock; - err = btrfs_init_inode_security(trans, inode, dir); + err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); if (err) { drop_inode = 1; goto out_unlock; @@ -4734,7 +4796,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, if (IS_ERR(inode)) goto out_unlock; - err = btrfs_init_inode_security(trans, inode, dir); + err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); if (err) { drop_inode = 1; goto out_unlock; @@ -4775,30 +4837,31 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, int err; int drop_inode = 0; - if (inode->i_nlink == 0) - return -ENOENT; - /* do not allow sys_link's with other subvols of the same device */ if (root->objectid != BTRFS_I(inode)->root->objectid) - return -EPERM; + return -EXDEV; - btrfs_inc_nlink(inode); - inode->i_ctime = CURRENT_TIME; + if (inode->i_nlink == ~0U) + return -EMLINK; err = btrfs_set_inode_index(dir, &index); if (err) goto fail; /* - * 1 item for inode ref + * 2 items for inode and inode ref * 2 items for dir items + * 1 item for parent inode */ - trans = btrfs_start_transaction(root, 3); + trans = btrfs_start_transaction(root, 5); if (IS_ERR(trans)) { err = PTR_ERR(trans); goto fail; } + btrfs_inc_nlink(inode); + inode->i_ctime = CURRENT_TIME; + btrfs_set_trans_block_group(trans, dir); ihold(inode); @@ -4862,7 +4925,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry 
*dentry, int mode) drop_on_err = 1; - err = btrfs_init_inode_security(trans, inode, dir); + err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); if (err) goto out_fail; @@ -4928,8 +4991,10 @@ static noinline int uncompress_inline(struct btrfs_path *path, size_t max_size; unsigned long inline_size; unsigned long ptr; + int compress_type; WARN_ON(pg_offset != 0); + compress_type = btrfs_file_extent_compression(leaf, item); max_size = btrfs_file_extent_ram_bytes(leaf, item); inline_size = btrfs_file_extent_inline_item_len(leaf, btrfs_item_nr(leaf, path->slots[0])); @@ -4939,8 +5004,8 @@ static noinline int uncompress_inline(struct btrfs_path *path, read_extent_buffer(leaf, tmp, ptr, inline_size); max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size); - ret = btrfs_zlib_decompress(tmp, page, extent_offset, - inline_size, max_size); + ret = btrfs_decompress(compress_type, tmp, page, + extent_offset, inline_size, max_size); if (ret) { char *kaddr = kmap_atomic(page, KM_USER0); unsigned long copy_size = min_t(u64, @@ -4982,7 +5047,7 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_trans_handle *trans = NULL; - int compressed; + int compress_type; again: read_lock(&em_tree->lock); @@ -5041,7 +5106,7 @@ again: found_type = btrfs_file_extent_type(leaf, item); extent_start = found_key.offset; - compressed = btrfs_file_extent_compression(leaf, item); + compress_type = btrfs_file_extent_compression(leaf, item); if (found_type == BTRFS_FILE_EXTENT_REG || found_type == BTRFS_FILE_EXTENT_PREALLOC) { extent_end = extent_start + @@ -5087,8 +5152,9 @@ again: em->block_start = EXTENT_MAP_HOLE; goto insert; } - if (compressed) { + if (compress_type != BTRFS_COMPRESS_NONE) { set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + em->compress_type = compress_type; em->block_start = bytenr; em->block_len = btrfs_file_extent_disk_num_bytes(leaf, item); @@ -5122,12 +5188,14 @@ again: em->len = (copy_size + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); em->orig_start = EXTENT_MAP_INLINE; - if (compressed) + if (compress_type) { set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + em->compress_type = compress_type; + } ptr = btrfs_file_extent_inline_start(item) + extent_offset; if (create == 0 && !PageUptodate(page)) { - if (btrfs_file_extent_compression(leaf, item) == - BTRFS_COMPRESS_ZLIB) { + if (btrfs_file_extent_compression(leaf, item) != + BTRFS_COMPRESS_NONE) { ret = uncompress_inline(path, inode, page, pg_offset, extent_offset, item); @@ -5152,6 +5220,8 @@ again: em = NULL; btrfs_release_path(root, path); trans = btrfs_join_transaction(root, 1); + if (IS_ERR(trans)) + return ERR_CAST(trans); goto again; } map = kmap(page); @@ -5161,7 +5231,7 @@ again: btrfs_mark_buffer_dirty(leaf); } set_extent_uptodate(io_tree, em->start, - extent_map_end(em) - 1, GFP_NOFS); + extent_map_end(em) - 1, NULL, GFP_NOFS); goto insert; } else { printk(KERN_ERR "btrfs unknown found_type %d\n", found_type); @@ -5228,6 +5298,9 @@ insert: } write_unlock(&em_tree->lock); out: + + trace_btrfs_get_extent(root, em); + if (path) btrfs_free_path(path); if (trans) { @@ -5242,22 +5315,157 @@ out: return em; } +struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page, + size_t pg_offset, u64 start, u64 len, + int create) +{ + struct extent_map *em; + struct extent_map *hole_em = NULL; + u64 range_start = start; + u64 end; + u64 found; + u64 
found_end; + int err = 0; + + em = btrfs_get_extent(inode, page, pg_offset, start, len, create); + if (IS_ERR(em)) + return em; + if (em) { + /* + * if our em maps to a hole, there might + * actually be delalloc bytes behind it + */ + if (em->block_start != EXTENT_MAP_HOLE) + return em; + else + hole_em = em; + } + + /* check to see if we've wrapped (len == -1 or similar) */ + end = start + len; + if (end < start) + end = (u64)-1; + else + end -= 1; + + em = NULL; + + /* ok, we didn't find anything, lets look for delalloc */ + found = count_range_bits(&BTRFS_I(inode)->io_tree, &range_start, + end, len, EXTENT_DELALLOC, 1); + found_end = range_start + found; + if (found_end < range_start) + found_end = (u64)-1; + + /* + * we didn't find anything useful, return + * the original results from get_extent() + */ + if (range_start > end || found_end <= start) { + em = hole_em; + hole_em = NULL; + goto out; + } + + /* adjust the range_start to make sure it doesn't + * go backwards from the start they passed in + */ + range_start = max(start,range_start); + found = found_end - range_start; + + if (found > 0) { + u64 hole_start = start; + u64 hole_len = len; + + em = alloc_extent_map(GFP_NOFS); + if (!em) { + err = -ENOMEM; + goto out; + } + /* + * when btrfs_get_extent can't find anything it + * returns one huge hole + * + * make sure what it found really fits our range, and + * adjust to make sure it is based on the start from + * the caller + */ + if (hole_em) { + u64 calc_end = extent_map_end(hole_em); + + if (calc_end <= start || (hole_em->start > end)) { + free_extent_map(hole_em); + hole_em = NULL; + } else { + hole_start = max(hole_em->start, start); + hole_len = calc_end - hole_start; + } + } + em->bdev = NULL; + if (hole_em && range_start > hole_start) { + /* our hole starts before our delalloc, so we + * have to return just the parts of the hole + * that go until the delalloc starts + */ + em->len = min(hole_len, + range_start - hole_start); + em->start = hole_start; + em->orig_start = hole_start; + /* + * don't adjust block start at all, + * it is fixed at EXTENT_MAP_HOLE + */ + em->block_start = hole_em->block_start; + em->block_len = hole_len; + } else { + em->start = range_start; + em->len = found; + em->orig_start = range_start; + em->block_start = EXTENT_MAP_DELALLOC; + em->block_len = found; + } + } else if (hole_em) { + return hole_em; + } +out: + + free_extent_map(hole_em); + if (err) { + free_extent_map(em); + return ERR_PTR(err); + } + return em; +} + static struct extent_map *btrfs_new_extent_direct(struct inode *inode, + struct extent_map *em, u64 start, u64 len) { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; - struct extent_map *em; struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; struct btrfs_key ins; u64 alloc_hint; int ret; + bool insert = false; - btrfs_drop_extent_cache(inode, start, start + len - 1, 0); + /* + * Ok if the extent map we looked up is a hole and is for the exact + * range we want, there is no reason to allocate a new one, however if + * it is not right then we need to free this one and drop the cache for + * our range. 
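The reuse rule described in that comment boils down to one predicate: keep the looked-up extent map only when it is a hole that covers exactly the requested range, otherwise free it and drop the cached range. Restated as a standalone sketch (illustrative only, not part of the patch; the struct is a simplified stand-in for struct extent_map):

#include <stdbool.h>
#include <stdint.h>

#define EXTENT_MAP_HOLE ((uint64_t)-3)	/* hole sentinel, as in extent_map.h */

struct em_stub {
	uint64_t start;
	uint64_t len;
	uint64_t block_start;
};

/* true: reuse the em we already have; false: free it, drop the cache, allocate anew */
static bool can_reuse_hole_em(const struct em_stub *em, uint64_t start, uint64_t len)
{
	return em->block_start == EXTENT_MAP_HOLE &&
	       em->start == start && em->len == len;
}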
+ */ + if (em->block_start != EXTENT_MAP_HOLE || em->start != start || + em->len != len) { + free_extent_map(em); + em = NULL; + insert = true; + btrfs_drop_extent_cache(inode, start, start + len - 1, 0); + } trans = btrfs_join_transaction(root, 0); - if (!trans) - return ERR_PTR(-ENOMEM); + if (IS_ERR(trans)) + return ERR_CAST(trans); trans->block_rsv = &root->fs_info->delalloc_block_rsv; @@ -5269,10 +5477,12 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode, goto out; } - em = alloc_extent_map(GFP_NOFS); if (!em) { - em = ERR_PTR(-ENOMEM); - goto out; + em = alloc_extent_map(GFP_NOFS); + if (!em) { + em = ERR_PTR(-ENOMEM); + goto out; + } } em->start = start; @@ -5282,9 +5492,15 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode, em->block_start = ins.objectid; em->block_len = ins.offset; em->bdev = root->fs_info->fs_devices->latest_bdev; + + /* + * We need to do this because if we're using the original em we searched + * for, we could have EXTENT_FLAG_VACANCY set, and we don't want that. + */ + em->flags = 0; set_bit(EXTENT_FLAG_PINNED, &em->flags); - while (1) { + while (insert) { write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em); write_unlock(&em_tree->lock); @@ -5481,7 +5697,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, * while we look for nocow cross refs */ trans = btrfs_join_transaction(root, 0); - if (!trans) + if (IS_ERR(trans)) goto must_cow; if (can_nocow_odirect(trans, inode, start, len) == 1) { @@ -5502,8 +5718,7 @@ must_cow: * it above */ len = bh_result->b_size; - free_extent_map(em); - em = btrfs_new_extent_direct(inode, start, len); + em = btrfs_new_extent_direct(inode, em, start, len); if (IS_ERR(em)) return PTR_ERR(em); len = min(len, em->len - (start - em->start)); @@ -5589,6 +5804,10 @@ static void btrfs_endio_direct_read(struct bio *bio, int err) kfree(dip->csums); kfree(dip); + + /* If we had a csum failure make sure to clear the uptodate flag */ + if (err) + clear_bit(BIO_UPTODATE, &bio->bi_flags); dio_end_io(bio, err); } @@ -5616,7 +5835,7 @@ again: BUG_ON(!ordered); trans = btrfs_join_transaction(root, 1); - if (!trans) { + if (IS_ERR(trans)) { err = -ENOMEM; goto out; } @@ -5662,8 +5881,10 @@ again: } add_pending_csums(trans, inode, ordered->file_offset, &ordered->list); - btrfs_ordered_update_i_size(inode, 0, ordered); - btrfs_update_inode(trans, root, inode); + ret = btrfs_ordered_update_i_size(inode, 0, ordered); + if (!ret) + btrfs_update_inode(trans, root, inode); + ret = 0; out_unlock: unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset, ordered->file_offset + ordered->len - 1, @@ -5690,6 +5911,10 @@ out_done: kfree(dip->csums); kfree(dip); + + /* If we had an error make sure to clear the uptodate flag */ + if (err) + clear_bit(BIO_UPTODATE, &bio->bi_flags); dio_end_io(bio, err); } @@ -5745,7 +5970,7 @@ static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev, static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, int rw, u64 file_offset, int skip_sum, - u32 *csums) + u32 *csums, int async_submit) { int write = rw & REQ_WRITE; struct btrfs_root *root = BTRFS_I(inode)->root; @@ -5756,18 +5981,33 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, if (ret) goto err; - if (write && !skip_sum) { + if (skip_sum) + goto map; + + if (write && async_submit) { ret = btrfs_wq_submit_bio(root->fs_info, inode, rw, bio, 0, 0, file_offset, __btrfs_submit_bio_start_direct_io, __btrfs_submit_bio_done); 
goto err; - } else if (!skip_sum) - btrfs_lookup_bio_sums_dio(root, inode, bio, + } else if (write) { + /* + * If we aren't doing async submit, calculate the csum of the + * bio now. + */ + ret = btrfs_csum_one_bio(root, inode, bio, file_offset, 1); + if (ret) + goto err; + } else if (!skip_sum) { + ret = btrfs_lookup_bio_sums_dio(root, inode, bio, file_offset, csums); + if (ret) + goto err; + } - ret = btrfs_map_bio(root, rw, bio, 0, 1); +map: + ret = btrfs_map_bio(root, rw, bio, 0, async_submit); err: bio_put(bio); return ret; @@ -5789,13 +6029,8 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, int nr_pages = 0; u32 *csums = dip->csums; int ret = 0; - - bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS); - if (!bio) - return -ENOMEM; - bio->bi_private = dip; - bio->bi_end_io = btrfs_end_dio_bio; - atomic_inc(&dip->pending_bios); + int async_submit = 0; + int write = rw & REQ_WRITE; map_length = orig_bio->bi_size; ret = btrfs_map_block(map_tree, READ, start_sector << 9, @@ -5805,6 +6040,19 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, return -EIO; } + if (map_length >= orig_bio->bi_size) { + bio = orig_bio; + goto submit; + } + + async_submit = 1; + bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS); + if (!bio) + return -ENOMEM; + bio->bi_private = dip; + bio->bi_end_io = btrfs_end_dio_bio; + atomic_inc(&dip->pending_bios); + while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) { if (unlikely(map_length < submit_len + bvec->bv_len || bio_add_page(bio, bvec->bv_page, bvec->bv_len, @@ -5818,14 +6066,15 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, atomic_inc(&dip->pending_bios); ret = __btrfs_submit_dio_bio(bio, inode, rw, file_offset, skip_sum, - csums); + csums, async_submit); if (ret) { bio_put(bio); atomic_dec(&dip->pending_bios); goto out_err; } - if (!skip_sum) + /* Write's use the ordered csums */ + if (!write && !skip_sum) csums = csums + nr_pages; start_sector += submit_len >> 9; file_offset += submit_len; @@ -5854,8 +6103,9 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, } } +submit: ret = __btrfs_submit_dio_bio(bio, inode, rw, file_offset, skip_sum, - csums); + csums, async_submit); if (!ret) return 0; @@ -5893,9 +6143,11 @@ static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode, } dip->csums = NULL; - if (!skip_sum) { + /* Write's use the ordered csum stuff, so we don't need dip->csums */ + if (!write && !skip_sum) { dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS); if (!dip->csums) { + kfree(dip); ret = -ENOMEM; goto free_ordered; } @@ -5948,6 +6200,7 @@ static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *io unsigned long nr_segs) { int seg; + int i; size_t size; unsigned long addr; unsigned blocksize_mask = root->sectorsize - 1; @@ -5962,8 +6215,22 @@ static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *io addr = (unsigned long)iov[seg].iov_base; size = iov[seg].iov_len; end += size; - if ((addr & blocksize_mask) || (size & blocksize_mask)) + if ((addr & blocksize_mask) || (size & blocksize_mask)) goto out; + + /* If this is a write we don't need to check anymore */ + if (rw & WRITE) + continue; + + /* + * Check to make sure we don't have duplicate iov_base's in this + * iovec, if so return EINVAL, otherwise we'll get csum errors + * when reading back. 
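From userspace, the checks above mean a direct-I/O read on btrfs needs sector-aligned segments that do not alias each other. A minimal usage sketch (illustrative only, not part of the patch; the mount point, file name and 4K sector size are assumptions):

#define _GNU_SOURCE		/* for O_DIRECT */
#include <fcntl.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/uio.h>
#include <unistd.h>

int main(void)
{
	const size_t sector = 4096;	/* assumed sector size */
	struct iovec iov[2];
	void *a, *b;
	int fd;

	if (posix_memalign(&a, sector, sector) || posix_memalign(&b, sector, sector))
		return 1;

	iov[0].iov_base = a;	iov[0].iov_len = sector;
	iov[1].iov_base = b;	iov[1].iov_len = sector;	/* distinct buffers */

	fd = open("/mnt/btrfs/file", O_RDONLY | O_DIRECT);
	if (fd < 0)
		return 1;

	/* segments that are misaligned, or that share an iov_base, do not
	 * qualify for the direct-I/O path per the check above */
	ssize_t n = readv(fd, iov, 2);

	close(fd);
	free(a);
	free(b);
	return n < 0;
}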
+ */ + for (i = seg + 1; i < nr_segs; i++) { + if (iov[seg].iov_base == iov[i].iov_base) + goto out; + } } retval = 0; out: @@ -6064,7 +6331,7 @@ out: static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len) { - return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent); + return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent_fiemap); } int btrfs_readpage(struct file *file, struct page *page) @@ -6314,28 +6581,42 @@ out: return ret; } -static void btrfs_truncate(struct inode *inode) +static int btrfs_truncate(struct inode *inode) { struct btrfs_root *root = BTRFS_I(inode)->root; int ret; + int err = 0; struct btrfs_trans_handle *trans; unsigned long nr; u64 mask = root->sectorsize - 1; - if (!S_ISREG(inode->i_mode)) { - WARN_ON(1); - return; - } - ret = btrfs_truncate_page(inode->i_mapping, inode->i_size); if (ret) - return; + return ret; btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); btrfs_ordered_update_i_size(inode, inode->i_size, NULL); + trans = btrfs_start_transaction(root, 5); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + btrfs_set_trans_block_group(trans, inode); + + ret = btrfs_orphan_add(trans, inode); + if (ret) { + btrfs_end_transaction(trans, root); + return ret; + } + + nr = trans->blocks_used; + btrfs_end_transaction(trans, root); + btrfs_btree_balance_dirty(root, nr); + + /* Now start a transaction for the truncate */ trans = btrfs_start_transaction(root, 0); - BUG_ON(IS_ERR(trans)); + if (IS_ERR(trans)) + return PTR_ERR(trans); btrfs_set_trans_block_group(trans, inode); trans->block_rsv = root->orphan_block_rsv; @@ -6362,29 +6643,38 @@ static void btrfs_truncate(struct inode *inode) while (1) { if (!trans) { trans = btrfs_start_transaction(root, 0); - BUG_ON(IS_ERR(trans)); + if (IS_ERR(trans)) + return PTR_ERR(trans); btrfs_set_trans_block_group(trans, inode); trans->block_rsv = root->orphan_block_rsv; } ret = btrfs_block_rsv_check(trans, root, root->orphan_block_rsv, 0, 5); - if (ret) { - BUG_ON(ret != -EAGAIN); + if (ret == -EAGAIN) { ret = btrfs_commit_transaction(trans, root); - BUG_ON(ret); + if (ret) + return ret; trans = NULL; continue; + } else if (ret) { + err = ret; + break; } ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size, BTRFS_EXTENT_DATA_KEY); - if (ret != -EAGAIN) + if (ret != -EAGAIN) { + err = ret; break; + } ret = btrfs_update_inode(trans, root, inode); - BUG_ON(ret); + if (ret) { + err = ret; + break; + } nr = trans->blocks_used; btrfs_end_transaction(trans, root); @@ -6394,16 +6684,27 @@ static void btrfs_truncate(struct inode *inode) if (ret == 0 && inode->i_nlink > 0) { ret = btrfs_orphan_del(trans, inode); - BUG_ON(ret); + if (ret) + err = ret; + } else if (ret && inode->i_nlink > 0) { + /* + * Failed to do the truncate, remove us from the in memory + * orphan list. 
+ */ + ret = btrfs_orphan_del(NULL, inode); } ret = btrfs_update_inode(trans, root, inode); - BUG_ON(ret); + if (ret && !err) + err = ret; nr = trans->blocks_used; ret = btrfs_end_transaction_throttle(trans, root); - BUG_ON(ret); + if (ret && !err) + err = ret; btrfs_btree_balance_dirty(root, nr); + + return err; } /* @@ -6470,14 +6771,13 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->index_cnt = (u64)-1; ei->last_unlink_trans = 0; - spin_lock_init(&ei->accounting_lock); atomic_set(&ei->outstanding_extents, 0); - ei->reserved_extents = 0; + atomic_set(&ei->reserved_extents, 0); ei->ordered_data_close = 0; ei->orphan_meta_reserved = 0; ei->dummy_inode = 0; - ei->force_compress = 0; + ei->force_compress = BTRFS_COMPRESS_NONE; inode = &ei->vfs_inode; extent_map_tree_init(&ei->extent_tree, GFP_NOFS); @@ -6508,7 +6808,7 @@ void btrfs_destroy_inode(struct inode *inode) WARN_ON(!list_empty(&inode->i_dentry)); WARN_ON(inode->i_data.nrpages); WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents)); - WARN_ON(BTRFS_I(inode)->reserved_extents); + WARN_ON(atomic_read(&BTRFS_I(inode)->reserved_extents)); /* * This can happen where we create an inode, but somebody else also @@ -6600,6 +6900,8 @@ void btrfs_destroy_cachep(void) kmem_cache_destroy(btrfs_transaction_cachep); if (btrfs_path_cachep) kmem_cache_destroy(btrfs_path_cachep); + if (btrfs_free_space_cachep) + kmem_cache_destroy(btrfs_free_space_cachep); } int btrfs_init_cachep(void) @@ -6628,6 +6930,12 @@ int btrfs_init_cachep(void) if (!btrfs_path_cachep) goto fail; + btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space_cache", + sizeof(struct btrfs_free_space), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + if (!btrfs_free_space_cachep) + goto fail; + return 0; fail: btrfs_destroy_cachep(); @@ -6646,6 +6954,26 @@ static int btrfs_getattr(struct vfsmount *mnt, return 0; } +/* + * If a file is moved, it will inherit the cow and compression flags of the new + * directory. + */ +static void fixup_inode_flags(struct inode *dir, struct inode *inode) +{ + struct btrfs_inode *b_dir = BTRFS_I(dir); + struct btrfs_inode *b_inode = BTRFS_I(inode); + + if (b_dir->flags & BTRFS_INODE_NODATACOW) + b_inode->flags |= BTRFS_INODE_NODATACOW; + else + b_inode->flags &= ~BTRFS_INODE_NODATACOW; + + if (b_dir->flags & BTRFS_INODE_COMPRESS) + b_inode->flags |= BTRFS_INODE_COMPRESS; + else + b_inode->flags &= ~BTRFS_INODE_COMPRESS; +} + static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { @@ -6694,8 +7022,10 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, * should cover the worst case number of items we'll modify. 
*/ trans = btrfs_start_transaction(root, 20); - if (IS_ERR(trans)) - return PTR_ERR(trans); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out_notrans; + } btrfs_set_trans_block_group(trans, new_dir); @@ -6748,11 +7078,12 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, old_dentry->d_name.name, old_dentry->d_name.len); } else { - btrfs_inc_nlink(old_dentry->d_inode); - ret = btrfs_unlink_inode(trans, root, old_dir, - old_dentry->d_inode, - old_dentry->d_name.name, - old_dentry->d_name.len); + ret = __btrfs_unlink_inode(trans, root, old_dir, + old_dentry->d_inode, + old_dentry->d_name.name, + old_dentry->d_name.len); + if (!ret) + ret = btrfs_update_inode(trans, root, old_inode); } BUG_ON(ret); @@ -6779,6 +7110,8 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, } } + fixup_inode_flags(new_dir, old_inode); + ret = btrfs_add_link(trans, new_dir, old_inode, new_dentry->d_name.name, new_dentry->d_name.len, 0, index); @@ -6792,7 +7125,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, } out_fail: btrfs_end_transaction_throttle(trans, root); - +out_notrans: if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) up_read(&root->fs_info->subvol_sem); @@ -6944,7 +7277,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, if (IS_ERR(inode)) goto out_unlock; - err = btrfs_init_inode_security(trans, inode, dir); + err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); if (err) { drop_inode = 1; goto out_unlock; @@ -7098,116 +7431,6 @@ int btrfs_prealloc_file_range_trans(struct inode *inode, min_size, actual_len, alloc_hint, trans); } -static long btrfs_fallocate(struct inode *inode, int mode, - loff_t offset, loff_t len) -{ - struct extent_state *cached_state = NULL; - u64 cur_offset; - u64 last_byte; - u64 alloc_start; - u64 alloc_end; - u64 alloc_hint = 0; - u64 locked_end; - u64 mask = BTRFS_I(inode)->root->sectorsize - 1; - struct extent_map *em; - int ret; - - alloc_start = offset & ~mask; - alloc_end = (offset + len + mask) & ~mask; - - /* We only support the FALLOC_FL_KEEP_SIZE mode */ - if (mode && (mode != FALLOC_FL_KEEP_SIZE)) - return -EOPNOTSUPP; - - /* - * wait for ordered IO before we have any locks. We'll loop again - * below with the locks held. 
- */ - btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start); - - mutex_lock(&inode->i_mutex); - ret = inode_newsize_ok(inode, alloc_end); - if (ret) - goto out; - - if (alloc_start > inode->i_size) { - ret = btrfs_cont_expand(inode, alloc_start); - if (ret) - goto out; - } - - ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start); - if (ret) - goto out; - - locked_end = alloc_end - 1; - while (1) { - struct btrfs_ordered_extent *ordered; - - /* the extent lock is ordered inside the running - * transaction - */ - lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start, - locked_end, 0, &cached_state, GFP_NOFS); - ordered = btrfs_lookup_first_ordered_extent(inode, - alloc_end - 1); - if (ordered && - ordered->file_offset + ordered->len > alloc_start && - ordered->file_offset < alloc_end) { - btrfs_put_ordered_extent(ordered); - unlock_extent_cached(&BTRFS_I(inode)->io_tree, - alloc_start, locked_end, - &cached_state, GFP_NOFS); - /* - * we can't wait on the range with the transaction - * running or with the extent lock held - */ - btrfs_wait_ordered_range(inode, alloc_start, - alloc_end - alloc_start); - } else { - if (ordered) - btrfs_put_ordered_extent(ordered); - break; - } - } - - cur_offset = alloc_start; - while (1) { - em = btrfs_get_extent(inode, NULL, 0, cur_offset, - alloc_end - cur_offset, 0); - BUG_ON(IS_ERR(em) || !em); - last_byte = min(extent_map_end(em), alloc_end); - last_byte = (last_byte + mask) & ~mask; - if (em->block_start == EXTENT_MAP_HOLE || - (cur_offset >= inode->i_size && - !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { - ret = btrfs_prealloc_file_range(inode, mode, cur_offset, - last_byte - cur_offset, - 1 << inode->i_blkbits, - offset + len, - &alloc_hint); - if (ret < 0) { - free_extent_map(em); - break; - } - } - free_extent_map(em); - - cur_offset = last_byte; - if (cur_offset >= alloc_end) { - ret = 0; - break; - } - } - unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, - &cached_state, GFP_NOFS); - - btrfs_free_reserved_data_space(inode, alloc_end - alloc_start); -out: - mutex_unlock(&inode->i_mutex); - return ret; -} - static int btrfs_set_page_dirty(struct page *page) { return __set_page_dirty_nobuffers(page); @@ -7215,6 +7438,10 @@ static int btrfs_set_page_dirty(struct page *page) static int btrfs_permission(struct inode *inode, int mask, unsigned int flags) { + struct btrfs_root *root = BTRFS_I(inode)->root; + + if (btrfs_root_readonly(root) && (mask & MAY_WRITE)) + return -EROFS; if ((BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) && (mask & MAY_WRITE)) return -EACCES; return generic_permission(inode, mask, flags, btrfs_check_acl); @@ -7286,7 +7513,6 @@ static const struct address_space_operations btrfs_aops = { .writepage = btrfs_writepage, .writepages = btrfs_writepages, .readpages = btrfs_readpages, - .sync_page = block_sync_page, .direct_IO = btrfs_direct_IO, .invalidatepage = btrfs_invalidatepage, .releasepage = btrfs_releasepage, @@ -7302,7 +7528,6 @@ static const struct address_space_operations btrfs_symlink_aops = { }; static const struct inode_operations btrfs_file_inode_operations = { - .truncate = btrfs_truncate, .getattr = btrfs_getattr, .setattr = btrfs_setattr, .setxattr = btrfs_setxattr, @@ -7310,7 +7535,6 @@ static const struct inode_operations btrfs_file_inode_operations = { .listxattr = btrfs_listxattr, .removexattr = btrfs_removexattr, .permission = btrfs_permission, - .fallocate = btrfs_fallocate, .fiemap = btrfs_fiemap, }; static const struct inode_operations 
btrfs_special_inode_operations = { diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index f87552a1d7ea..ffb48d6c5433 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -40,6 +40,7 @@ #include <linux/xattr.h> #include <linux/vmalloc.h> #include <linux/slab.h> +#include <linux/blkdev.h> #include "compat.h" #include "ctree.h" #include "disk-io.h" @@ -138,6 +139,24 @@ static int btrfs_ioctl_getflags(struct file *file, void __user *arg) return 0; } +static int check_flags(unsigned int flags) +{ + if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \ + FS_NOATIME_FL | FS_NODUMP_FL | \ + FS_SYNC_FL | FS_DIRSYNC_FL | \ + FS_NOCOMP_FL | FS_COMPR_FL | \ + FS_NOCOW_FL | FS_COW_FL)) + return -EOPNOTSUPP; + + if ((flags & FS_NOCOMP_FL) && (flags & FS_COMPR_FL)) + return -EINVAL; + + if ((flags & FS_NOCOW_FL) && (flags & FS_COW_FL)) + return -EINVAL; + + return 0; +} + static int btrfs_ioctl_setflags(struct file *file, void __user *arg) { struct inode *inode = file->f_path.dentry->d_inode; @@ -147,15 +166,17 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) unsigned int flags, oldflags; int ret; + if (btrfs_root_readonly(root)) + return -EROFS; + if (copy_from_user(&flags, arg, sizeof(flags))) return -EFAULT; - if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \ - FS_NOATIME_FL | FS_NODUMP_FL | \ - FS_SYNC_FL | FS_DIRSYNC_FL)) - return -EOPNOTSUPP; + ret = check_flags(flags); + if (ret) + return ret; - if (!is_owner_or_cap(inode)) + if (!inode_owner_or_capable(inode)) return -EACCES; mutex_lock(&inode->i_mutex); @@ -198,9 +219,25 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) else ip->flags &= ~BTRFS_INODE_DIRSYNC; + /* + * The COMPRESS flag can only be changed by users, while the NOCOMPRESS + * flag may be changed automatically if compression code won't make + * things smaller. 
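With check_flags() above now accepting the compression and COW flags, per-file compression can be toggled through the standard flags ioctls. A minimal userspace sketch, roughly what `chattr +c` does under the hood (illustrative only, not part of the patch; the file path is made up):

#include <fcntl.h>
#include <linux/fs.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/mnt/btrfs/data.log", O_RDONLY);
	int flags;

	if (fd < 0)
		return 1;
	if (ioctl(fd, FS_IOC_GETFLAGS, &flags) < 0)
		return 1;

	flags |= FS_COMPR_FL;		/* ask for compression on future writes */
	flags &= ~FS_NOCOMP_FL;		/* the two are mutually exclusive, per check_flags() */

	if (ioctl(fd, FS_IOC_SETFLAGS, &flags) < 0)
		perror("FS_IOC_SETFLAGS");

	close(fd);
	return 0;
}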
+ */ + if (flags & FS_NOCOMP_FL) { + ip->flags &= ~BTRFS_INODE_COMPRESS; + ip->flags |= BTRFS_INODE_NOCOMPRESS; + } else if (flags & FS_COMPR_FL) { + ip->flags |= BTRFS_INODE_COMPRESS; + ip->flags &= ~BTRFS_INODE_NOCOMPRESS; + } + if (flags & FS_NOCOW_FL) + ip->flags |= BTRFS_INODE_NODATACOW; + else if (flags & FS_COW_FL) + ip->flags &= ~BTRFS_INODE_NODATACOW; trans = btrfs_join_transaction(root, 1); - BUG_ON(!trans); + BUG_ON(IS_ERR(trans)); ret = btrfs_update_inode(trans, root, inode); BUG_ON(ret); @@ -210,9 +247,11 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) btrfs_end_transaction(trans, root); mnt_drop_write(file->f_path.mnt); + + ret = 0; out_unlock: mutex_unlock(&inode->i_mutex); - return 0; + return ret; } static int btrfs_ioctl_getversion(struct file *file, int __user *arg) @@ -222,6 +261,49 @@ static int btrfs_ioctl_getversion(struct file *file, int __user *arg) return put_user(inode->i_generation, arg); } +static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg) +{ + struct btrfs_root *root = fdentry(file)->d_sb->s_fs_info; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_device *device; + struct request_queue *q; + struct fstrim_range range; + u64 minlen = ULLONG_MAX; + u64 num_devices = 0; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + mutex_lock(&fs_info->fs_devices->device_list_mutex); + list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) { + if (!device->bdev) + continue; + q = bdev_get_queue(device->bdev); + if (blk_queue_discard(q)) { + num_devices++; + minlen = min((u64)q->limits.discard_granularity, + minlen); + } + } + mutex_unlock(&fs_info->fs_devices->device_list_mutex); + if (!num_devices) + return -EOPNOTSUPP; + + if (copy_from_user(&range, arg, sizeof(range))) + return -EFAULT; + + range.minlen = max(range.minlen, minlen); + ret = btrfs_trim_fs(root, &range); + if (ret < 0) + return ret; + + if (copy_to_user(arg, &range, sizeof(range))) + return -EFAULT; + + return 0; +} + static noinline int create_subvol(struct btrfs_root *root, struct dentry *dentry, char *name, int namelen, @@ -291,6 +373,10 @@ static noinline int create_subvol(struct btrfs_root *root, inode_item->nbytes = cpu_to_le64(root->leafsize); inode_item->mode = cpu_to_le32(S_IFDIR | 0755); + root_item.flags = 0; + root_item.byte_limit = 0; + inode_item->flags = cpu_to_le64(BTRFS_INODE_ROOT_ITEM_INIT); + btrfs_set_root_bytenr(&root_item, leaf->start); btrfs_set_root_generation(&root_item, trans->transid); btrfs_set_root_level(&root_item, 0); @@ -360,7 +446,8 @@ fail: } static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, - char *name, int namelen, u64 *async_transid) + char *name, int namelen, u64 *async_transid, + bool readonly) { struct inode *inode; struct dentry *parent; @@ -378,6 +465,7 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, btrfs_init_block_rsv(&pending_snapshot->block_rsv); pending_snapshot->dentry = dentry; pending_snapshot->root = root; + pending_snapshot->readonly = readonly; trans = btrfs_start_transaction(root->fs_info->extent_root, 5); if (IS_ERR(trans)) { @@ -404,7 +492,9 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, if (ret) goto fail; - btrfs_orphan_cleanup(pending_snapshot->snap); + ret = btrfs_orphan_cleanup(pending_snapshot->snap); + if (ret) + goto fail; parent = dget_parent(dentry); inode = btrfs_lookup_dentry(parent->d_inode, dentry); @@ -509,7 +599,7 @@ static inline int btrfs_may_create(struct 
inode *dir, struct dentry *child) static noinline int btrfs_mksubvol(struct path *parent, char *name, int namelen, struct btrfs_root *snap_src, - u64 *async_transid) + u64 *async_transid, bool readonly) { struct inode *dir = parent->dentry->d_inode; struct dentry *dentry; @@ -541,7 +631,7 @@ static noinline int btrfs_mksubvol(struct path *parent, if (snap_src) { error = create_snapshot(snap_src, dentry, - name, namelen, async_transid); + name, namelen, async_transid, readonly); } else { error = create_subvol(BTRFS_I(dir)->root, dentry, name, namelen, async_transid); @@ -638,9 +728,11 @@ static int btrfs_defrag_file(struct file *file, struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_ordered_extent *ordered; struct page *page; + struct btrfs_super_block *disk_super; unsigned long last_index; unsigned long ra_pages = root->fs_info->bdi.ra_pages; unsigned long total_read = 0; + u64 features; u64 page_start; u64 page_end; u64 last_len = 0; @@ -648,6 +740,14 @@ static int btrfs_defrag_file(struct file *file, u64 defrag_end = 0; unsigned long i; int ret; + int compress_type = BTRFS_COMPRESS_ZLIB; + + if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) { + if (range->compress_type > BTRFS_COMPRESS_TYPES) + return -EINVAL; + if (range->compress_type) + compress_type = range->compress_type; + } if (inode->i_size == 0) return 0; @@ -683,7 +783,7 @@ static int btrfs_defrag_file(struct file *file, total_read++; mutex_lock(&inode->i_mutex); if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) - BTRFS_I(inode)->force_compress = 1; + BTRFS_I(inode)->force_compress = compress_type; ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); if (ret) @@ -781,10 +881,17 @@ loop_unlock: atomic_dec(&root->fs_info->async_submit_draining); mutex_lock(&inode->i_mutex); - BTRFS_I(inode)->force_compress = 0; + BTRFS_I(inode)->force_compress = BTRFS_COMPRESS_NONE; mutex_unlock(&inode->i_mutex); } + disk_super = &root->fs_info->super_copy; + features = btrfs_super_incompat_flags(disk_super); + if (range->compress_type == BTRFS_COMPRESS_LZO) { + features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO; + btrfs_set_super_incompat_flags(disk_super, features); + } + return 0; err_reservations: @@ -885,6 +992,10 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, if (new_size > old_size) { trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out_unlock; + } ret = btrfs_grow_device(trans, device, new_size); btrfs_commit_transaction(trans, root); } else { @@ -901,7 +1012,8 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file, char *name, unsigned long fd, int subvol, - u64 *transid) + u64 *transid, + bool readonly) { struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; struct file *src_file; @@ -919,7 +1031,7 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file, if (subvol) { ret = btrfs_mksubvol(&file->f_path, name, namelen, - NULL, transid); + NULL, transid, readonly); } else { struct inode *src_inode; src_file = fget(fd); @@ -938,7 +1050,7 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file, } ret = btrfs_mksubvol(&file->f_path, name, namelen, BTRFS_I(src_inode)->root, - transid); + transid, readonly); fput(src_file); } out: @@ -946,61 +1058,145 @@ out: } static noinline int btrfs_ioctl_snap_create(struct file *file, - void __user *arg, int subvol, - int v2) + void __user *arg, int subvol) { - struct btrfs_ioctl_vol_args *vol_args = NULL; - struct btrfs_ioctl_vol_args_v2 *vol_args_v2 = 
NULL; - char *name; - u64 fd; + struct btrfs_ioctl_vol_args *vol_args; int ret; - if (v2) { - u64 transid = 0; - u64 *ptr = NULL; + vol_args = memdup_user(arg, sizeof(*vol_args)); + if (IS_ERR(vol_args)) + return PTR_ERR(vol_args); + vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; - vol_args_v2 = memdup_user(arg, sizeof(*vol_args_v2)); - if (IS_ERR(vol_args_v2)) - return PTR_ERR(vol_args_v2); + ret = btrfs_ioctl_snap_create_transid(file, vol_args->name, + vol_args->fd, subvol, + NULL, false); - if (vol_args_v2->flags & ~BTRFS_SUBVOL_CREATE_ASYNC) { - ret = -EINVAL; - goto out; - } - - name = vol_args_v2->name; - fd = vol_args_v2->fd; - vol_args_v2->name[BTRFS_SUBVOL_NAME_MAX] = '\0'; + kfree(vol_args); + return ret; +} - if (vol_args_v2->flags & BTRFS_SUBVOL_CREATE_ASYNC) - ptr = &transid; +static noinline int btrfs_ioctl_snap_create_v2(struct file *file, + void __user *arg, int subvol) +{ + struct btrfs_ioctl_vol_args_v2 *vol_args; + int ret; + u64 transid = 0; + u64 *ptr = NULL; + bool readonly = false; - ret = btrfs_ioctl_snap_create_transid(file, name, fd, - subvol, ptr); + vol_args = memdup_user(arg, sizeof(*vol_args)); + if (IS_ERR(vol_args)) + return PTR_ERR(vol_args); + vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0'; - if (ret == 0 && ptr && - copy_to_user(arg + - offsetof(struct btrfs_ioctl_vol_args_v2, - transid), ptr, sizeof(*ptr))) - ret = -EFAULT; - } else { - vol_args = memdup_user(arg, sizeof(*vol_args)); - if (IS_ERR(vol_args)) - return PTR_ERR(vol_args); - name = vol_args->name; - fd = vol_args->fd; - vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; - - ret = btrfs_ioctl_snap_create_transid(file, name, fd, - subvol, NULL); + if (vol_args->flags & + ~(BTRFS_SUBVOL_CREATE_ASYNC | BTRFS_SUBVOL_RDONLY)) { + ret = -EOPNOTSUPP; + goto out; } + + if (vol_args->flags & BTRFS_SUBVOL_CREATE_ASYNC) + ptr = &transid; + if (vol_args->flags & BTRFS_SUBVOL_RDONLY) + readonly = true; + + ret = btrfs_ioctl_snap_create_transid(file, vol_args->name, + vol_args->fd, subvol, + ptr, readonly); + + if (ret == 0 && ptr && + copy_to_user(arg + + offsetof(struct btrfs_ioctl_vol_args_v2, + transid), ptr, sizeof(*ptr))) + ret = -EFAULT; out: kfree(vol_args); - kfree(vol_args_v2); + return ret; +} + +static noinline int btrfs_ioctl_subvol_getflags(struct file *file, + void __user *arg) +{ + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret = 0; + u64 flags = 0; + + if (inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) + return -EINVAL; + + down_read(&root->fs_info->subvol_sem); + if (btrfs_root_readonly(root)) + flags |= BTRFS_SUBVOL_RDONLY; + up_read(&root->fs_info->subvol_sem); + + if (copy_to_user(arg, &flags, sizeof(flags))) + ret = -EFAULT; return ret; } +static noinline int btrfs_ioctl_subvol_setflags(struct file *file, + void __user *arg) +{ + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + u64 root_flags; + u64 flags; + int ret = 0; + + if (root->fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + + if (inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) + return -EINVAL; + + if (copy_from_user(&flags, arg, sizeof(flags))) + return -EFAULT; + + if (flags & BTRFS_SUBVOL_CREATE_ASYNC) + return -EINVAL; + + if (flags & ~BTRFS_SUBVOL_RDONLY) + return -EOPNOTSUPP; + + if (!inode_owner_or_capable(inode)) + return -EACCES; + + down_write(&root->fs_info->subvol_sem); + + /* nothing to do */ + if (!!(flags & BTRFS_SUBVOL_RDONLY) == btrfs_root_readonly(root)) + goto out; + + root_flags = 
btrfs_root_flags(&root->root_item); + if (flags & BTRFS_SUBVOL_RDONLY) + btrfs_set_root_flags(&root->root_item, + root_flags | BTRFS_ROOT_SUBVOL_RDONLY); + else + btrfs_set_root_flags(&root->root_item, + root_flags & ~BTRFS_ROOT_SUBVOL_RDONLY); + + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out_reset; + } + + ret = btrfs_update_root(trans, root->fs_info->tree_root, + &root->root_key, &root->root_item); + + btrfs_commit_transaction(trans, root); +out_reset: + if (ret) + btrfs_set_root_flags(&root->root_item, root_flags); +out: + up_write(&root->fs_info->subvol_sem); + return ret; +} + /* * helper to check if the subvolume references other subvolumes */ @@ -1509,6 +1705,9 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) struct btrfs_ioctl_defrag_range_args *range; int ret; + if (btrfs_root_readonly(root)) + return -EROFS; + ret = mnt_want_write(file->f_path.mnt); if (ret) return ret; @@ -1637,6 +1836,9 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND)) return -EINVAL; + if (btrfs_root_readonly(root)) + return -EROFS; + ret = mnt_want_write(file->f_path.mnt); if (ret) return ret; @@ -1788,7 +1990,10 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, memcpy(&new_key, &key, sizeof(new_key)); new_key.objectid = inode->i_ino; - new_key.offset = key.offset + destoff - off; + if (off <= key.offset) + new_key.offset = key.offset + destoff - off; + else + new_key.offset = destoff; trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { @@ -1958,6 +2163,10 @@ static long btrfs_ioctl_trans_start(struct file *file) if (file->private_data) goto out; + ret = -EROFS; + if (btrfs_root_readonly(root)) + goto out; + ret = mnt_want_write(file->f_path.mnt); if (ret) goto out; @@ -1968,7 +2177,7 @@ static long btrfs_ioctl_trans_start(struct file *file) ret = -ENOMEM; trans = btrfs_start_ioctl_transaction(root, 0); - if (!trans) + if (IS_ERR(trans)) goto out_drop; file->private_data = trans; @@ -2024,9 +2233,9 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) path->leave_spinning = 1; trans = btrfs_start_transaction(root, 1); - if (!trans) { + if (IS_ERR(trans)) { btrfs_free_path(path); - return -ENOMEM; + return PTR_ERR(trans); } dir_id = btrfs_super_root_dir(&root->fs_info->super_copy); @@ -2078,7 +2287,7 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg) struct btrfs_ioctl_space_info space; struct btrfs_ioctl_space_info *dest; struct btrfs_ioctl_space_info *dest_orig; - struct btrfs_ioctl_space_info *user_dest; + struct btrfs_ioctl_space_info __user *user_dest; struct btrfs_space_info *info; u64 types[] = {BTRFS_BLOCK_GROUP_DATA, BTRFS_BLOCK_GROUP_SYSTEM, @@ -2087,7 +2296,7 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg) int num_types = 4; int alloc_size; int ret = 0; - int slot_count = 0; + u64 slot_count = 0; int i, c; if (copy_from_user(&space_args, @@ -2126,7 +2335,7 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg) goto out; } - slot_count = min_t(int, space_args.space_slots, slot_count); + slot_count = min_t(u64, space_args.space_slots, slot_count); alloc_size = sizeof(*dest) * slot_count; @@ -2146,6 +2355,9 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg) for (i = 0; i < num_types; i++) { struct btrfs_space_info *tmp; + if (!slot_count) + break; + info = NULL; 
rcu_read_lock(); list_for_each_entry_rcu(tmp, &root->fs_info->space_info, @@ -2167,7 +2379,10 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg) memcpy(dest, &space, sizeof(space)); dest++; space_args.total_spaces++; + slot_count--; } + if (!slot_count) + break; } up_read(&info->groups_sem); } @@ -2218,10 +2433,17 @@ static noinline long btrfs_ioctl_start_sync(struct file *file, void __user *argp struct btrfs_root *root = BTRFS_I(file->f_dentry->d_inode)->root; struct btrfs_trans_handle *trans; u64 transid; + int ret; trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) + return PTR_ERR(trans); transid = trans->transid; - btrfs_commit_transaction_async(trans, root, 0); + ret = btrfs_commit_transaction_async(trans, root, 0); + if (ret) { + btrfs_end_transaction(trans, root); + return ret; + } if (argp) if (copy_to_user(argp, &transid, sizeof(transid))) @@ -2256,14 +2478,20 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_setflags(file, argp); case FS_IOC_GETVERSION: return btrfs_ioctl_getversion(file, argp); + case FITRIM: + return btrfs_ioctl_fitrim(file, argp); case BTRFS_IOC_SNAP_CREATE: - return btrfs_ioctl_snap_create(file, argp, 0, 0); + return btrfs_ioctl_snap_create(file, argp, 0); case BTRFS_IOC_SNAP_CREATE_V2: - return btrfs_ioctl_snap_create(file, argp, 0, 1); + return btrfs_ioctl_snap_create_v2(file, argp, 0); case BTRFS_IOC_SUBVOL_CREATE: - return btrfs_ioctl_snap_create(file, argp, 1, 0); + return btrfs_ioctl_snap_create(file, argp, 1); case BTRFS_IOC_SNAP_DESTROY: return btrfs_ioctl_snap_destroy(file, argp); + case BTRFS_IOC_SUBVOL_GETFLAGS: + return btrfs_ioctl_subvol_getflags(file, argp); + case BTRFS_IOC_SUBVOL_SETFLAGS: + return btrfs_ioctl_subvol_setflags(file, argp); case BTRFS_IOC_DEFAULT_SUBVOL: return btrfs_ioctl_default_subvol(file, argp); case BTRFS_IOC_DEFRAG: diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index c344d12c646b..8fb382167b13 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -31,6 +31,7 @@ struct btrfs_ioctl_vol_args { }; #define BTRFS_SUBVOL_CREATE_ASYNC (1ULL << 0) +#define BTRFS_SUBVOL_RDONLY (1ULL << 1) #define BTRFS_SUBVOL_NAME_MAX 4039 struct btrfs_ioctl_vol_args_v2 { @@ -133,8 +134,15 @@ struct btrfs_ioctl_defrag_range_args { */ __u32 extent_thresh; + /* + * which compression method to use if turning on compression + * for this defrag operation. If unspecified, zlib will + * be used + */ + __u32 compress_type; + /* spare for later */ - __u32 unused[5]; + __u32 unused[4]; }; struct btrfs_ioctl_space_info { @@ -193,4 +201,6 @@ struct btrfs_ioctl_space_args { #define BTRFS_IOC_WAIT_SYNC _IOW(BTRFS_IOCTL_MAGIC, 22, __u64) #define BTRFS_IOC_SNAP_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 23, \ struct btrfs_ioctl_vol_args_v2) +#define BTRFS_IOC_SUBVOL_GETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 25, __u64) +#define BTRFS_IOC_SUBVOL_SETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 26, __u64) #endif diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c new file mode 100644 index 000000000000..a178f5ebea78 --- /dev/null +++ b/fs/btrfs/lzo.c @@ -0,0 +1,427 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/init.h> +#include <linux/err.h> +#include <linux/sched.h> +#include <linux/pagemap.h> +#include <linux/bio.h> +#include <linux/lzo.h> +#include "compression.h" + +#define LZO_LEN 4 + +struct workspace { + void *mem; + void *buf; /* where compressed data goes */ + void *cbuf; /* where decompressed data goes */ + struct list_head list; +}; + +static void lzo_free_workspace(struct list_head *ws) +{ + struct workspace *workspace = list_entry(ws, struct workspace, list); + + vfree(workspace->buf); + vfree(workspace->cbuf); + vfree(workspace->mem); + kfree(workspace); +} + +static struct list_head *lzo_alloc_workspace(void) +{ + struct workspace *workspace; + + workspace = kzalloc(sizeof(*workspace), GFP_NOFS); + if (!workspace) + return ERR_PTR(-ENOMEM); + + workspace->mem = vmalloc(LZO1X_MEM_COMPRESS); + workspace->buf = vmalloc(lzo1x_worst_compress(PAGE_CACHE_SIZE)); + workspace->cbuf = vmalloc(lzo1x_worst_compress(PAGE_CACHE_SIZE)); + if (!workspace->mem || !workspace->buf || !workspace->cbuf) + goto fail; + + INIT_LIST_HEAD(&workspace->list); + + return &workspace->list; +fail: + lzo_free_workspace(&workspace->list); + return ERR_PTR(-ENOMEM); +} + +static inline void write_compress_length(char *buf, size_t len) +{ + __le32 dlen; + + dlen = cpu_to_le32(len); + memcpy(buf, &dlen, LZO_LEN); +} + +static inline size_t read_compress_length(char *buf) +{ + __le32 dlen; + + memcpy(&dlen, buf, LZO_LEN); + return le32_to_cpu(dlen); +} + +static int lzo_compress_pages(struct list_head *ws, + struct address_space *mapping, + u64 start, unsigned long len, + struct page **pages, + unsigned long nr_dest_pages, + unsigned long *out_pages, + unsigned long *total_in, + unsigned long *total_out, + unsigned long max_out) +{ + struct workspace *workspace = list_entry(ws, struct workspace, list); + int ret = 0; + char *data_in; + char *cpage_out; + int nr_pages = 0; + struct page *in_page = NULL; + struct page *out_page = NULL; + unsigned long bytes_left; + + size_t in_len; + size_t out_len; + char *buf; + unsigned long tot_in = 0; + unsigned long tot_out = 0; + unsigned long pg_bytes_left; + unsigned long out_offset; + unsigned long bytes; + + *out_pages = 0; + *total_out = 0; + *total_in = 0; + + in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT); + data_in = kmap(in_page); + + /* + * store the size of all chunks of compressed data in + * the first 4 bytes + */ + out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (out_page == NULL) { + ret = -ENOMEM; + goto out; + } + cpage_out = kmap(out_page); + out_offset = LZO_LEN; + tot_out = LZO_LEN; + pages[0] = out_page; + nr_pages = 1; + pg_bytes_left = PAGE_CACHE_SIZE - LZO_LEN; + + /* compress at most one page of data each time */ + in_len = min(len, PAGE_CACHE_SIZE); + while (tot_in < len) { + ret = lzo1x_1_compress(data_in, in_len, workspace->cbuf, + &out_len, workspace->mem); + if (ret != LZO_E_OK) { + printk(KERN_DEBUG "btrfs deflate in loop returned %d\n", + ret); + ret = -1; + goto out; + } + + /* store the size of this chunk of compressed data */ + write_compress_length(cpage_out + out_offset, out_len); + tot_out += LZO_LEN; + out_offset += LZO_LEN; + pg_bytes_left -= 
LZO_LEN; + + tot_in += in_len; + tot_out += out_len; + + /* copy bytes from the working buffer into the pages */ + buf = workspace->cbuf; + while (out_len) { + bytes = min_t(unsigned long, pg_bytes_left, out_len); + + memcpy(cpage_out + out_offset, buf, bytes); + + out_len -= bytes; + pg_bytes_left -= bytes; + buf += bytes; + out_offset += bytes; + + /* + * we need another page for writing out. + * + * Note if there's less than 4 bytes left, we just + * skip to a new page. + */ + if ((out_len == 0 && pg_bytes_left < LZO_LEN) || + pg_bytes_left == 0) { + if (pg_bytes_left) { + memset(cpage_out + out_offset, 0, + pg_bytes_left); + tot_out += pg_bytes_left; + } + + /* we're done, don't allocate new page */ + if (out_len == 0 && tot_in >= len) + break; + + kunmap(out_page); + if (nr_pages == nr_dest_pages) { + out_page = NULL; + ret = -1; + goto out; + } + + out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (out_page == NULL) { + ret = -ENOMEM; + goto out; + } + cpage_out = kmap(out_page); + pages[nr_pages++] = out_page; + + pg_bytes_left = PAGE_CACHE_SIZE; + out_offset = 0; + } + } + + /* we're making it bigger, give up */ + if (tot_in > 8192 && tot_in < tot_out) + goto out; + + /* we're all done */ + if (tot_in >= len) + break; + + if (tot_out > max_out) + break; + + bytes_left = len - tot_in; + kunmap(in_page); + page_cache_release(in_page); + + start += PAGE_CACHE_SIZE; + in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT); + data_in = kmap(in_page); + in_len = min(bytes_left, PAGE_CACHE_SIZE); + } + + if (tot_out > tot_in) + goto out; + + /* store the size of all chunks of compressed data */ + cpage_out = kmap(pages[0]); + write_compress_length(cpage_out, tot_out); + + kunmap(pages[0]); + + ret = 0; + *total_out = tot_out; + *total_in = tot_in; +out: + *out_pages = nr_pages; + if (out_page) + kunmap(out_page); + + if (in_page) { + kunmap(in_page); + page_cache_release(in_page); + } + + return ret; +} + +static int lzo_decompress_biovec(struct list_head *ws, + struct page **pages_in, + u64 disk_start, + struct bio_vec *bvec, + int vcnt, + size_t srclen) +{ + struct workspace *workspace = list_entry(ws, struct workspace, list); + int ret = 0, ret2; + char *data_in; + unsigned long page_in_index = 0; + unsigned long page_out_index = 0; + unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) / + PAGE_CACHE_SIZE; + unsigned long buf_start; + unsigned long buf_offset = 0; + unsigned long bytes; + unsigned long working_bytes; + unsigned long pg_offset; + + size_t in_len; + size_t out_len; + unsigned long in_offset; + unsigned long in_page_bytes_left; + unsigned long tot_in; + unsigned long tot_out; + unsigned long tot_len; + char *buf; + bool may_late_unmap, need_unmap; + + data_in = kmap(pages_in[0]); + tot_len = read_compress_length(data_in); + + tot_in = LZO_LEN; + in_offset = LZO_LEN; + tot_len = min_t(size_t, srclen, tot_len); + in_page_bytes_left = PAGE_CACHE_SIZE - LZO_LEN; + + tot_out = 0; + pg_offset = 0; + + while (tot_in < tot_len) { + in_len = read_compress_length(data_in + in_offset); + in_page_bytes_left -= LZO_LEN; + in_offset += LZO_LEN; + tot_in += LZO_LEN; + + tot_in += in_len; + working_bytes = in_len; + may_late_unmap = need_unmap = false; + + /* fast path: avoid using the working buffer */ + if (in_page_bytes_left >= in_len) { + buf = data_in + in_offset; + bytes = in_len; + may_late_unmap = true; + goto cont; + } + + /* copy bytes from the pages into the working buffer */ + buf = workspace->cbuf; + buf_offset = 0; + while (working_bytes) { + bytes = 
min(working_bytes, in_page_bytes_left); + + memcpy(buf + buf_offset, data_in + in_offset, bytes); + buf_offset += bytes; +cont: + working_bytes -= bytes; + in_page_bytes_left -= bytes; + in_offset += bytes; + + /* check if we need to pick another page */ + if ((working_bytes == 0 && in_page_bytes_left < LZO_LEN) + || in_page_bytes_left == 0) { + tot_in += in_page_bytes_left; + + if (working_bytes == 0 && tot_in >= tot_len) + break; + + if (page_in_index + 1 >= total_pages_in) { + ret = -1; + goto done; + } + + if (may_late_unmap) + need_unmap = true; + else + kunmap(pages_in[page_in_index]); + + data_in = kmap(pages_in[++page_in_index]); + + in_page_bytes_left = PAGE_CACHE_SIZE; + in_offset = 0; + } + } + + out_len = lzo1x_worst_compress(PAGE_CACHE_SIZE); + ret = lzo1x_decompress_safe(buf, in_len, workspace->buf, + &out_len); + if (need_unmap) + kunmap(pages_in[page_in_index - 1]); + if (ret != LZO_E_OK) { + printk(KERN_WARNING "btrfs decompress failed\n"); + ret = -1; + break; + } + + buf_start = tot_out; + tot_out += out_len; + + ret2 = btrfs_decompress_buf2page(workspace->buf, buf_start, + tot_out, disk_start, + bvec, vcnt, + &page_out_index, &pg_offset); + if (ret2 == 0) + break; + } +done: + kunmap(pages_in[page_in_index]); + return ret; +} + +static int lzo_decompress(struct list_head *ws, unsigned char *data_in, + struct page *dest_page, + unsigned long start_byte, + size_t srclen, size_t destlen) +{ + struct workspace *workspace = list_entry(ws, struct workspace, list); + size_t in_len; + size_t out_len; + size_t tot_len; + int ret = 0; + char *kaddr; + unsigned long bytes; + + BUG_ON(srclen < LZO_LEN); + + tot_len = read_compress_length(data_in); + data_in += LZO_LEN; + + in_len = read_compress_length(data_in); + data_in += LZO_LEN; + + out_len = PAGE_CACHE_SIZE; + ret = lzo1x_decompress_safe(data_in, in_len, workspace->buf, &out_len); + if (ret != LZO_E_OK) { + printk(KERN_WARNING "btrfs decompress failed!\n"); + ret = -1; + goto out; + } + + if (out_len < start_byte) { + ret = -1; + goto out; + } + + bytes = min_t(unsigned long, destlen, out_len - start_byte); + + kaddr = kmap_atomic(dest_page, KM_USER0); + memcpy(kaddr, workspace->buf + start_byte, bytes); + kunmap_atomic(kaddr, KM_USER0); +out: + return ret; +} + +struct btrfs_compress_op btrfs_lzo_compress = { + .alloc_workspace = lzo_alloc_workspace, + .free_workspace = lzo_free_workspace, + .compress_pages = lzo_compress_pages, + .decompress_biovec = lzo_decompress_biovec, + .decompress = lzo_decompress, +}; diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index ae7737e352c9..a1c940425307 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -141,7 +141,7 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree, u64 file_offset) { struct rb_root *root = &tree->tree; - struct rb_node *prev; + struct rb_node *prev = NULL; struct rb_node *ret; struct btrfs_ordered_extent *entry; @@ -172,7 +172,7 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree, */ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, u64 start, u64 len, u64 disk_len, - int type, int dio) + int type, int dio, int compress_type) { struct btrfs_ordered_inode_tree *tree; struct rb_node *node; @@ -189,6 +189,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, entry->disk_len = disk_len; entry->bytes_left = len; entry->inode = inode; + entry->compress_type = compress_type; if (type != BTRFS_ORDERED_IO_DONE && type != 
BTRFS_ORDERED_COMPLETE) set_bit(type, &entry->flags); @@ -201,6 +202,8 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, INIT_LIST_HEAD(&entry->list); INIT_LIST_HEAD(&entry->root_extent_list); + trace_btrfs_ordered_extent_add(inode, entry); + spin_lock(&tree->lock); node = tree_insert(&tree->tree, file_offset, &entry->rb_node); @@ -220,14 +223,25 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, u64 start, u64 len, u64 disk_len, int type) { return __btrfs_add_ordered_extent(inode, file_offset, start, len, - disk_len, type, 0); + disk_len, type, 0, + BTRFS_COMPRESS_NONE); } int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset, u64 start, u64 len, u64 disk_len, int type) { return __btrfs_add_ordered_extent(inode, file_offset, start, len, - disk_len, type, 1); + disk_len, type, 1, + BTRFS_COMPRESS_NONE); +} + +int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset, + u64 start, u64 len, u64 disk_len, + int type, int compress_type) +{ + return __btrfs_add_ordered_extent(inode, file_offset, start, len, + disk_len, type, 0, + compress_type); } /* @@ -375,6 +389,8 @@ int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) struct list_head *cur; struct btrfs_ordered_sum *sum; + trace_btrfs_ordered_extent_put(entry->inode, entry); + if (atomic_dec_and_test(&entry->refs)) { while (!list_empty(&entry->list)) { cur = entry->list.next; @@ -408,6 +424,8 @@ static int __btrfs_remove_ordered_extent(struct inode *inode, spin_lock(&root->fs_info->ordered_extent_lock); list_del_init(&entry->root_extent_list); + trace_btrfs_ordered_extent_remove(inode, entry); + /* * we have no more ordered extents for this inode and * no dirty pages. We can safely remove it from the @@ -573,6 +591,8 @@ void btrfs_start_ordered_extent(struct inode *inode, u64 start = entry->file_offset; u64 end = start + entry->len - 1; + trace_btrfs_ordered_extent_start(inode, entry); + /* * pages in the range can be dirty, clean or writeback. 
We * start IO on any dirty ones so the wait doesn't stall waiting diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 61dca83119dd..ff1f69aa1883 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -68,7 +68,7 @@ struct btrfs_ordered_sum { #define BTRFS_ORDERED_NOCOW 2 /* set when we want to write in place */ -#define BTRFS_ORDERED_COMPRESSED 3 /* writing a compressed extent */ +#define BTRFS_ORDERED_COMPRESSED 3 /* writing a zlib compressed extent */ #define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */ @@ -93,6 +93,9 @@ struct btrfs_ordered_extent { /* flags (described above) */ unsigned long flags; + /* compression algorithm */ + int compress_type; + /* reference count */ atomic_t refs; @@ -148,6 +151,9 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, u64 start, u64 len, u64 disk_len, int type); int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset, u64 start, u64 len, u64 disk_len, int type); +int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset, + u64 start, u64 len, u64 disk_len, + int type, int compress_type); int btrfs_add_ordered_sum(struct inode *inode, struct btrfs_ordered_extent *entry, struct btrfs_ordered_sum *sum); diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 0d126be22b63..fb2605d998e9 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -260,6 +260,7 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) #else BUG(); #endif + break; case BTRFS_BLOCK_GROUP_ITEM_KEY: bi = btrfs_item_ptr(l, i, struct btrfs_block_group_item); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 045c9c2b2d7e..199a80134312 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1157,6 +1157,7 @@ static int clone_backref_node(struct btrfs_trans_handle *trans, new_node->bytenr = dest->node->start; new_node->level = node->level; new_node->lowest = node->lowest; + new_node->checked = 1; new_node->root = dest; if (!node->lowest) { @@ -1723,6 +1724,7 @@ again: eb = read_tree_block(dest, old_bytenr, blocksize, old_ptr_gen); + BUG_ON(!eb); btrfs_tree_lock(eb); if (cow) { ret = btrfs_cow_block(trans, dest, eb, parent, @@ -2028,6 +2030,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, while (1) { trans = btrfs_start_transaction(root, 0); + BUG_ON(IS_ERR(trans)); trans->block_rsv = rc->block_rsv; ret = btrfs_block_rsv_check(trans, root, rc->block_rsv, @@ -2147,6 +2150,12 @@ again: } trans = btrfs_join_transaction(rc->extent_root, 1); + if (IS_ERR(trans)) { + if (!err) + btrfs_block_rsv_release(rc->extent_root, + rc->block_rsv, num_bytes); + return PTR_ERR(trans); + } if (!err) { if (num_bytes != rc->merging_rsv_size) { @@ -2337,7 +2346,7 @@ struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans, root = next->root; BUG_ON(!root); - /* no other choice for non-refernce counted tree */ + /* no other choice for non-references counted tree */ if (!root->ref_cows) return root; @@ -2505,6 +2514,10 @@ static int do_relocation(struct btrfs_trans_handle *trans, blocksize = btrfs_level_size(root, node->level); generation = btrfs_node_ptr_generation(upper->eb, slot); eb = read_tree_block(root, bytenr, blocksize, generation); + if (!eb) { + err = -EIO; + goto next; + } btrfs_tree_lock(eb); btrfs_set_lock_blocking(eb); @@ -2662,6 +2675,7 @@ static int get_tree_block_key(struct reloc_control *rc, BUG_ON(block->key_ready); eb = read_tree_block(rc->extent_root, block->bytenr, block->key.objectid, 
block->key.offset); + BUG_ON(!eb); WARN_ON(btrfs_header_level(eb) != block->level); if (block->level == 0) btrfs_item_key_to_cpu(eb, &block->key, 0); @@ -3222,6 +3236,7 @@ truncate: trans = btrfs_join_transaction(root, 0); if (IS_ERR(trans)) { btrfs_free_path(path); + ret = PTR_ERR(trans); goto out; } @@ -3628,6 +3643,7 @@ int prepare_to_relocate(struct reloc_control *rc) set_reloc_control(rc); trans = btrfs_join_transaction(rc->extent_root, 1); + BUG_ON(IS_ERR(trans)); btrfs_commit_transaction(trans, rc->extent_root); return 0; } @@ -3644,6 +3660,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) u32 item_size; int ret; int err = 0; + int progress = 0; path = btrfs_alloc_path(); if (!path) @@ -3656,8 +3673,10 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) } while (1) { + progress++; trans = btrfs_start_transaction(rc->extent_root, 0); - + BUG_ON(IS_ERR(trans)); +restart: if (update_backref_cache(trans, &rc->backref_cache)) { btrfs_end_transaction(trans, rc->extent_root); continue; @@ -3770,6 +3789,15 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) } } } + if (trans && progress && err == -ENOSPC) { + ret = btrfs_force_chunk_alloc(trans, rc->extent_root, + rc->block_group->flags); + if (ret == 0) { + err = 0; + progress = 0; + goto restart; + } + } btrfs_release_path(rc->extent_root, path); clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY, @@ -3804,7 +3832,10 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) /* get rid of pinned extents */ trans = btrfs_join_transaction(rc->extent_root, 1); - btrfs_commit_transaction(trans, rc->extent_root); + if (IS_ERR(trans)) + err = PTR_ERR(trans); + else + btrfs_commit_transaction(trans, rc->extent_root); out_free: btrfs_free_block_rsv(rc->extent_root, rc->block_rsv); btrfs_free_path(path); @@ -4022,6 +4053,7 @@ static noinline_for_stack int mark_garbage_root(struct btrfs_root *root) int ret; trans = btrfs_start_transaction(root->fs_info->tree_root, 0); + BUG_ON(IS_ERR(trans)); memset(&root->root_item.drop_progress, 0, sizeof(root->root_item.drop_progress)); @@ -4125,6 +4157,11 @@ int btrfs_recover_relocation(struct btrfs_root *root) set_reloc_control(rc); trans = btrfs_join_transaction(rc->extent_root, 1); + if (IS_ERR(trans)) { + unset_reloc_control(rc); + err = PTR_ERR(trans); + goto out_free; + } rc->merge_reloc_tree = 1; @@ -4154,9 +4191,13 @@ int btrfs_recover_relocation(struct btrfs_root *root) unset_reloc_control(rc); trans = btrfs_join_transaction(rc->extent_root, 1); - btrfs_commit_transaction(trans, rc->extent_root); -out: + if (IS_ERR(trans)) + err = PTR_ERR(trans); + else + btrfs_commit_transaction(trans, rc->extent_root); +out_free: kfree(rc); +out: while (!list_empty(&reloc_roots)) { reloc_root = list_entry(reloc_roots.next, struct btrfs_root, root_list); @@ -4174,7 +4215,7 @@ out: if (IS_ERR(fs_root)) err = PTR_ERR(fs_root); else - btrfs_orphan_cleanup(fs_root); + err = btrfs_orphan_cleanup(fs_root); } return err; } diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 6a1086e83ffc..6928bff62daa 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -88,7 +88,8 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, search_key.offset = (u64)-1; path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) + return -ENOMEM; ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); if (ret < 0) goto out; @@ -332,7 +333,8 @@ int btrfs_del_root(struct btrfs_trans_handle 
*trans, struct btrfs_root *root, struct extent_buffer *leaf; path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) + return -ENOMEM; ret = btrfs_search_slot(trans, root, key, path, -1, 1); if (ret < 0) goto out; @@ -471,3 +473,21 @@ again: btrfs_free_path(path); return 0; } + +/* + * Old btrfs forgets to init root_item->flags and root_item->byte_limit + * for subvolumes. To work around this problem, we steal a bit from + * root_item->inode_item->flags, and use it to indicate if those fields + * have been properly initialized. + */ +void btrfs_check_and_init_root_item(struct btrfs_root_item *root_item) +{ + u64 inode_flags = le64_to_cpu(root_item->inode.flags); + + if (!(inode_flags & BTRFS_INODE_ROOT_ITEM_INIT)) { + inode_flags |= BTRFS_INODE_ROOT_ITEM_INIT; + root_item->inode.flags = cpu_to_le64(inode_flags); + root_item->flags = 0; + root_item->byte_limit = 0; + } +} diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 22acdaa78ce1..0ac712efcdf2 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -52,8 +52,95 @@ #include "export.h" #include "compression.h" +#define CREATE_TRACE_POINTS +#include <trace/events/btrfs.h> + static const struct super_operations btrfs_super_ops; +static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno, + char nbuf[16]) +{ + char *errstr = NULL; + + switch (errno) { + case -EIO: + errstr = "IO failure"; + break; + case -ENOMEM: + errstr = "Out of memory"; + break; + case -EROFS: + errstr = "Readonly filesystem"; + break; + default: + if (nbuf) { + if (snprintf(nbuf, 16, "error %d", -errno) >= 0) + errstr = nbuf; + } + break; + } + + return errstr; +} + +static void __save_error_info(struct btrfs_fs_info *fs_info) +{ + /* + * today we only save the error info into ram. Long term we'll + * also send it down to the disk + */ + fs_info->fs_state = BTRFS_SUPER_FLAG_ERROR; +} + +/* NOTE: + * We move write_super stuff at umount in order to avoid deadlock + * for umount hold all lock. + */ +static void save_error_info(struct btrfs_fs_info *fs_info) +{ + __save_error_info(fs_info); +} + +/* btrfs handle error by forcing the filesystem readonly */ +static void btrfs_handle_error(struct btrfs_fs_info *fs_info) +{ + struct super_block *sb = fs_info->sb; + + if (sb->s_flags & MS_RDONLY) + return; + + if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { + sb->s_flags |= MS_RDONLY; + printk(KERN_INFO "btrfs is forced readonly\n"); + } +} + +/* + * __btrfs_std_error decodes expected errors from the caller and + * invokes the approciate error response. + */ +void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, + unsigned int line, int errno) +{ + struct super_block *sb = fs_info->sb; + char nbuf[16]; + const char *errstr; + + /* + * Special case: if the error is EROFS, and we're already + * under MS_RDONLY, then it is safe here. 
+ */ + if (errno == -EROFS && (sb->s_flags & MS_RDONLY)) + return; + + errstr = btrfs_decode_error(fs_info, errno, nbuf); + printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s\n", + sb->s_id, function, line, errstr); + save_error_info(fs_info); + + btrfs_handle_error(fs_info); +} + static void btrfs_put_super(struct super_block *sb) { struct btrfs_root *root = btrfs_sb(sb); @@ -69,9 +156,10 @@ enum { Opt_degraded, Opt_subvol, Opt_subvolid, Opt_device, Opt_nodatasum, Opt_nodatacow, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd, Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress, - Opt_compress_force, Opt_notreelog, Opt_ratio, Opt_flushoncommit, - Opt_discard, Opt_space_cache, Opt_clear_cache, Opt_err, - Opt_user_subvol_rm_allowed, + Opt_compress_type, Opt_compress_force, Opt_compress_force_type, + Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard, + Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, + Opt_enospc_debug, Opt_subvolrootid, Opt_err, }; static match_table_t tokens = { @@ -86,7 +174,9 @@ static match_table_t tokens = { {Opt_alloc_start, "alloc_start=%s"}, {Opt_thread_pool, "thread_pool=%d"}, {Opt_compress, "compress"}, + {Opt_compress_type, "compress=%s"}, {Opt_compress_force, "compress-force"}, + {Opt_compress_force_type, "compress-force=%s"}, {Opt_ssd, "ssd"}, {Opt_ssd_spread, "ssd_spread"}, {Opt_nossd, "nossd"}, @@ -98,6 +188,8 @@ static match_table_t tokens = { {Opt_space_cache, "space_cache"}, {Opt_clear_cache, "clear_cache"}, {Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"}, + {Opt_enospc_debug, "enospc_debug"}, + {Opt_subvolrootid, "subvolrootid=%d"}, {Opt_err, NULL}, }; @@ -112,6 +204,8 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) char *p, *num, *orig; int intarg; int ret = 0; + char *compress_type; + bool compress_force = false; if (!options) return 0; @@ -139,6 +233,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) break; case Opt_subvol: case Opt_subvolid: + case Opt_subvolrootid: case Opt_device: /* * These are parsed by btrfs_parse_early_options @@ -154,14 +249,32 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) btrfs_set_opt(info->mount_opt, NODATACOW); btrfs_set_opt(info->mount_opt, NODATASUM); break; - case Opt_compress: - printk(KERN_INFO "btrfs: use compression\n"); - btrfs_set_opt(info->mount_opt, COMPRESS); - break; case Opt_compress_force: - printk(KERN_INFO "btrfs: forcing compression\n"); - btrfs_set_opt(info->mount_opt, FORCE_COMPRESS); + case Opt_compress_force_type: + compress_force = true; + case Opt_compress: + case Opt_compress_type: + if (token == Opt_compress || + token == Opt_compress_force || + strcmp(args[0].from, "zlib") == 0) { + compress_type = "zlib"; + info->compress_type = BTRFS_COMPRESS_ZLIB; + } else if (strcmp(args[0].from, "lzo") == 0) { + compress_type = "lzo"; + info->compress_type = BTRFS_COMPRESS_LZO; + } else { + ret = -EINVAL; + goto out; + } + btrfs_set_opt(info->mount_opt, COMPRESS); + if (compress_force) { + btrfs_set_opt(info->mount_opt, FORCE_COMPRESS); + pr_info("btrfs: force %s compression\n", + compress_type); + } else + pr_info("btrfs: use %s compression\n", + compress_type); break; case Opt_ssd: printk(KERN_INFO "btrfs: use ssd allocation scheme\n"); @@ -252,6 +365,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) case Opt_user_subvol_rm_allowed: btrfs_set_opt(info->mount_opt, USER_SUBVOL_RM_ALLOWED); break; + case Opt_enospc_debug: + btrfs_set_opt(info->mount_opt, ENOSPC_DEBUG); + break; 
case Opt_err: printk(KERN_INFO "btrfs: unrecognized mount option " "'%s'\n", p); @@ -274,10 +390,10 @@ out: */ static int btrfs_parse_early_options(const char *options, fmode_t flags, void *holder, char **subvol_name, u64 *subvol_objectid, - struct btrfs_fs_devices **fs_devices) + u64 *subvol_rootid, struct btrfs_fs_devices **fs_devices) { substring_t args[MAX_OPT_ARGS]; - char *opts, *p; + char *opts, *orig, *p; int error = 0; int intarg; @@ -291,6 +407,7 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags, opts = kstrdup(options, GFP_KERNEL); if (!opts) return -ENOMEM; + orig = opts; while ((p = strsep(&opts, ",")) != NULL) { int token; @@ -314,6 +431,18 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags, *subvol_objectid = intarg; } break; + case Opt_subvolrootid: + intarg = 0; + error = match_int(&args[0], &intarg); + if (!error) { + /* we want the original fs_tree */ + if (!intarg) + *subvol_rootid = + BTRFS_FS_TREE_OBJECTID; + else + *subvol_rootid = intarg; + } + break; case Opt_device: error = btrfs_scan_one_device(match_strdup(&args[0]), flags, holder, fs_devices); @@ -326,7 +455,7 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags, } out_free_opts: - kfree(opts); + kfree(orig); out: /* * If no subvolume name is specified we use the default one. Allocate @@ -508,6 +637,8 @@ int btrfs_sync_fs(struct super_block *sb, int wait) struct btrfs_root *root = btrfs_sb(sb); int ret; + trace_btrfs_sync_fs(wait); + if (!wait) { filemap_flush(root->fs_info->btree_inode->i_mapping); return 0; @@ -517,6 +648,8 @@ int btrfs_sync_fs(struct super_block *sb, int wait) btrfs_wait_ordered_extents(root, 0, 0); trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) + return PTR_ERR(trans); ret = btrfs_commit_transaction(trans, root); return ret; } @@ -525,6 +658,7 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs) { struct btrfs_root *root = btrfs_sb(vfs->mnt_sb); struct btrfs_fs_info *info = root->fs_info; + char *compress_type; if (btrfs_test_opt(root, DEGRADED)) seq_puts(seq, ",degraded"); @@ -543,8 +677,16 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs) if (info->thread_pool_size != min_t(unsigned long, num_online_cpus() + 2, 8)) seq_printf(seq, ",thread_pool=%d", info->thread_pool_size); - if (btrfs_test_opt(root, COMPRESS)) - seq_puts(seq, ",compress"); + if (btrfs_test_opt(root, COMPRESS)) { + if (info->compress_type == BTRFS_COMPRESS_ZLIB) + compress_type = "zlib"; + else + compress_type = "lzo"; + if (btrfs_test_opt(root, FORCE_COMPRESS)) + seq_printf(seq, ",compress-force=%s", compress_type); + else + seq_printf(seq, ",compress=%s", compress_type); + } if (btrfs_test_opt(root, NOSSD)) seq_puts(seq, ",nossd"); if (btrfs_test_opt(root, SSD_SPREAD)) @@ -559,6 +701,12 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_puts(seq, ",discard"); if (!(root->fs_info->sb->s_flags & MS_POSIXACL)) seq_puts(seq, ",noacl"); + if (btrfs_test_opt(root, SPACE_CACHE)) + seq_puts(seq, ",space_cache"); + if (btrfs_test_opt(root, CLEAR_CACHE)) + seq_puts(seq, ",clear_cache"); + if (btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED)) + seq_puts(seq, ",user_subvol_rm_allowed"); return 0; } @@ -602,6 +750,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, fmode_t mode = FMODE_READ; char *subvol_name = NULL; u64 subvol_objectid = 0; + u64 subvol_rootid = 0; int error = 0; if (!(flags & MS_RDONLY)) @@ -609,7 +758,7 @@ static struct 
dentry *btrfs_mount(struct file_system_type *fs_type, int flags, error = btrfs_parse_early_options(data, mode, fs_type, &subvol_name, &subvol_objectid, - &fs_devices); + &subvol_rootid, &fs_devices); if (error) return ERR_PTR(error); @@ -655,6 +804,8 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, } btrfs_close_devices(fs_devices); + kfree(fs_info); + kfree(tree_root); } else { char b[BDEVNAME_SIZE]; @@ -671,15 +822,17 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, s->s_flags |= MS_ACTIVE; } - root = get_default_root(s, subvol_objectid); - if (IS_ERR(root)) { - error = PTR_ERR(root); - deactivate_locked_super(s); - goto error_free_subvol_name; - } /* if they gave us a subvolume name bind mount into that */ if (strcmp(subvol_name, ".")) { struct dentry *new_root; + + root = get_default_root(s, subvol_rootid); + if (IS_ERR(root)) { + error = PTR_ERR(root); + deactivate_locked_super(s); + goto error_free_subvol_name; + } + mutex_lock(&root->d_inode->i_mutex); new_root = lookup_one_len(subvol_name, root, strlen(subvol_name)); @@ -700,6 +853,13 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, } dput(root); root = new_root; + } else { + root = get_default_root(s, subvol_objectid); + if (IS_ERR(root)) { + error = PTR_ERR(root); + deactivate_locked_super(s); + goto error_free_subvol_name; + } } kfree(subvol_name); @@ -753,6 +913,127 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) return 0; } +/* + * The helper to calc the free space on the devices that can be used to store + * file data. + */ +static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_device_info *devices_info; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_device *device; + u64 skip_space; + u64 type; + u64 avail_space; + u64 used_space; + u64 min_stripe_size; + int min_stripes = 1; + int i = 0, nr_devices; + int ret; + + nr_devices = fs_info->fs_devices->rw_devices; + BUG_ON(!nr_devices); + + devices_info = kmalloc(sizeof(*devices_info) * nr_devices, + GFP_NOFS); + if (!devices_info) + return -ENOMEM; + + /* calc min stripe number for data space alloction */ + type = btrfs_get_alloc_profile(root, 1); + if (type & BTRFS_BLOCK_GROUP_RAID0) + min_stripes = 2; + else if (type & BTRFS_BLOCK_GROUP_RAID1) + min_stripes = 2; + else if (type & BTRFS_BLOCK_GROUP_RAID10) + min_stripes = 4; + + if (type & BTRFS_BLOCK_GROUP_DUP) + min_stripe_size = 2 * BTRFS_STRIPE_LEN; + else + min_stripe_size = BTRFS_STRIPE_LEN; + + list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { + if (!device->in_fs_metadata) + continue; + + avail_space = device->total_bytes - device->bytes_used; + + /* align with stripe_len */ + do_div(avail_space, BTRFS_STRIPE_LEN); + avail_space *= BTRFS_STRIPE_LEN; + + /* + * In order to avoid overwritting the superblock on the drive, + * btrfs starts at an offset of at least 1MB when doing chunk + * allocation. + */ + skip_space = 1024 * 1024; + + /* user can set the offset in fs_info->alloc_start. */ + if (fs_info->alloc_start + BTRFS_STRIPE_LEN <= + device->total_bytes) + skip_space = max(fs_info->alloc_start, skip_space); + + /* + * btrfs can not use the free space in [0, skip_space - 1], + * we must subtract it from the total. In order to implement + * it, we account the used space in this range first. 
+ */ + ret = btrfs_account_dev_extents_size(device, 0, skip_space - 1, + &used_space); + if (ret) { + kfree(devices_info); + return ret; + } + + /* calc the free space in [0, skip_space - 1] */ + skip_space -= used_space; + + /* + * we can use the free space in [0, skip_space - 1], subtract + * it from the total. + */ + if (avail_space && avail_space >= skip_space) + avail_space -= skip_space; + else + avail_space = 0; + + if (avail_space < min_stripe_size) + continue; + + devices_info[i].dev = device; + devices_info[i].max_avail = avail_space; + + i++; + } + + nr_devices = i; + + btrfs_descending_sort_devices(devices_info, nr_devices); + + i = nr_devices - 1; + avail_space = 0; + while (nr_devices >= min_stripes) { + if (devices_info[i].max_avail >= min_stripe_size) { + int j; + u64 alloc_size; + + avail_space += devices_info[i].max_avail * min_stripes; + alloc_size = devices_info[i].max_avail; + for (j = i + 1 - min_stripes; j <= i; j++) + devices_info[j].max_avail -= alloc_size; + } + i--; + nr_devices--; + } + + kfree(devices_info); + *free_bytes = avail_space; + return 0; +} + static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct btrfs_root *root = btrfs_sb(dentry->d_sb); @@ -760,17 +1041,21 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) struct list_head *head = &root->fs_info->space_info; struct btrfs_space_info *found; u64 total_used = 0; - u64 total_used_data = 0; + u64 total_free_data = 0; int bits = dentry->d_sb->s_blocksize_bits; __be32 *fsid = (__be32 *)root->fs_info->fsid; + int ret; + /* holding chunk_muext to avoid allocating new chunks */ + mutex_lock(&root->fs_info->chunk_mutex); rcu_read_lock(); list_for_each_entry_rcu(found, head, list) { - if (found->flags & (BTRFS_BLOCK_GROUP_METADATA | - BTRFS_BLOCK_GROUP_SYSTEM)) - total_used_data += found->disk_total; - else - total_used_data += found->disk_used; + if (found->flags & BTRFS_BLOCK_GROUP_DATA) { + total_free_data += found->disk_total - found->disk_used; + total_free_data -= + btrfs_account_ro_block_groups_free_space(found); + } + total_used += found->disk_used; } rcu_read_unlock(); @@ -778,9 +1063,17 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_namelen = BTRFS_NAME_LEN; buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits; buf->f_bfree = buf->f_blocks - (total_used >> bits); - buf->f_bavail = buf->f_blocks - (total_used_data >> bits); buf->f_bsize = dentry->d_sb->s_blocksize; buf->f_type = BTRFS_SUPER_MAGIC; + buf->f_bavail = total_free_data; + ret = btrfs_calc_avail_data_space(root, &total_free_data); + if (ret) { + mutex_unlock(&root->fs_info->chunk_mutex); + return ret; + } + buf->f_bavail += total_free_data; + buf->f_bavail = buf->f_bavail >> bits; + mutex_unlock(&root->fs_info->chunk_mutex); /* We treat it as constant endianness (it doesn't matter _which_) because we want the fsid to come out the same whether mounted @@ -897,10 +1190,14 @@ static int __init init_btrfs_fs(void) if (err) return err; - err = btrfs_init_cachep(); + err = btrfs_init_compress(); if (err) goto free_sysfs; + err = btrfs_init_cachep(); + if (err) + goto free_compress; + err = extent_io_init(); if (err) goto free_cachep; @@ -928,6 +1225,8 @@ free_extent_io: extent_io_exit(); free_cachep: btrfs_destroy_cachep(); +free_compress: + btrfs_exit_compress(); free_sysfs: btrfs_exit_sysfs(); return err; @@ -942,7 +1241,7 @@ static void __exit exit_btrfs_fs(void) unregister_filesystem(&btrfs_fs_type); btrfs_exit_sysfs(); btrfs_cleanup_fs_uuids(); - 
btrfs_zlib_exit(); + btrfs_exit_compress(); } module_init(init_btrfs_fs) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index f50e931fc217..c571734d5e5a 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -32,10 +32,8 @@ static noinline void put_transaction(struct btrfs_transaction *transaction) { - WARN_ON(transaction->use_count == 0); - transaction->use_count--; - if (transaction->use_count == 0) { - list_del_init(&transaction->list); + WARN_ON(atomic_read(&transaction->use_count) == 0); + if (atomic_dec_and_test(&transaction->use_count)) { memset(transaction, 0, sizeof(*transaction)); kmem_cache_free(btrfs_transaction_cachep, transaction); } @@ -57,16 +55,17 @@ static noinline int join_transaction(struct btrfs_root *root) if (!cur_trans) { cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS); - BUG_ON(!cur_trans); + if (!cur_trans) + return -ENOMEM; root->fs_info->generation++; - cur_trans->num_writers = 1; + atomic_set(&cur_trans->num_writers, 1); cur_trans->num_joined = 0; cur_trans->transid = root->fs_info->generation; init_waitqueue_head(&cur_trans->writer_wait); init_waitqueue_head(&cur_trans->commit_wait); cur_trans->in_commit = 0; cur_trans->blocked = 0; - cur_trans->use_count = 1; + atomic_set(&cur_trans->use_count, 1); cur_trans->commit_done = 0; cur_trans->start_time = get_seconds(); @@ -87,7 +86,7 @@ static noinline int join_transaction(struct btrfs_root *root) root->fs_info->running_transaction = cur_trans; spin_unlock(&root->fs_info->new_trans_lock); } else { - cur_trans->num_writers++; + atomic_inc(&cur_trans->num_writers); cur_trans->num_joined++; } @@ -144,7 +143,7 @@ static void wait_current_trans(struct btrfs_root *root) cur_trans = root->fs_info->running_transaction; if (cur_trans && cur_trans->blocked) { DEFINE_WAIT(wait); - cur_trans->use_count++; + atomic_inc(&cur_trans->use_count); while (1) { prepare_to_wait(&root->fs_info->transaction_wait, &wait, TASK_UNINTERRUPTIBLE); @@ -180,7 +179,11 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, { struct btrfs_trans_handle *h; struct btrfs_transaction *cur_trans; + int retries = 0; int ret; + + if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) + return ERR_PTR(-EROFS); again: h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS); if (!h) @@ -192,10 +195,15 @@ again: wait_current_trans(root); ret = join_transaction(root); - BUG_ON(ret); + if (ret < 0) { + kmem_cache_free(btrfs_trans_handle_cachep, h); + if (type != TRANS_JOIN_NOLOCK) + mutex_unlock(&root->fs_info->trans_mutex); + return ERR_PTR(ret); + } cur_trans = root->fs_info->running_transaction; - cur_trans->use_count++; + atomic_inc(&cur_trans->use_count); if (type != TRANS_JOIN_NOLOCK) mutex_unlock(&root->fs_info->trans_mutex); @@ -215,10 +223,18 @@ again: if (num_items > 0) { ret = btrfs_trans_reserve_metadata(h, root, num_items); - if (ret == -EAGAIN) { + if (ret == -EAGAIN && !retries) { + retries++; btrfs_commit_transaction(h, root); goto again; + } else if (ret == -EAGAIN) { + /* + * We have already retried and got EAGAIN, so really we + * don't have space, so set ret to -ENOSPC. 
+ */ + ret = -ENOSPC; } + if (ret < 0) { btrfs_end_transaction(h, root); return ERR_PTR(ret); @@ -318,7 +334,7 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid) goto out_unlock; /* nothing committing|committed */ } - cur_trans->use_count++; + atomic_inc(&cur_trans->use_count); mutex_unlock(&root->fs_info->trans_mutex); wait_for_commit(root, cur_trans); @@ -448,18 +464,14 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, wake_up_process(info->transaction_kthread); } - if (lock) - mutex_lock(&info->trans_mutex); WARN_ON(cur_trans != info->running_transaction); - WARN_ON(cur_trans->num_writers < 1); - cur_trans->num_writers--; + WARN_ON(atomic_read(&cur_trans->num_writers) < 1); + atomic_dec(&cur_trans->num_writers); smp_mb(); if (waitqueue_active(&cur_trans->writer_wait)) wake_up(&cur_trans->writer_wait); put_transaction(cur_trans); - if (lock) - mutex_unlock(&info->trans_mutex); if (current->journal_info == trans) current->journal_info = NULL; @@ -910,6 +922,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, u64 to_reserve = 0; u64 index = 0; u64 objectid; + u64 root_flags; new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS); if (!new_root_item) { @@ -966,6 +979,14 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, record_root_in_trans(trans, root); btrfs_set_root_last_snapshot(&root->root_item, trans->transid); memcpy(new_root_item, &root->root_item, sizeof(*new_root_item)); + btrfs_check_and_init_root_item(new_root_item); + + root_flags = btrfs_root_flags(new_root_item); + if (pending->readonly) + root_flags |= BTRFS_ROOT_SUBVOL_RDONLY; + else + root_flags &= ~BTRFS_ROOT_SUBVOL_RDONLY; + btrfs_set_root_flags(new_root_item, root_flags); old = btrfs_lock_root_node(root); btrfs_cow_block(trans, root, old, NULL, 0, &old); @@ -1145,16 +1166,22 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, struct btrfs_transaction *cur_trans; ac = kmalloc(sizeof(*ac), GFP_NOFS); - BUG_ON(!ac); + if (!ac) + return -ENOMEM; INIT_DELAYED_WORK(&ac->work, do_async_commit); ac->root = root; ac->newtrans = btrfs_join_transaction(root, 0); + if (IS_ERR(ac->newtrans)) { + int err = PTR_ERR(ac->newtrans); + kfree(ac); + return err; + } /* take transaction reference */ mutex_lock(&root->fs_info->trans_mutex); cur_trans = trans->transaction; - cur_trans->use_count++; + atomic_inc(&cur_trans->use_count); mutex_unlock(&root->fs_info->trans_mutex); btrfs_end_transaction(trans, root); @@ -1213,7 +1240,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, mutex_lock(&root->fs_info->trans_mutex); if (cur_trans->in_commit) { - cur_trans->use_count++; + atomic_inc(&cur_trans->use_count); mutex_unlock(&root->fs_info->trans_mutex); btrfs_end_transaction(trans, root); @@ -1235,7 +1262,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, prev_trans = list_entry(cur_trans->list.prev, struct btrfs_transaction, list); if (!prev_trans->commit_done) { - prev_trans->use_count++; + atomic_inc(&prev_trans->use_count); mutex_unlock(&root->fs_info->trans_mutex); wait_for_commit(root, prev_trans); @@ -1276,14 +1303,14 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, TASK_UNINTERRUPTIBLE); smp_mb(); - if (cur_trans->num_writers > 1) + if (atomic_read(&cur_trans->num_writers) > 1) schedule_timeout(MAX_SCHEDULE_TIMEOUT); else if (should_grow) schedule_timeout(1); mutex_lock(&root->fs_info->trans_mutex); finish_wait(&cur_trans->writer_wait, &wait); - } while 
(cur_trans->num_writers > 1 || + } while (atomic_read(&cur_trans->num_writers) > 1 || (should_grow && cur_trans->num_joined != joined)); ret = create_pending_snapshots(trans, root->fs_info); @@ -1370,9 +1397,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, wake_up(&cur_trans->commit_wait); + list_del_init(&cur_trans->list); put_transaction(cur_trans); put_transaction(cur_trans); + trace_btrfs_transaction_commit(root); + mutex_unlock(&root->fs_info->trans_mutex); if (current->journal_info == trans) diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index f104b57ad4ef..e441acc6c584 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -27,11 +27,11 @@ struct btrfs_transaction { * total writers in this transaction, it must be zero before the * transaction can end */ - unsigned long num_writers; + atomic_t num_writers; unsigned long num_joined; int in_commit; - int use_count; + atomic_t use_count; int commit_done; int blocked; struct list_head list; @@ -62,6 +62,7 @@ struct btrfs_pending_snapshot { struct btrfs_block_rsv block_rsv; /* extra metadata reseration for relocation */ int error; + bool readonly; struct list_head list; }; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 054744ac5719..c50271ad3157 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -338,6 +338,12 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans, } dst_copy = kmalloc(item_size, GFP_NOFS); src_copy = kmalloc(item_size, GFP_NOFS); + if (!dst_copy || !src_copy) { + btrfs_release_path(root, path); + kfree(dst_copy); + kfree(src_copy); + return -ENOMEM; + } read_extent_buffer(eb, src_copy, src_ptr, item_size); @@ -665,6 +671,9 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, btrfs_dir_item_key_to_cpu(leaf, di, &location); name_len = btrfs_dir_name_len(leaf, di); name = kmalloc(name_len, GFP_NOFS); + if (!name) + return -ENOMEM; + read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len); btrfs_release_path(root, path); @@ -744,6 +753,9 @@ static noinline int backref_in_log(struct btrfs_root *log, int match = 0; path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + ret = btrfs_search_slot(NULL, log, key, path, 0, 0); if (ret != 0) goto out; @@ -787,12 +799,12 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, struct inode *dir; int ret; struct btrfs_inode_ref *ref; - struct btrfs_dir_item *di; struct inode *inode; char *name; int namelen; unsigned long ref_ptr; unsigned long ref_end; + int search_done = 0; /* * it is possible that we didn't log all the parent directories @@ -833,7 +845,10 @@ again: * existing back reference, and we don't want to create * dangling pointers in the directory. 
*/ -conflict_again: + + if (search_done) + goto insert; + ret = btrfs_search_slot(NULL, root, key, path, 0, 0); if (ret == 0) { char *victim_name; @@ -874,37 +889,21 @@ conflict_again: ret = btrfs_unlink_inode(trans, root, dir, inode, victim_name, victim_name_len); - kfree(victim_name); - btrfs_release_path(root, path); - goto conflict_again; } kfree(victim_name); ptr = (unsigned long)(victim_ref + 1) + victim_name_len; } BUG_ON(ret); - } - btrfs_release_path(root, path); - /* look for a conflicting sequence number */ - di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino, - btrfs_inode_ref_index(eb, ref), - name, namelen, 0); - if (di && !IS_ERR(di)) { - ret = drop_one_dir_item(trans, root, path, dir, di); - BUG_ON(ret); - } - btrfs_release_path(root, path); - - - /* look for a conflicting name */ - di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino, - name, namelen, 0); - if (di && !IS_ERR(di)) { - ret = drop_one_dir_item(trans, root, path, dir, di); - BUG_ON(ret); + /* + * NOTE: we have searched the root tree and checked the + * corresponding ref, so there is no need to check it again. + */ + search_done = 1; } btrfs_release_path(root, path); +insert: /* insert our name */ ret = btrfs_add_link(trans, dir, inode, name, namelen, 0, btrfs_inode_ref_index(eb, ref)); @@ -967,6 +966,8 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, key.offset = (u64)-1; path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; while (1) { ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); @@ -1178,6 +1179,9 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, name_len = btrfs_dir_name_len(eb, di); name = kmalloc(name_len, GFP_NOFS); + if (!name) + return -ENOMEM; + log_type = btrfs_dir_type(eb, di); read_extent_buffer(eb, name, (unsigned long)(di + 1), name_len); @@ -1269,6 +1273,8 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans, ptr_end = ptr + item_size; while (ptr < ptr_end) { di = (struct btrfs_dir_item *)ptr; + if (verify_dir_item(root, eb, di)) + return -EIO; name_len = btrfs_dir_name_len(eb, di); ret = replay_one_name(trans, root, path, eb, di, key); BUG_ON(ret); @@ -1395,6 +1401,11 @@ again: ptr_end = ptr + item_size; while (ptr < ptr_end) { di = (struct btrfs_dir_item *)ptr; + if (verify_dir_item(root, eb, di)) { + ret = -EIO; + goto out; + } + name_len = btrfs_dir_name_len(eb, di); name = kmalloc(name_len, GFP_NOFS); if (!name) { @@ -1692,6 +1703,8 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, root_owner = btrfs_header_owner(parent); next = btrfs_find_create_tree_block(root, bytenr, blocksize); + if (!next) + return -ENOMEM; if (*level == 1) { wc->process_func(root, next, wc, ptr_gen); @@ -1802,7 +1815,8 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, int orig_level; path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) + return -ENOMEM; level = btrfs_header_level(log->node); orig_level = level; @@ -2032,6 +2046,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, wait_log_commit(trans, log_root_tree, log_root_tree->log_transid); mutex_unlock(&log_root_tree->log_mutex); + ret = 0; goto out; } atomic_set(&log_root_tree->log_commit[index2], 1); @@ -2096,7 +2111,7 @@ out: smp_mb(); if (waitqueue_active(&root->log_commit_wait[index1])) wake_up(&root->log_commit_wait[index1]); - return 0; + return ret; } static void free_log_tree(struct btrfs_trans_handle *trans, @@ -2194,6 +2209,9 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, log =
root->log_root; path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino, name, name_len, -1); if (IS_ERR(di)) { @@ -2594,6 +2612,9 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, ins_data = kmalloc(nr * sizeof(struct btrfs_key) + nr * sizeof(u32), GFP_NOFS); + if (!ins_data) + return -ENOMEM; + ins_sizes = (u32 *)ins_data; ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); @@ -2725,7 +2746,13 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, log = root->log_root; path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; dst_path = btrfs_alloc_path(); + if (!dst_path) { + btrfs_free_path(path); + return -ENOMEM; + } min_key.objectid = inode->i_ino; min_key.type = BTRFS_INODE_ITEM_KEY; @@ -3075,16 +3102,20 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) .stage = 0, }; - fs_info->log_root_recovering = 1; path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) + return -ENOMEM; + + fs_info->log_root_recovering = 1; trans = btrfs_start_transaction(fs_info->tree_root, 0); + BUG_ON(IS_ERR(trans)); wc.trans = trans; wc.pin = 1; - walk_log_tree(trans, log_root_tree, &wc); + ret = walk_log_tree(trans, log_root_tree, &wc); + BUG_ON(ret); again: key.objectid = BTRFS_TREE_LOG_OBJECTID; @@ -3108,8 +3139,7 @@ again: log = btrfs_read_fs_root_no_radix(log_root_tree, &found_key); - BUG_ON(!log); - + BUG_ON(IS_ERR(log)); tmp_key.objectid = found_key.offset; tmp_key.type = BTRFS_ROOT_ITEM_KEY; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 1718e1a5c320..309a57b9fc85 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -22,6 +22,7 @@ #include <linux/blkdev.h> #include <linux/random.h> #include <linux/iocontext.h> +#include <linux/capability.h> #include <asm/div64.h> #include "compat.h" #include "ctree.h" @@ -32,17 +33,6 @@ #include "volumes.h" #include "async-thread.h" -struct map_lookup { - u64 type; - int io_align; - int io_width; - int stripe_len; - int sector_size; - int num_stripes; - int sub_stripes; - struct btrfs_bio_stripe stripes[]; -}; - static int init_first_rw_device(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_device *device); @@ -161,7 +151,6 @@ static noinline int run_scheduled_bios(struct btrfs_device *device) struct bio *cur; int again = 0; unsigned long num_run; - unsigned long num_sync_run; unsigned long batch_run = 0; unsigned long limit; unsigned long last_waited = 0; @@ -172,11 +161,6 @@ static noinline int run_scheduled_bios(struct btrfs_device *device) limit = btrfs_async_submit_limit(fs_info); limit = limit * 2 / 3; - /* we want to make sure that every time we switch from the sync - * list to the normal list, we unplug - */ - num_sync_run = 0; - loop: spin_lock(&device->io_lock); @@ -222,15 +206,6 @@ loop_lock: spin_unlock(&device->io_lock); - /* - * if we're doing the regular priority list, make sure we unplug - * for any high prio bios we've sent down - */ - if (pending_bios == &device->pending_bios && num_sync_run > 0) { - num_sync_run = 0; - blk_run_backing_dev(bdi, NULL); - } - while (pending) { rmb(); @@ -258,19 +233,11 @@ loop_lock: BUG_ON(atomic_read(&cur->bi_cnt) == 0); - if (cur->bi_rw & REQ_SYNC) - num_sync_run++; - submit_bio(cur->bi_rw, cur); num_run++; batch_run++; - if (need_resched()) { - if (num_sync_run) { - blk_run_backing_dev(bdi, NULL); - num_sync_run = 0; - } + if (need_resched()) cond_resched(); - } /* * we made progress, there is more work to do and the bdi @@ -303,13 +270,8 @@ loop_lock: * 
against it before looping */ last_waited = ioc->last_waited; - if (need_resched()) { - if (num_sync_run) { - blk_run_backing_dev(bdi, NULL); - num_sync_run = 0; - } + if (need_resched()) cond_resched(); - } continue; } spin_lock(&device->io_lock); @@ -322,22 +284,6 @@ loop_lock: } } - if (num_sync_run) { - num_sync_run = 0; - blk_run_backing_dev(bdi, NULL); - } - /* - * IO has already been through a long path to get here. Checksumming, - * async helper threads, perhaps compression. We've done a pretty - * good job of collecting a batch of IO and should just unplug - * the device right away. - * - * This will help anyone who is waiting on the IO, they might have - * already unplugged, but managed to do so before the bio they - * cared about found its way down here. - */ - blk_run_backing_dev(bdi, NULL); - cond_resched(); if (again) goto loop; @@ -600,8 +546,10 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, set_blocksize(bdev, 4096); bh = btrfs_read_dev_super(bdev); - if (!bh) + if (!bh) { + ret = -EINVAL; goto error_close; + } disk_super = (struct btrfs_super_block *)bh->b_data; devid = btrfs_stack_device_id(&disk_super->dev_item); @@ -703,7 +651,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, goto error_close; bh = btrfs_read_dev_super(bdev); if (!bh) { - ret = -EIO; + ret = -EINVAL; goto error_close; } disk_super = (struct btrfs_super_block *)bh->b_data; @@ -729,59 +677,167 @@ error: return ret; } +/* helper to account the used device space in the range */ +int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, + u64 end, u64 *length) +{ + struct btrfs_key key; + struct btrfs_root *root = device->dev_root; + struct btrfs_dev_extent *dev_extent; + struct btrfs_path *path; + u64 extent_end; + int ret; + int slot; + struct extent_buffer *l; + + *length = 0; + + if (start >= device->total_bytes) + return 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->reada = 2; + + key.objectid = device->devid; + key.offset = start; + key.type = BTRFS_DEV_EXTENT_KEY; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + if (ret > 0) { + ret = btrfs_previous_item(root, path, key.objectid, key.type); + if (ret < 0) + goto out; + } + + while (1) { + l = path->nodes[0]; + slot = path->slots[0]; + if (slot >= btrfs_header_nritems(l)) { + ret = btrfs_next_leaf(root, path); + if (ret == 0) + continue; + if (ret < 0) + goto out; + + break; + } + btrfs_item_key_to_cpu(l, &key, slot); + + if (key.objectid < device->devid) + goto next; + + if (key.objectid > device->devid) + break; + + if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY) + goto next; + + dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); + extent_end = key.offset + btrfs_dev_extent_length(l, + dev_extent); + if (key.offset <= start && extent_end > end) { + *length = end - start + 1; + break; + } else if (key.offset <= start && extent_end > start) + *length += extent_end - start; + else if (key.offset > start && extent_end <= end) + *length += extent_end - key.offset; + else if (key.offset > start && key.offset <= end) { + *length += end - key.offset + 1; + break; + } else if (key.offset > end) + break; + +next: + path->slots[0]++; + } + ret = 0; +out: + btrfs_free_path(path); + return ret; +} + /* + * find_free_dev_extent - find free space in the specified device + * @trans: transaction handler + * @device: the device which we search the free space in + * @num_bytes: the size of the free space that we need + * 
@start: store the start of the free space. + * @len: the size of the free space that we find, or the size of the max + * free space if we don't find suitable free space + * * this uses a pretty simple search, the expectation is that it is * called very infrequently and that a given device has a small number * of extents + * + * @start is used to store the start of the free space if we find it. But if + * we don't find suitable free space, it will be used to store the start + * position of the max free space. + * + * @len is used to store the size of the free space that we find. + * But if we don't find suitable free space, it is used to store the size of + * the max free space. + */ int find_free_dev_extent(struct btrfs_trans_handle *trans, struct btrfs_device *device, u64 num_bytes, - u64 *start, u64 *max_avail) + u64 *start, u64 *len) { struct btrfs_key key; struct btrfs_root *root = device->dev_root; - struct btrfs_dev_extent *dev_extent = NULL; + struct btrfs_dev_extent *dev_extent; struct btrfs_path *path; - u64 hole_size = 0; - u64 last_byte = 0; - u64 search_start = 0; + u64 hole_size; + u64 max_hole_start; + u64 max_hole_size; + u64 extent_end; + u64 search_start; u64 search_end = device->total_bytes; int ret; - int slot = 0; - int start_found; + int slot; struct extent_buffer *l; - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - path->reada = 2; - start_found = 0; - /* FIXME use last free of some kind */ /* we don't want to overwrite the superblock on the drive, * so we make sure to start at an offset of at least 1MB */ - search_start = max((u64)1024 * 1024, search_start); + search_start = 1024 * 1024; - if (root->fs_info->alloc_start + num_bytes <= device->total_bytes) + if (root->fs_info->alloc_start + num_bytes <= search_end) search_start = max(root->fs_info->alloc_start, search_start); + max_hole_start = search_start; + max_hole_size = 0; + + if (search_start >= search_end) { + ret = -ENOSPC; + goto error; + } + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto error; + } + path->reada = 2; + key.objectid = device->devid; key.offset = search_start; key.type = BTRFS_DEV_EXTENT_KEY; + ret = btrfs_search_slot(trans, root, &key, path, 0, 0); if (ret < 0) - goto error; + goto out; if (ret > 0) { ret = btrfs_previous_item(root, path, key.objectid, key.type); if (ret < 0) - goto error; - if (ret > 0) - start_found = 1; + goto out; } - l = path->nodes[0]; - btrfs_item_key_to_cpu(l, &key, path->slots[0]); + while (1) { l = path->nodes[0]; slot = path->slots[0]; @@ -790,24 +846,9 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans, if (ret == 0) continue; if (ret < 0) - goto error; -no_more_items: - if (!start_found) { - if (search_start >= search_end) { - ret = -ENOSPC; - goto error; - } - *start = search_start; - start_found = 1; - goto check_pending; - } - *start = last_byte > search_start ?
- last_byte : search_start; - if (search_end <= *start) { - ret = -ENOSPC; - goto error; - } - goto check_pending; + goto out; + + break; } btrfs_item_key_to_cpu(l, &key, slot); @@ -815,48 +856,62 @@ no_more_items: goto next; if (key.objectid > device->devid) - goto no_more_items; + break; - if (key.offset >= search_start && key.offset > last_byte && - start_found) { - if (last_byte < search_start) - last_byte = search_start; - hole_size = key.offset - last_byte; + if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY) + goto next; - if (hole_size > *max_avail) - *max_avail = hole_size; + if (key.offset > search_start) { + hole_size = key.offset - search_start; - if (key.offset > last_byte && - hole_size >= num_bytes) { - *start = last_byte; - goto check_pending; + if (hole_size > max_hole_size) { + max_hole_start = search_start; + max_hole_size = hole_size; + } + + /* + * If this free space is greater than what we need, + * it must be the max free space that we have found + * until now, so max_hole_start must point to the start + * of this free space and the length of this free space + * is stored in max_hole_size. Thus, we return + * max_hole_start and max_hole_size and go back to the + * caller. + */ + if (hole_size >= num_bytes) { + ret = 0; + goto out; + } } - if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY) - goto next; - start_found = 1; dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); - last_byte = key.offset + btrfs_dev_extent_length(l, dev_extent); + extent_end = key.offset + btrfs_dev_extent_length(l, + dev_extent); + if (extent_end > search_start) + search_start = extent_end; next: path->slots[0]++; cond_resched(); } -check_pending: - /* we have to make sure we didn't find an extent that has already - * been allocated by the map tree or the original allocation - */ - BUG_ON(*start < search_start); - if (*start + num_bytes > search_end) { - ret = -ENOSPC; - goto error; + hole_size = search_end - search_start; + if (hole_size > max_hole_size) { + max_hole_start = search_start; + max_hole_size = hole_size; } - /* check for pending inserts here */ - ret = 0; -error: + /* See above. 
*/ + if (hole_size < num_bytes) + ret = -ENOSPC; + else + ret = 0; + +out: btrfs_free_path(path); +error: + *start = max_hole_start; + if (len) + *len = max_hole_size; return ret; } @@ -1103,6 +1158,10 @@ static int btrfs_rm_dev_item(struct btrfs_root *root, return -ENOMEM; trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + btrfs_free_path(path); + return PTR_ERR(trans); + } key.objectid = BTRFS_DEV_ITEMS_OBJECTID; key.type = BTRFS_DEV_ITEM_KEY; key.offset = device->devid; @@ -1196,7 +1255,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) set_blocksize(bdev, 4096); bh = btrfs_read_dev_super(bdev); if (!bh) { - ret = -EIO; + ret = -EINVAL; goto error_close; } disk_super = (struct btrfs_super_block *)bh->b_data; @@ -1224,11 +1283,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) ret = btrfs_shrink_device(device, 0); if (ret) - goto error_brelse; + goto error_undo; ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device); if (ret) - goto error_brelse; + goto error_undo; device->in_fs_metadata = 0; @@ -1302,6 +1361,13 @@ out: mutex_unlock(&root->fs_info->volume_mutex); mutex_unlock(&uuid_mutex); return ret; +error_undo: + if (device->writeable) { + list_add(&device->dev_alloc_list, + &root->fs_info->fs_devices->alloc_list); + root->fs_info->fs_devices->rw_devices++; + } + goto error_brelse; } /* @@ -1491,11 +1557,19 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) ret = find_next_devid(root, &device->devid); if (ret) { + kfree(device->name); kfree(device); goto error; } trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + kfree(device->name); + kfree(device); + ret = PTR_ERR(trans); + goto error; + } + lock_chunks(root); device->writeable = 1; @@ -1511,7 +1585,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) device->dev_root = root->fs_info->dev_root; device->bdev = bdev; device->in_fs_metadata = 1; - device->mode = 0; + device->mode = FMODE_EXCL; set_blocksize(device->bdev, 4096); if (seeding_dev) { @@ -1763,7 +1837,7 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, return ret; trans = btrfs_start_transaction(root, 0); - BUG_ON(!trans); + BUG_ON(IS_ERR(trans)); lock_chunks(root); @@ -1794,6 +1868,8 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, BUG_ON(ret); + trace_btrfs_chunk_free(root, map, chunk_offset, em->len); + if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset); BUG_ON(ret); @@ -1916,6 +1992,9 @@ int btrfs_balance(struct btrfs_root *dev_root) if (dev_root->fs_info->sb->s_flags & MS_RDONLY) return -EROFS; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + mutex_lock(&dev_root->fs_info->volume_mutex); dev_root = dev_root->fs_info->dev_root; @@ -1934,7 +2013,7 @@ int btrfs_balance(struct btrfs_root *dev_root) BUG_ON(ret); trans = btrfs_start_transaction(dev_root, 0); - BUG_ON(!trans); + BUG_ON(IS_ERR(trans)); ret = btrfs_grow_device(trans, device, old_size); BUG_ON(ret); @@ -2100,6 +2179,11 @@ again: /* Shrinking succeeded, else we would be at "done". 
*/ trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto done; + } + lock_chunks(root); device->disk_total_bytes = new_size; @@ -2154,66 +2238,67 @@ static noinline u64 chunk_bytes_by_type(u64 type, u64 calc_size, return calc_size * num_stripes; } -static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, - struct btrfs_root *extent_root, - struct map_lookup **map_ret, - u64 *num_bytes, u64 *stripe_size, - u64 start, u64 type) +/* Used to sort the devices by max_avail(descending sort) */ +int btrfs_cmp_device_free_bytes(const void *dev_info1, const void *dev_info2) { - struct btrfs_fs_info *info = extent_root->fs_info; - struct btrfs_device *device = NULL; - struct btrfs_fs_devices *fs_devices = info->fs_devices; - struct list_head *cur; - struct map_lookup *map = NULL; - struct extent_map_tree *em_tree; - struct extent_map *em; - struct list_head private_devs; - int min_stripe_size = 1 * 1024 * 1024; - u64 calc_size = 1024 * 1024 * 1024; - u64 max_chunk_size = calc_size; - u64 min_free; - u64 avail; - u64 max_avail = 0; - u64 dev_offset; - int num_stripes = 1; - int min_stripes = 1; - int sub_stripes = 0; - int looped = 0; - int ret; - int index; - int stripe_len = 64 * 1024; + if (((struct btrfs_device_info *)dev_info1)->max_avail > + ((struct btrfs_device_info *)dev_info2)->max_avail) + return -1; + else if (((struct btrfs_device_info *)dev_info1)->max_avail < + ((struct btrfs_device_info *)dev_info2)->max_avail) + return 1; + else + return 0; +} - if ((type & BTRFS_BLOCK_GROUP_RAID1) && - (type & BTRFS_BLOCK_GROUP_DUP)) { - WARN_ON(1); - type &= ~BTRFS_BLOCK_GROUP_DUP; - } - if (list_empty(&fs_devices->alloc_list)) - return -ENOSPC; +static int __btrfs_calc_nstripes(struct btrfs_fs_devices *fs_devices, u64 type, + int *num_stripes, int *min_stripes, + int *sub_stripes) +{ + *num_stripes = 1; + *min_stripes = 1; + *sub_stripes = 0; if (type & (BTRFS_BLOCK_GROUP_RAID0)) { - num_stripes = fs_devices->rw_devices; - min_stripes = 2; + *num_stripes = fs_devices->rw_devices; + *min_stripes = 2; } if (type & (BTRFS_BLOCK_GROUP_DUP)) { - num_stripes = 2; - min_stripes = 2; + *num_stripes = 2; + *min_stripes = 2; } if (type & (BTRFS_BLOCK_GROUP_RAID1)) { if (fs_devices->rw_devices < 2) return -ENOSPC; - num_stripes = 2; - min_stripes = 2; + *num_stripes = 2; + *min_stripes = 2; } if (type & (BTRFS_BLOCK_GROUP_RAID10)) { - num_stripes = fs_devices->rw_devices; - if (num_stripes < 4) + *num_stripes = fs_devices->rw_devices; + if (*num_stripes < 4) return -ENOSPC; - num_stripes &= ~(u32)1; - sub_stripes = 2; - min_stripes = 4; + *num_stripes &= ~(u32)1; + *sub_stripes = 2; + *min_stripes = 4; } + return 0; +} + +static u64 __btrfs_calc_stripe_size(struct btrfs_fs_devices *fs_devices, + u64 proposed_size, u64 type, + int num_stripes, int small_stripe) +{ + int min_stripe_size = 1 * 1024 * 1024; + u64 calc_size = proposed_size; + u64 max_chunk_size = calc_size; + int ncopies = 1; + + if (type & (BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_DUP | + BTRFS_BLOCK_GROUP_RAID10)) + ncopies = 2; + if (type & BTRFS_BLOCK_GROUP_DATA) { max_chunk_size = 10 * calc_size; min_stripe_size = 64 * 1024 * 1024; @@ -2230,51 +2315,209 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), max_chunk_size); -again: - max_avail = 0; - if (!map || map->num_stripes != num_stripes) { - kfree(map); - map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); - if (!map) - return -ENOMEM; - 
map->num_stripes = num_stripes; - } - - if (calc_size * num_stripes > max_chunk_size) { - calc_size = max_chunk_size; + if (calc_size * num_stripes > max_chunk_size * ncopies) { + calc_size = max_chunk_size * ncopies; do_div(calc_size, num_stripes); - do_div(calc_size, stripe_len); - calc_size *= stripe_len; + do_div(calc_size, BTRFS_STRIPE_LEN); + calc_size *= BTRFS_STRIPE_LEN; } /* we don't want tiny stripes */ - if (!looped) + if (!small_stripe) calc_size = max_t(u64, min_stripe_size, calc_size); /* - * we're about to do_div by the stripe_len so lets make sure + * we're about to do_div by the BTRFS_STRIPE_LEN so let's make sure * we end up with something bigger than a stripe */ - calc_size = max_t(u64, calc_size, stripe_len * 4); + calc_size = max_t(u64, calc_size, BTRFS_STRIPE_LEN); + + do_div(calc_size, BTRFS_STRIPE_LEN); + calc_size *= BTRFS_STRIPE_LEN; + + return calc_size; +} + +static struct map_lookup *__shrink_map_lookup_stripes(struct map_lookup *map, + int num_stripes) +{ + struct map_lookup *new; + size_t len = map_lookup_size(num_stripes); + + BUG_ON(map->num_stripes < num_stripes); + + if (map->num_stripes == num_stripes) + return map; + + new = kmalloc(len, GFP_NOFS); + if (!new) { + /* just change map->num_stripes */ + map->num_stripes = num_stripes; + return map; + } + + memcpy(new, map, len); + new->num_stripes = num_stripes; + kfree(map); + return new; +} + +/* + * helper to allocate device space from btrfs_device_info, in which we store + * the max free space information of every device. It is used when we cannot + * allocate chunks by the default size. + * + * With this helper, we can allocate a new chunk as large as possible. + */ +static int __btrfs_alloc_tiny_space(struct btrfs_trans_handle *trans, + struct btrfs_fs_devices *fs_devices, + struct btrfs_device_info *devices, + int nr_device, u64 type, + struct map_lookup **map_lookup, + int min_stripes, u64 *stripe_size) +{ + int i, index, sort_again = 0; + int min_devices = min_stripes; + u64 max_avail, min_free; + struct map_lookup *map = *map_lookup; + int ret; + + if (nr_device < min_stripes) + return -ENOSPC; + + btrfs_descending_sort_devices(devices, nr_device); + + max_avail = devices[0].max_avail; + if (!max_avail) + return -ENOSPC; + + for (i = 0; i < nr_device; i++) { + /* + * if dev_offset = 0, it means the free space of this device + * is less than what we need, and we didn't search for the max + * avail extent on this device, so do it now. 
+ */ + if (!devices[i].dev_offset) { + ret = find_free_dev_extent(trans, devices[i].dev, + max_avail, + &devices[i].dev_offset, + &devices[i].max_avail); + if (ret != 0 && ret != -ENOSPC) + return ret; + sort_again = 1; + } + } + + /* we update the max avail free extent of each devices, sort again */ + if (sort_again) + btrfs_descending_sort_devices(devices, nr_device); + + if (type & BTRFS_BLOCK_GROUP_DUP) + min_devices = 1; + + if (!devices[min_devices - 1].max_avail) + return -ENOSPC; + + max_avail = devices[min_devices - 1].max_avail; + if (type & BTRFS_BLOCK_GROUP_DUP) + do_div(max_avail, 2); + + max_avail = __btrfs_calc_stripe_size(fs_devices, max_avail, type, + min_stripes, 1); + if (type & BTRFS_BLOCK_GROUP_DUP) + min_free = max_avail * 2; + else + min_free = max_avail; + + if (min_free > devices[min_devices - 1].max_avail) + return -ENOSPC; + + map = __shrink_map_lookup_stripes(map, min_stripes); + *stripe_size = max_avail; + + index = 0; + for (i = 0; i < min_stripes; i++) { + map->stripes[i].dev = devices[index].dev; + map->stripes[i].physical = devices[index].dev_offset; + if (type & BTRFS_BLOCK_GROUP_DUP) { + i++; + map->stripes[i].dev = devices[index].dev; + map->stripes[i].physical = devices[index].dev_offset + + max_avail; + } + index++; + } + *map_lookup = map; + + return 0; +} + +static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, + struct map_lookup **map_ret, + u64 *num_bytes, u64 *stripe_size, + u64 start, u64 type) +{ + struct btrfs_fs_info *info = extent_root->fs_info; + struct btrfs_device *device = NULL; + struct btrfs_fs_devices *fs_devices = info->fs_devices; + struct list_head *cur; + struct map_lookup *map; + struct extent_map_tree *em_tree; + struct extent_map *em; + struct btrfs_device_info *devices_info; + struct list_head private_devs; + u64 calc_size = 1024 * 1024 * 1024; + u64 min_free; + u64 avail; + u64 dev_offset; + int num_stripes; + int min_stripes; + int sub_stripes; + int min_devices; /* the min number of devices we need */ + int i; + int ret; + int index; - do_div(calc_size, stripe_len); - calc_size *= stripe_len; + if ((type & BTRFS_BLOCK_GROUP_RAID1) && + (type & BTRFS_BLOCK_GROUP_DUP)) { + WARN_ON(1); + type &= ~BTRFS_BLOCK_GROUP_DUP; + } + if (list_empty(&fs_devices->alloc_list)) + return -ENOSPC; + + ret = __btrfs_calc_nstripes(fs_devices, type, &num_stripes, + &min_stripes, &sub_stripes); + if (ret) + return ret; + + devices_info = kzalloc(sizeof(*devices_info) * fs_devices->rw_devices, + GFP_NOFS); + if (!devices_info) + return -ENOMEM; + + map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); + if (!map) { + ret = -ENOMEM; + goto error; + } + map->num_stripes = num_stripes; cur = fs_devices->alloc_list.next; index = 0; + i = 0; - if (type & BTRFS_BLOCK_GROUP_DUP) + calc_size = __btrfs_calc_stripe_size(fs_devices, calc_size, type, + num_stripes, 0); + + if (type & BTRFS_BLOCK_GROUP_DUP) { min_free = calc_size * 2; - else + min_devices = 1; + } else { min_free = calc_size; - - /* - * we add 1MB because we never use the first 1MB of the device, unless - * we've looped, then we are likely allocating the maximum amount of - * space left already - */ - if (!looped) - min_free += 1024 * 1024; + min_devices = min_stripes; + } INIT_LIST_HEAD(&private_devs); while (index < num_stripes) { @@ -2287,27 +2530,39 @@ again: cur = cur->next; if (device->in_fs_metadata && avail >= min_free) { - ret = find_free_dev_extent(trans, device, - min_free, &dev_offset, - &max_avail); + ret = find_free_dev_extent(trans, 
device, min_free, + &devices_info[i].dev_offset, + &devices_info[i].max_avail); if (ret == 0) { list_move_tail(&device->dev_alloc_list, &private_devs); map->stripes[index].dev = device; - map->stripes[index].physical = dev_offset; + map->stripes[index].physical = + devices_info[i].dev_offset; index++; if (type & BTRFS_BLOCK_GROUP_DUP) { map->stripes[index].dev = device; map->stripes[index].physical = - dev_offset + calc_size; + devices_info[i].dev_offset + + calc_size; index++; } - } - } else if (device->in_fs_metadata && avail > max_avail) - max_avail = avail; + } else if (ret != -ENOSPC) + goto error; + + devices_info[i].dev = device; + i++; + } else if (device->in_fs_metadata && + avail >= BTRFS_STRIPE_LEN) { + devices_info[i].dev = device; + devices_info[i].max_avail = avail; + i++; + } + if (cur == &fs_devices->alloc_list) break; } + list_splice(&private_devs, &fs_devices->alloc_list); if (index < num_stripes) { if (index >= min_stripes) { @@ -2316,34 +2571,38 @@ again: num_stripes /= sub_stripes; num_stripes *= sub_stripes; } - looped = 1; - goto again; - } - if (!looped && max_avail > 0) { - looped = 1; - calc_size = max_avail; - goto again; + + map = __shrink_map_lookup_stripes(map, num_stripes); + } else if (i >= min_devices) { + ret = __btrfs_alloc_tiny_space(trans, fs_devices, + devices_info, i, type, + &map, min_stripes, + &calc_size); + if (ret) + goto error; + } else { + ret = -ENOSPC; + goto error; } - kfree(map); - return -ENOSPC; } map->sector_size = extent_root->sectorsize; - map->stripe_len = stripe_len; - map->io_align = stripe_len; - map->io_width = stripe_len; + map->stripe_len = BTRFS_STRIPE_LEN; + map->io_align = BTRFS_STRIPE_LEN; + map->io_width = BTRFS_STRIPE_LEN; map->type = type; - map->num_stripes = num_stripes; map->sub_stripes = sub_stripes; *map_ret = map; *stripe_size = calc_size; *num_bytes = chunk_bytes_by_type(type, calc_size, - num_stripes, sub_stripes); + map->num_stripes, sub_stripes); + + trace_btrfs_chunk_alloc(info->chunk_root, map, start, *num_bytes); em = alloc_extent_map(GFP_NOFS); if (!em) { - kfree(map); - return -ENOMEM; + ret = -ENOMEM; + goto error; } em->bdev = (struct block_device *)map; em->start = start; @@ -2376,7 +2635,13 @@ again: index++; } + kfree(devices_info); return 0; + +error: + kfree(map); + kfree(devices_info); + return ret; } static int __finish_chunk_alloc(struct btrfs_trans_handle *trans, @@ -2442,6 +2707,7 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans, item_size); BUG_ON(ret); } + kfree(chunk); return 0; } @@ -2639,14 +2905,17 @@ static int find_live_mirror(struct map_lookup *map, int first, int num, static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, u64 logical, u64 *length, struct btrfs_multi_bio **multi_ret, - int mirror_num, struct page *unplug_page) + int mirror_num) { struct extent_map *em; struct map_lookup *map; struct extent_map_tree *em_tree = &map_tree->map_tree; u64 offset; u64 stripe_offset; + u64 stripe_end_offset; u64 stripe_nr; + u64 stripe_nr_orig; + u64 stripe_nr_end; int stripes_allocated = 8; int stripes_required = 1; int stripe_index; @@ -2655,7 +2924,7 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, int max_errors = 0; struct btrfs_multi_bio *multi = NULL; - if (multi_ret && !(rw & REQ_WRITE)) + if (multi_ret && !(rw & (REQ_WRITE | REQ_DISCARD))) stripes_allocated = 1; again: if (multi_ret) { @@ -2671,11 +2940,6 @@ again: em = lookup_extent_mapping(em_tree, logical, *length); read_unlock(&em_tree->lock); - if (!em && 
unplug_page) { - kfree(multi); - return 0; - } - if (!em) { printk(KERN_CRIT "unable to find logical %llu len %llu\n", (unsigned long long)logical, @@ -2701,7 +2965,15 @@ again: max_errors = 1; } } - if (multi_ret && (rw & REQ_WRITE) && + if (rw & REQ_DISCARD) { + if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_DUP | + BTRFS_BLOCK_GROUP_RAID10)) { + stripes_required = map->num_stripes; + } + } + if (multi_ret && (rw & (REQ_WRITE | REQ_DISCARD)) && stripes_allocated < stripes_required) { stripes_allocated = map->num_stripes; free_extent_map(em); @@ -2721,23 +2993,37 @@ again: /* stripe_offset is the offset of this block in its stripe*/ stripe_offset = offset - stripe_offset; - if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID10 | - BTRFS_BLOCK_GROUP_DUP)) { + if (rw & REQ_DISCARD) + *length = min_t(u64, em->len - offset, *length); + else if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10 | + BTRFS_BLOCK_GROUP_DUP)) { /* we limit the length of each bio to what fits in a stripe */ *length = min_t(u64, em->len - offset, - map->stripe_len - stripe_offset); + map->stripe_len - stripe_offset); } else { *length = em->len - offset; } - if (!multi_ret && !unplug_page) + if (!multi_ret) goto out; num_stripes = 1; stripe_index = 0; - if (map->type & BTRFS_BLOCK_GROUP_RAID1) { - if (unplug_page || (rw & REQ_WRITE)) + stripe_nr_orig = stripe_nr; + stripe_nr_end = (offset + *length + map->stripe_len - 1) & + (~(map->stripe_len - 1)); + do_div(stripe_nr_end, map->stripe_len); + stripe_end_offset = stripe_nr_end * map->stripe_len - + (offset + *length); + if (map->type & BTRFS_BLOCK_GROUP_RAID0) { + if (rw & REQ_DISCARD) + num_stripes = min_t(u64, map->num_stripes, + stripe_nr_end - stripe_nr_orig); + stripe_index = do_div(stripe_nr, map->num_stripes); + } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { + if (rw & (REQ_WRITE | REQ_DISCARD)) num_stripes = map->num_stripes; else if (mirror_num) stripe_index = mirror_num - 1; @@ -2748,7 +3034,7 @@ again: } } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { - if (rw & REQ_WRITE) + if (rw & (REQ_WRITE | REQ_DISCARD)) num_stripes = map->num_stripes; else if (mirror_num) stripe_index = mirror_num - 1; @@ -2759,8 +3045,12 @@ again: stripe_index = do_div(stripe_nr, factor); stripe_index *= map->sub_stripes; - if (unplug_page || (rw & REQ_WRITE)) + if (rw & REQ_WRITE) num_stripes = map->sub_stripes; + else if (rw & REQ_DISCARD) + num_stripes = min_t(u64, map->sub_stripes * + (stripe_nr_end - stripe_nr_orig), + map->num_stripes); else if (mirror_num) stripe_index += mirror_num - 1; else { @@ -2778,24 +3068,101 @@ again: } BUG_ON(stripe_index >= map->num_stripes); - for (i = 0; i < num_stripes; i++) { - if (unplug_page) { - struct btrfs_device *device; - struct backing_dev_info *bdi; - - device = map->stripes[stripe_index].dev; - if (device->bdev) { - bdi = blk_get_backing_dev_info(device->bdev); - if (bdi->unplug_io_fn) - bdi->unplug_io_fn(bdi, unplug_page); - } - } else { + if (rw & REQ_DISCARD) { + for (i = 0; i < num_stripes; i++) { multi->stripes[i].physical = map->stripes[stripe_index].physical + stripe_offset + stripe_nr * map->stripe_len; multi->stripes[i].dev = map->stripes[stripe_index].dev; + + if (map->type & BTRFS_BLOCK_GROUP_RAID0) { + u64 stripes; + u32 last_stripe = 0; + int j; + + div_u64_rem(stripe_nr_end - 1, + map->num_stripes, + &last_stripe); + + for (j = 0; j < map->num_stripes; j++) { + u32 test; + + 
div_u64_rem(stripe_nr_end - 1 - j, + map->num_stripes, &test); + if (test == stripe_index) + break; + } + stripes = stripe_nr_end - 1 - j; + do_div(stripes, map->num_stripes); + multi->stripes[i].length = map->stripe_len * + (stripes - stripe_nr + 1); + + if (i == 0) { + multi->stripes[i].length -= + stripe_offset; + stripe_offset = 0; + } + if (stripe_index == last_stripe) + multi->stripes[i].length -= + stripe_end_offset; + } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { + u64 stripes; + int j; + int factor = map->num_stripes / + map->sub_stripes; + u32 last_stripe = 0; + + div_u64_rem(stripe_nr_end - 1, + factor, &last_stripe); + last_stripe *= map->sub_stripes; + + for (j = 0; j < factor; j++) { + u32 test; + + div_u64_rem(stripe_nr_end - 1 - j, + factor, &test); + + if (test == + stripe_index / map->sub_stripes) + break; + } + stripes = stripe_nr_end - 1 - j; + do_div(stripes, factor); + multi->stripes[i].length = map->stripe_len * + (stripes - stripe_nr + 1); + + if (i < map->sub_stripes) { + multi->stripes[i].length -= + stripe_offset; + if (i == map->sub_stripes - 1) + stripe_offset = 0; + } + if (stripe_index >= last_stripe && + stripe_index <= (last_stripe + + map->sub_stripes - 1)) { + multi->stripes[i].length -= + stripe_end_offset; + } + } else + multi->stripes[i].length = *length; + + stripe_index++; + if (stripe_index == map->num_stripes) { + /* This could only happen for RAID0/10 */ + stripe_index = 0; + stripe_nr++; + } + } + } else { + for (i = 0; i < num_stripes; i++) { + multi->stripes[i].physical = + map->stripes[stripe_index].physical + + stripe_offset + + stripe_nr * map->stripe_len; + multi->stripes[i].dev = + map->stripes[stripe_index].dev; + stripe_index++; } - stripe_index++; } if (multi_ret) { *multi_ret = multi; @@ -2812,7 +3179,7 @@ int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, struct btrfs_multi_bio **multi_ret, int mirror_num) { return __btrfs_map_block(map_tree, rw, logical, length, multi_ret, - mirror_num, NULL); + mirror_num); } int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, @@ -2880,14 +3247,6 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, return 0; } -int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree, - u64 logical, struct page *page) -{ - u64 length = PAGE_CACHE_SIZE; - return __btrfs_map_block(map_tree, READ, logical, &length, - NULL, 0, page); -} - static void end_bio_multi_stripe(struct bio *bio, int err) { struct btrfs_multi_bio *multi = bio->bi_private; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 1be781079450..cc2eadaf7a27 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -20,8 +20,11 @@ #define __BTRFS_VOLUMES_ #include <linux/bio.h> +#include <linux/sort.h> #include "async-thread.h" +#define BTRFS_STRIPE_LEN (64 * 1024) + struct buffer_head; struct btrfs_pending_bios { struct bio *head; @@ -123,6 +126,7 @@ struct btrfs_fs_devices { struct btrfs_bio_stripe { struct btrfs_device *dev; u64 physical; + u64 length; /* only used for discard mappings */ }; struct btrfs_multi_bio { @@ -136,6 +140,41 @@ struct btrfs_multi_bio { struct btrfs_bio_stripe stripes[]; }; +struct btrfs_device_info { + struct btrfs_device *dev; + u64 dev_offset; + u64 max_avail; +}; + +struct map_lookup { + u64 type; + int io_align; + int io_width; + int stripe_len; + int sector_size; + int num_stripes; + int sub_stripes; + struct btrfs_bio_stripe stripes[]; +}; + +/* Used to sort the devices by max_avail(descending sort) */ +int btrfs_cmp_device_free_bytes(const void *dev_info1, const void 
*dev_info2); + +/* + * sort the devices by max_avail, in which max free extent size of each device + * is stored.(Descending Sort) + */ +static inline void btrfs_descending_sort_devices( + struct btrfs_device_info *devices, + size_t nr_devices) +{ + sort(devices, nr_devices, sizeof(struct btrfs_device_info), + btrfs_cmp_device_free_bytes, NULL); +} + +int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, + u64 end, u64 *length); + #define btrfs_multi_bio_size(n) (sizeof(struct btrfs_multi_bio) + \ (sizeof(struct btrfs_bio_stripe) * (n))) diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 698fdd2c739c..cfd660550ded 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -180,11 +180,10 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_dir_item *di; - int ret = 0, slot, advance; + int ret = 0, slot; size_t total_size = 0, size_left = size; unsigned long name_ptr; size_t name_len; - u32 nritems; /* * ok we want all objects associated with this id. @@ -204,34 +203,24 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) goto err; - advance = 0; + while (1) { leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); slot = path->slots[0]; /* this is where we start walking through the path */ - if (advance || slot >= nritems) { + if (slot >= btrfs_header_nritems(leaf)) { /* * if we've reached the last slot in this leaf we need * to go to the next leaf and reset everything */ - if (slot >= nritems-1) { - ret = btrfs_next_leaf(root, path); - if (ret) - break; - leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); - slot = path->slots[0]; - } else { - /* - * just walking through the slots on this leaf - */ - slot++; - path->slots[0]++; - } + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto err; + else if (ret > 0) + break; + continue; } - advance = 1; btrfs_item_key_to_cpu(leaf, &found_key, slot); @@ -242,13 +231,15 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) break; di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); + if (verify_dir_item(root, leaf, di)) + continue; name_len = btrfs_dir_name_len(leaf, di); total_size += name_len + 1; /* we are just looking for how big our buffer needs to be */ if (!size) - continue; + goto next; if (!buffer || (name_len + 1) > size_left) { ret = -ERANGE; @@ -261,6 +252,8 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) size_left -= name_len + 1; buffer += name_len + 1; +next: + path->slots[0]++; } ret = total_size; @@ -316,6 +309,15 @@ ssize_t btrfs_getxattr(struct dentry *dentry, const char *name, int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { + struct btrfs_root *root = BTRFS_I(dentry->d_inode)->root; + + /* + * The permission on security.* and system.* is not checked + * in permission(). + */ + if (btrfs_root_readonly(root)) + return -EROFS; + /* * If this is a request for a synthetic attribute in the system.* * namespace use the generic infrastructure to resolve a handler @@ -336,6 +338,15 @@ int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value, int btrfs_removexattr(struct dentry *dentry, const char *name) { + struct btrfs_root *root = BTRFS_I(dentry->d_inode)->root; + + /* + * The permission on security.* and system.* is not checked + * in permission(). 
+ */ + if (btrfs_root_readonly(root)) + return -EROFS; + /* * If this is a request for a synthetic attribute in the system.* * namespace use the generic infrastructure to resolve a handler @@ -352,7 +363,8 @@ int btrfs_removexattr(struct dentry *dentry, const char *name) } int btrfs_xattr_security_init(struct btrfs_trans_handle *trans, - struct inode *inode, struct inode *dir) + struct inode *inode, struct inode *dir, + const struct qstr *qstr) { int err; size_t len; @@ -360,7 +372,8 @@ int btrfs_xattr_security_init(struct btrfs_trans_handle *trans, char *suffix; char *name; - err = security_inode_init_security(inode, dir, &suffix, &value, &len); + err = security_inode_init_security(inode, dir, qstr, &suffix, &value, + &len); if (err) { if (err == -EOPNOTSUPP) return 0; diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h index 7a43fd640bbb..b3cc8039134b 100644 --- a/fs/btrfs/xattr.h +++ b/fs/btrfs/xattr.h @@ -37,6 +37,7 @@ extern int btrfs_setxattr(struct dentry *dentry, const char *name, extern int btrfs_removexattr(struct dentry *dentry, const char *name); extern int btrfs_xattr_security_init(struct btrfs_trans_handle *trans, - struct inode *inode, struct inode *dir); + struct inode *inode, struct inode *dir, + const struct qstr *qstr); #endif /* __XATTR__ */ diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index b9cd5445f71c..faccd47c6c46 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -32,15 +32,6 @@ #include <linux/bio.h> #include "compression.h" -/* Plan: call deflate() with avail_in == *sourcelen, - avail_out = *dstlen - 12 and flush == Z_FINISH. - If it doesn't manage to finish, call it again with - avail_in == 0 and avail_out set to the remaining 12 - bytes for it to clean up. - Q: Is 12 bytes sufficient? -*/ -#define STREAM_END_SPACE 12 - struct workspace { z_stream inf_strm; z_stream def_strm; @@ -48,152 +39,52 @@ struct workspace { struct list_head list; }; -static LIST_HEAD(idle_workspace); -static DEFINE_SPINLOCK(workspace_lock); -static unsigned long num_workspace; -static atomic_t alloc_workspace = ATOMIC_INIT(0); -static DECLARE_WAIT_QUEUE_HEAD(workspace_wait); - -/* - * this finds an available zlib workspace or allocates a new one - * NULL or an ERR_PTR is returned if things go bad. 
- */ -static struct workspace *find_zlib_workspace(void) +static void zlib_free_workspace(struct list_head *ws) { - struct workspace *workspace; - int ret; - int cpus = num_online_cpus(); - -again: - spin_lock(&workspace_lock); - if (!list_empty(&idle_workspace)) { - workspace = list_entry(idle_workspace.next, struct workspace, - list); - list_del(&workspace->list); - num_workspace--; - spin_unlock(&workspace_lock); - return workspace; - - } - spin_unlock(&workspace_lock); - if (atomic_read(&alloc_workspace) > cpus) { - DEFINE_WAIT(wait); - prepare_to_wait(&workspace_wait, &wait, TASK_UNINTERRUPTIBLE); - if (atomic_read(&alloc_workspace) > cpus) - schedule(); - finish_wait(&workspace_wait, &wait); - goto again; - } - atomic_inc(&alloc_workspace); - workspace = kzalloc(sizeof(*workspace), GFP_NOFS); - if (!workspace) { - ret = -ENOMEM; - goto fail; - } - - workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize()); - if (!workspace->def_strm.workspace) { - ret = -ENOMEM; - goto fail; - } - workspace->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize()); - if (!workspace->inf_strm.workspace) { - ret = -ENOMEM; - goto fail_inflate; - } - workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS); - if (!workspace->buf) { - ret = -ENOMEM; - goto fail_kmalloc; - } - return workspace; + struct workspace *workspace = list_entry(ws, struct workspace, list); -fail_kmalloc: - vfree(workspace->inf_strm.workspace); -fail_inflate: - vfree(workspace->def_strm.workspace); -fail: - kfree(workspace); - atomic_dec(&alloc_workspace); - wake_up(&workspace_wait); - return ERR_PTR(ret); -} - -/* - * put a workspace struct back on the list or free it if we have enough - * idle ones sitting around - */ -static int free_workspace(struct workspace *workspace) -{ - spin_lock(&workspace_lock); - if (num_workspace < num_online_cpus()) { - list_add_tail(&workspace->list, &idle_workspace); - num_workspace++; - spin_unlock(&workspace_lock); - if (waitqueue_active(&workspace_wait)) - wake_up(&workspace_wait); - return 0; - } - spin_unlock(&workspace_lock); vfree(workspace->def_strm.workspace); vfree(workspace->inf_strm.workspace); kfree(workspace->buf); kfree(workspace); - - atomic_dec(&alloc_workspace); - if (waitqueue_active(&workspace_wait)) - wake_up(&workspace_wait); - return 0; } -/* - * cleanup function for module exit - */ -static void free_workspaces(void) +static struct list_head *zlib_alloc_workspace(void) { struct workspace *workspace; - while (!list_empty(&idle_workspace)) { - workspace = list_entry(idle_workspace.next, struct workspace, - list); - list_del(&workspace->list); - vfree(workspace->def_strm.workspace); - vfree(workspace->inf_strm.workspace); - kfree(workspace->buf); - kfree(workspace); - atomic_dec(&alloc_workspace); - } + + workspace = kzalloc(sizeof(*workspace), GFP_NOFS); + if (!workspace) + return ERR_PTR(-ENOMEM); + + workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize( + MAX_WBITS, MAX_MEM_LEVEL)); + workspace->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize()); + workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS); + if (!workspace->def_strm.workspace || + !workspace->inf_strm.workspace || !workspace->buf) + goto fail; + + INIT_LIST_HEAD(&workspace->list); + + return &workspace->list; +fail: + zlib_free_workspace(&workspace->list); + return ERR_PTR(-ENOMEM); } -/* - * given an address space and start/len, compress the bytes. 
- * - * pages are allocated to hold the compressed result and stored - * in 'pages' - * - * out_pages is used to return the number of pages allocated. There - * may be pages allocated even if we return an error - * - * total_in is used to return the number of bytes actually read. It - * may be smaller then len if we had to exit early because we - * ran out of room in the pages array or because we cross the - * max_out threshold. - * - * total_out is used to return the total number of compressed bytes - * - * max_out tells us the max number of bytes that we're allowed to - * stuff into pages - */ -int btrfs_zlib_compress_pages(struct address_space *mapping, - u64 start, unsigned long len, - struct page **pages, - unsigned long nr_dest_pages, - unsigned long *out_pages, - unsigned long *total_in, - unsigned long *total_out, - unsigned long max_out) +static int zlib_compress_pages(struct list_head *ws, + struct address_space *mapping, + u64 start, unsigned long len, + struct page **pages, + unsigned long nr_dest_pages, + unsigned long *out_pages, + unsigned long *total_in, + unsigned long *total_out, + unsigned long max_out) { + struct workspace *workspace = list_entry(ws, struct workspace, list); int ret; - struct workspace *workspace; char *data_in; char *cpage_out; int nr_pages = 0; @@ -205,10 +96,6 @@ int btrfs_zlib_compress_pages(struct address_space *mapping, *total_out = 0; *total_in = 0; - workspace = find_zlib_workspace(); - if (IS_ERR(workspace)) - return -1; - if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) { printk(KERN_WARNING "deflateInit failed\n"); ret = -1; @@ -222,6 +109,10 @@ int btrfs_zlib_compress_pages(struct address_space *mapping, data_in = kmap(in_page); out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (out_page == NULL) { + ret = -1; + goto out; + } cpage_out = kmap(out_page); pages[0] = out_page; nr_pages = 1; @@ -260,6 +151,10 @@ int btrfs_zlib_compress_pages(struct address_space *mapping, goto out; } out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (out_page == NULL) { + ret = -1; + goto out; + } cpage_out = kmap(out_page); pages[nr_pages] = out_page; nr_pages++; @@ -314,55 +209,26 @@ out: kunmap(in_page); page_cache_release(in_page); } - free_workspace(workspace); return ret; } -/* - * pages_in is an array of pages with compressed data. - * - * disk_start is the starting logical offset of this array in the file - * - * bvec is a bio_vec of pages from the file that we want to decompress into - * - * vcnt is the count of pages in the biovec - * - * srclen is the number of bytes in pages_in - * - * The basic idea is that we have a bio that was created by readpages. - * The pages in the bio are for the uncompressed data, and they may not - * be contiguous. They all correspond to the range of bytes covered by - * the compressed extent. 
- */
-int btrfs_zlib_decompress_biovec(struct page **pages_in,
-                              u64 disk_start,
-                              struct bio_vec *bvec,
-                              int vcnt,
-                              size_t srclen)
+static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
+                                  u64 disk_start,
+                                  struct bio_vec *bvec,
+                                  int vcnt,
+                                  size_t srclen)
 {
-        int ret = 0;
+        struct workspace *workspace = list_entry(ws, struct workspace, list);
+        int ret = 0, ret2;
         int wbits = MAX_WBITS;
-        struct workspace *workspace;
         char *data_in;
         size_t total_out = 0;
-        unsigned long page_bytes_left;
         unsigned long page_in_index = 0;
         unsigned long page_out_index = 0;
-        struct page *page_out;
         unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) /
                                         PAGE_CACHE_SIZE;
         unsigned long buf_start;
-        unsigned long buf_offset;
-        unsigned long bytes;
-        unsigned long working_bytes;
         unsigned long pg_offset;
-        unsigned long start_byte;
-        unsigned long current_buf_start;
-        char *kaddr;
-
-        workspace = find_zlib_workspace();
-        if (IS_ERR(workspace))
-                return -ENOMEM;
 
         data_in = kmap(pages_in[page_in_index]);
         workspace->inf_strm.next_in = data_in;
@@ -372,8 +238,6 @@ int btrfs_zlib_decompress_biovec(struct page **pages_in,
         workspace->inf_strm.total_out = 0;
         workspace->inf_strm.next_out = workspace->buf;
         workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
-        page_out = bvec[page_out_index].bv_page;
-        page_bytes_left = PAGE_CACHE_SIZE;
         pg_offset = 0;
 
         /* If it's deflate, and it's got no preset dictionary, then
@@ -389,107 +253,29 @@ int btrfs_zlib_decompress_biovec(struct page **pages_in,
 
         if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
                 printk(KERN_WARNING "inflateInit failed\n");
-                ret = -1;
-                goto out;
+                return -1;
         }
         while (workspace->inf_strm.total_in < srclen) {
                 ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH);
                 if (ret != Z_OK && ret != Z_STREAM_END)
                         break;
-                /*
-                 * buf start is the byte offset we're of the start of
-                 * our workspace buffer
-                 */
-                buf_start = total_out;
 
-                /* total_out is the last byte of the workspace buffer */
+                buf_start = total_out;
                 total_out = workspace->inf_strm.total_out;
 
-                working_bytes = total_out - buf_start;
-
-                /*
-                 * start byte is the first byte of the page we're currently
-                 * copying into relative to the start of the compressed data.
-                 */
-                start_byte = page_offset(page_out) - disk_start;
-
-                if (working_bytes == 0) {
-                        /* we didn't make progress in this inflate
-                         * call, we're done
-                         */
-                        if (ret != Z_STREAM_END)
-                                ret = -1;
+                /* we didn't make progress in this inflate call, we're done */
+                if (buf_start == total_out)
                         break;
-                }
 
-                /* we haven't yet hit data corresponding to this page */
-                if (total_out <= start_byte)
-                        goto next;
-
-                /*
-                 * the start of the data we care about is offset into
-                 * the middle of our working buffer
-                 */
-                if (total_out > start_byte && buf_start < start_byte) {
-                        buf_offset = start_byte - buf_start;
-                        working_bytes -= buf_offset;
-                } else {
-                        buf_offset = 0;
-                }
-                current_buf_start = buf_start;
-
-                /* copy bytes from the working buffer into the pages */
-                while (working_bytes > 0) {
-                        bytes = min(PAGE_CACHE_SIZE - pg_offset,
-                                    PAGE_CACHE_SIZE - buf_offset);
-                        bytes = min(bytes, working_bytes);
-                        kaddr = kmap_atomic(page_out, KM_USER0);
-                        memcpy(kaddr + pg_offset, workspace->buf + buf_offset,
-                               bytes);
-                        kunmap_atomic(kaddr, KM_USER0);
-                        flush_dcache_page(page_out);
-
-                        pg_offset += bytes;
-                        page_bytes_left -= bytes;
-                        buf_offset += bytes;
-                        working_bytes -= bytes;
-                        current_buf_start += bytes;
-
-                        /* check if we need to pick another page */
-                        if (page_bytes_left == 0) {
-                                page_out_index++;
-                                if (page_out_index >= vcnt) {
-                                        ret = 0;
-                                        goto done;
-                                }
-
-                                page_out = bvec[page_out_index].bv_page;
-                                pg_offset = 0;
-                                page_bytes_left = PAGE_CACHE_SIZE;
-                                start_byte = page_offset(page_out) - disk_start;
-
-                                /*
-                                 * make sure our new page is covered by this
-                                 * working buffer
-                                 */
-                                if (total_out <= start_byte)
-                                        goto next;
-
-                                /* the next page in the biovec might not
-                                 * be adjacent to the last page, but it
-                                 * might still be found inside this working
-                                 * buffer. bump our offset pointer
-                                 */
-                                if (total_out > start_byte &&
-                                    current_buf_start < start_byte) {
-                                        buf_offset = start_byte - buf_start;
-                                        working_bytes = total_out - start_byte;
-                                        current_buf_start = buf_start +
-                                                buf_offset;
-                                }
-                        }
+                ret2 = btrfs_decompress_buf2page(workspace->buf, buf_start,
+                                                 total_out, disk_start,
+                                                 bvec, vcnt,
+                                                 &page_out_index, &pg_offset);
+                if (ret2 == 0) {
+                        ret = 0;
+                        goto done;
                 }
-next:
+
                 workspace->inf_strm.next_out = workspace->buf;
                 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
@@ -516,35 +302,21 @@ done:
         zlib_inflateEnd(&workspace->inf_strm);
         if (data_in)
                 kunmap(pages_in[page_in_index]);
-out:
-        free_workspace(workspace);
         return ret;
 }
 
-/*
- * a less complex decompression routine. Our compressed data fits in a
- * single page, and we want to read a single page out of it.
- * start_byte tells us the offset into the compressed data we're interested in
- */
-int btrfs_zlib_decompress(unsigned char *data_in,
-                          struct page *dest_page,
-                          unsigned long start_byte,
-                          size_t srclen, size_t destlen)
+static int zlib_decompress(struct list_head *ws, unsigned char *data_in,
+                           struct page *dest_page,
+                           unsigned long start_byte,
+                           size_t srclen, size_t destlen)
 {
+        struct workspace *workspace = list_entry(ws, struct workspace, list);
         int ret = 0;
         int wbits = MAX_WBITS;
-        struct workspace *workspace;
         unsigned long bytes_left = destlen;
         unsigned long total_out = 0;
         char *kaddr;
 
-        if (destlen > PAGE_CACHE_SIZE)
-                return -ENOMEM;
-
-        workspace = find_zlib_workspace();
-        if (IS_ERR(workspace))
-                return -ENOMEM;
-
         workspace->inf_strm.next_in = data_in;
         workspace->inf_strm.avail_in = srclen;
         workspace->inf_strm.total_in = 0;
@@ -565,8 +337,7 @@ int btrfs_zlib_decompress(unsigned char *data_in,
 
         if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
                 printk(KERN_WARNING "inflateInit failed\n");
-                ret = -1;
-                goto out;
+                return -1;
         }
 
         while (bytes_left > 0) {
@@ -616,12 +387,13 @@ next:
                 ret = 0;
 
         zlib_inflateEnd(&workspace->inf_strm);
-out:
-        free_workspace(workspace);
         return ret;
 }
 
-void btrfs_zlib_exit(void)
-{
-        free_workspaces();
-}
+struct btrfs_compress_op btrfs_zlib_compress = {
+        .alloc_workspace        = zlib_alloc_workspace,
+        .free_workspace         = zlib_free_workspace,
+        .compress_pages         = zlib_compress_pages,
+        .decompress_biovec      = zlib_decompress_biovec,
+        .decompress             = zlib_decompress,
+};
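
A note on how the new btrfs_compress_op hooks fit together (illustrative sketch, not part of the diff above): zlib.c now keeps only the algorithm-specific callbacks and hands workspace management to generic code, so a caller picks an entry from a per-algorithm ops table and passes the opaque list_head workspace handle into each callback. The callback signatures below are taken from the diff; the compress_ops[] array name, the plain integer type index, and the alloc/free-per-call pattern are assumptions made for the example (a real implementation would reuse workspaces from an idle pool rather than allocate one per call).

/* illustrative sketch only; see the hedging note above */
static struct btrfs_compress_op *compress_ops[] = {
        &btrfs_zlib_compress,           /* index 0 assumed to mean zlib here */
};

static int decompress_one_page(int type, unsigned char *data_in,
                               struct page *dest_page,
                               unsigned long start_byte,
                               size_t srclen, size_t destlen)
{
        struct list_head *ws;
        int ret;

        /* give the algorithm a private workspace via its list_head handle */
        ws = compress_ops[type]->alloc_workspace();
        if (IS_ERR(ws))
                return PTR_ERR(ws);

        ret = compress_ops[type]->decompress(ws, data_in, dest_page,
                                             start_byte, srclen, destlen);

        compress_ops[type]->free_workspace(ws);
        return ret;
}

The point of the indirection is that callers never need to know which compressor produced an extent: each algorithm supplies the same five callbacks, and the idle-workspace list, per-CPU cap and wait queue that were open-coded in the removed find_zlib_workspace()/free_workspace() can be shared by every algorithm instead of being duplicated per file.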