From 0b86a832a1f38abec695864ec2eaedc9d2383f1b Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 24 Mar 2008 15:01:56 -0400
Subject: Btrfs: Add support for multiple devices per filesystem

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 852 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 852 insertions(+)
 create mode 100644 fs/btrfs/volumes.c

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
new file mode 100644
index 000000000000..90a8d45dc6d7
--- /dev/null
+++ b/fs/btrfs/volumes.c
@@ -0,0 +1,852 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+#include <linux/sched.h>
+#include <linux/bio.h>
+#include "ctree.h"
+#include "extent_map.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "print-tree.h"
+#include "volumes.h"
+
+struct map_lookup {
+	struct btrfs_device *dev;	/* device that holds the mapped range */
+	u64 physical;			/* byte offset of the range on dev */
+};
+
+/*
+ * find a hole of at least num_bytes on the device and return its offset in
+ * *start (0 on success, -ENOSPC or a search error otherwise).  This is a
+ * simple linear scan: it should be called rarely, on devices with few extents
+ */
+static int find_free_dev_extent(struct btrfs_trans_handle *trans,
+				struct btrfs_device *device,
+				struct btrfs_path *path,
+				u64 num_bytes, u64 *start)
+{
+	struct btrfs_key key;
+	struct btrfs_root *root = device->dev_root;
+	struct btrfs_dev_extent *dev_extent = NULL;
+	u64 hole_size = 0;
+	u64 last_byte = 0;	/* end of the last dev extent seen so far */
+	u64 search_start = 0;
+	u64 search_end = device->total_bytes;
+	int ret;
+	int slot = 0;
+	int start_found;
+	struct extent_buffer *l;
+
+	start_found = 0;
+	path->reada = 2;
+
+	/* FIXME use last free of some kind */
+
+	key.objectid = device->devid;
+	key.offset = search_start;
+	key.type = BTRFS_DEV_EXTENT_KEY;
+	ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
+	if (ret < 0)
+		goto error;
+	ret = btrfs_previous_item(root, path, 0, key.type);
+	if (ret < 0)
+		goto error;
+	l = path->nodes[0];
+	btrfs_item_key_to_cpu(l, &key, path->slots[0]);
+	while (1) {
+		l = path->nodes[0];
+		slot = path->slots[0];
+		if (slot >= btrfs_header_nritems(l)) {
+			ret = btrfs_next_leaf(root, path);
+			if (ret == 0)
+				continue;
+			if (ret < 0)
+				goto error;
+no_more_items:
+			if (!start_found) {	/* no dev extent of this device seen */
+				if (search_start >= search_end) {
+					ret = -ENOSPC;
+					goto error;
+				}
+				*start = search_start;
+				start_found = 1;
+				goto check_pending;
+			}
+			*start = last_byte > search_start ?	/* after the last extent */
+				last_byte : search_start;
+			if (search_end <= *start) {
+				ret = -ENOSPC;
+				goto error;
+			}
+			goto check_pending;
+		}
+		btrfs_item_key_to_cpu(l, &key, slot);
+
+		if (key.objectid < device->devid)	/* item of a lower objectid */
+			goto next;
+
+		if (key.objectid > device->devid)	/* past this device's items */
+			goto no_more_items;
+
+		if (key.offset >= search_start && key.offset > last_byte &&
+		    start_found) {
+			if (last_byte < search_start)
+				last_byte = search_start;
+			hole_size = key.offset - last_byte;	/* gap before this item */
+			if (key.offset > last_byte &&
+			    hole_size >= num_bytes) {
+				*start = last_byte;
+				goto check_pending;
+			}
+		}
+		if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY) {
+			goto next;
+		}
+
+		start_found = 1;
+		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
+		last_byte = key.offset + btrfs_dev_extent_length(l, dev_extent);
+next:
+		path->slots[0]++;
+		cond_resched();
+	}
+check_pending:
+	/* we have to make sure we didn't find an extent that has already
+	 * been allocated by the map tree or the original allocation
+	 */
+	btrfs_release_path(root, path);
+	BUG_ON(*start < search_start);
+
+	/* NOTE(review): >= also rejects a hole ending exactly at search_end */
+	if (*start + num_bytes >= search_end) {
+		ret = -ENOSPC;
+		goto error;
+	}
+	return 0;	/* pending inserts are not checked here yet */
+
+error:
+	btrfs_release_path(root, path);
+	return ret;
+}
+
+/* reserve num_bytes on the device for owner; *start gets the device offset */
+int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
+			   struct btrfs_device *device,
+			   u64 owner, u64 num_bytes, u64 *start)
+{
+	struct btrfs_root *dev_root = device->dev_root;
+	struct btrfs_path *path = btrfs_alloc_path();
+	struct btrfs_dev_extent *de;
+	struct extent_buffer *eb;
+	struct btrfs_key key;
+	int ret;
+
+	if (!path)
+		return -ENOMEM;
+
+	ret = find_free_dev_extent(trans, device, path, num_bytes, start);
+	if (!ret) {
+		/* the hole becomes a dev extent item keyed by (devid, offset) */
+		key.objectid = device->devid;
+		key.type = BTRFS_DEV_EXTENT_KEY;
+		key.offset = *start;
+		ret = btrfs_insert_empty_item(trans, dev_root, path, &key,
+					      sizeof(*de));
+		BUG_ON(ret);
+
+		eb = path->nodes[0];
+		de = btrfs_item_ptr(eb, path->slots[0],
+				    struct btrfs_dev_extent);
+		btrfs_set_dev_extent_owner(eb, de, owner);
+		btrfs_set_dev_extent_length(eb, de, num_bytes);
+		btrfs_mark_buffer_dirty(eb);
+	}
+
+	btrfs_free_path(path);
+	return ret;
+}
+
+/* find the first logical byte past the last chunk (0 if none); 0 or -errno */
+static int find_next_chunk(struct btrfs_root *root, u64 *objectid)
+{
+	struct btrfs_path *path;
+	int ret;
+	struct btrfs_key key, found_key;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	key.objectid = (u64)-1;
+	key.offset = (u64)-1;
+	key.type = BTRFS_CHUNK_ITEM_KEY;
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret < 0)
+		goto error;
+	BUG_ON(ret == 0);
+
+	ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY);
+	if (ret < 0)
+		goto error;	/* a failed lookup is not an empty tree */
+	*objectid = 0;
+	if (ret == 0) {
+		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
+				      path->slots[0]);
+		*objectid = found_key.objectid + found_key.offset;
+	}
+	ret = 0;
+error:
+	btrfs_free_path(path);
+	return ret;
+}
+
+/* round robin: return the device that follows last, skipping the list head */
+static struct btrfs_device *next_device(struct list_head *head,
+					struct list_head *last)
+{
+	struct list_head *pos = last->next;
+
+	if (list_empty(head))
+		return NULL;
+
+	/* the head itself is not embedded in a btrfs_device, step over it */
+	if (pos == head)
+		pos = head->next;
+
+	return list_entry(pos, struct btrfs_device, dev_list);
+}
+
+/* pick the next unused device id (1 if no dev items exist); 0 or -errno */
+static int find_next_devid(struct btrfs_root *root, struct btrfs_path *path,
+			   u64 *objectid)
+{
+	int ret;
+	struct btrfs_key key;
+	struct btrfs_key found_key;
+
+	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
+	key.type = BTRFS_DEV_ITEM_KEY;
+	key.offset = (u64)-1;
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret < 0)
+		goto error;
+	BUG_ON(ret == 0);
+
+	ret = btrfs_previous_item(root, path, BTRFS_DEV_ITEMS_OBJECTID,
+				  BTRFS_DEV_ITEM_KEY);
+	if (ret < 0)
+		goto error;	/* a failed lookup is not an empty tree */
+	*objectid = 1;
+	if (ret == 0) {
+		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
+				      path->slots[0]);
+		*objectid = found_key.offset + 1;
+	}
+	ret = 0;
+error:
+	btrfs_release_path(root, path);
+	return ret;
+}
+
+/*
+ * the device information is stored in the chunk root.  the btrfs_device
+ * struct should be fully filled in, except devid which is assigned here
+ */
+int btrfs_add_device(struct btrfs_trans_handle *trans,
+		     struct btrfs_root *root,
+		     struct btrfs_device *device)
+{
+	int ret;
+	struct btrfs_path *path;
+	struct btrfs_dev_item *dev_item;
+	struct extent_buffer *leaf;
+	struct btrfs_key key;
+	unsigned long ptr;
+	u64 free_devid;
+
+	root = root->fs_info->chunk_root;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	ret = find_next_devid(root, path, &free_devid);
+	if (ret)
+		goto out;
+
+	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
+	key.type = BTRFS_DEV_ITEM_KEY;
+	key.offset = free_devid;
+
+	ret = btrfs_insert_empty_item(trans, root, path, &key,
+				      sizeof(*dev_item) + device->name_len);
+	if (ret)
+		goto out;
+
+	leaf = path->nodes[0];
+	dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
+
+	device->devid = free_devid;	/* later lookups key on device->devid */
+	btrfs_set_device_id(leaf, dev_item, device->devid);
+	btrfs_set_device_type(leaf, dev_item, device->type);
+	btrfs_set_device_io_align(leaf, dev_item, device->io_align);
+	btrfs_set_device_io_width(leaf, dev_item, device->io_width);
+	btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
+	btrfs_set_device_rdev(leaf, dev_item, device->rdev);
+	btrfs_set_device_partition(leaf, dev_item, device->partition);
+	btrfs_set_device_name_len(leaf, dev_item, device->name_len);
+	btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes);
+	btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
+
+	ptr = (unsigned long)btrfs_device_name(dev_item);
+	write_extent_buffer(leaf, device->name, ptr, device->name_len);
+
+	ptr = (unsigned long)btrfs_device_uuid(dev_item);
+	write_extent_buffer(leaf, device->uuid, ptr, BTRFS_DEV_UUID_SIZE);
+	btrfs_mark_buffer_dirty(leaf);
+
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+/*
+ * copy the fields of the in-memory device into its dev item in the chunk
+ * tree.  returns -ENOENT if no dev item is keyed by device->devid
+ */
+int btrfs_update_device(struct btrfs_trans_handle *trans,
+			struct btrfs_device *device)
+{
+	struct btrfs_root *chunk_root;
+	struct btrfs_dev_item *item;
+	struct extent_buffer *eb;
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	int ret;
+
+	chunk_root = device->dev_root->fs_info->chunk_root;
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
+	key.type = BTRFS_DEV_ITEM_KEY;
+	key.offset = device->devid;
+	ret = btrfs_search_slot(trans, chunk_root, &key, path, 0, 1);
+	if (ret > 0)
+		ret = -ENOENT;
+	if (ret)
+		goto out;
+
+	eb = path->nodes[0];
+	item = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_item);
+
+	/* the name and the uuid are left as they are in the item */
+	btrfs_set_device_id(eb, item, device->devid);
+	btrfs_set_device_type(eb, item, device->type);
+	btrfs_set_device_io_align(eb, item, device->io_align);
+	btrfs_set_device_io_width(eb, item, device->io_width);
+	btrfs_set_device_sector_size(eb, item, device->sector_size);
+	btrfs_set_device_rdev(eb, item, device->rdev);
+	btrfs_set_device_partition(eb, item, device->partition);
+	btrfs_set_device_total_bytes(eb, item, device->total_bytes);
+	btrfs_set_device_bytes_used(eb, item, device->bytes_used);
+	btrfs_mark_buffer_dirty(eb);
+
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+/* append a (disk key, chunk item) pair to the super block sys_chunk_array */
+int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
+			   struct btrfs_root *root,
+			   struct btrfs_key *key,
+			   struct btrfs_chunk *chunk, int item_size)
+{
+	struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
+	struct btrfs_disk_key disk_key;
+	u32 array_size = btrfs_super_sys_array_size(super_copy);
+	u32 new_size = array_size + sizeof(disk_key) + item_size;
+	u8 *ptr;
+
+	/* the key is stored in front of the chunk and needs room as well */
+	if (new_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)
+		return -EFBIG;
+
+	ptr = super_copy->sys_chunk_array + array_size;
+	btrfs_cpu_key_to_disk(&disk_key, key);
+	memcpy(ptr, &disk_key, sizeof(disk_key));
+	memcpy(ptr + sizeof(disk_key), chunk, item_size);
+	btrfs_set_super_sys_array_size(super_copy, new_size);
+	return 0;
+}
+
+/*
+ * allocate a new chunk of logical address space: reserve a dev extent on the
+ * next device in round robin order, insert the chunk item into the chunk
+ * tree and cache the logical->physical mapping in the mapping tree.
+ * *start and *num_bytes return the logical start and size of the chunk.
+ * returns 0 or a negative errno; most failures are BUG_ON()s for now
+ */
+int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
+		      struct btrfs_root *extent_root, u64 *start,
+		      u64 *num_bytes, u32 type)
+{
+	u64 dev_offset;
+	struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
+	struct btrfs_stripe *stripes;
+	struct btrfs_device *device = NULL;
+	struct btrfs_chunk *chunk;
+	struct list_head *dev_list = &extent_root->fs_info->devices;
+	struct list_head *last_dev = extent_root->fs_info->last_device;
+	struct extent_map_tree *em_tree;
+	struct map_lookup *map;
+	struct extent_map *em;
+	u64 physical;
+	u64 calc_size = 1024 * 1024 * 1024;
+	int num_stripes;
+	int ret;
+	int index = 0;
+	struct btrfs_key key;
+
+	ret = find_next_chunk(chunk_root, &key.objectid);
+	if (ret)
+		return ret;
+
+	num_stripes = 1;
+	chunk = kmalloc(btrfs_chunk_item_size(num_stripes), GFP_NOFS);
+	if (!chunk)
+		return -ENOMEM;
+
+	stripes = &chunk->stripe;
+
+	*num_bytes = calc_size;
+	while(index < num_stripes) {
+		device = next_device(dev_list, last_dev);
+		BUG_ON(!device);
+		last_dev = &device->dev_list;
+		extent_root->fs_info->last_device = last_dev;
+
+		ret = btrfs_alloc_dev_extent(trans, device,
+					     key.objectid,
+					     calc_size, &dev_offset);
+		BUG_ON(ret);
+
+		device->bytes_used += calc_size;
+		ret = btrfs_update_device(trans, device);
+		BUG_ON(ret);
+
+		btrfs_set_stack_stripe_devid(stripes + index, device->devid);
+		btrfs_set_stack_stripe_offset(stripes + index, dev_offset);
+		physical = dev_offset;
+		index++;
+	}
+
+	/* key.objectid was set above */
+	key.offset = *num_bytes;
+	key.type = BTRFS_CHUNK_ITEM_KEY;
+	btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
+	btrfs_set_stack_chunk_stripe_len(chunk, 64 * 1024);
+	btrfs_set_stack_chunk_type(chunk, type);
+	btrfs_set_stack_chunk_num_stripes(chunk, num_stripes);
+	btrfs_set_stack_chunk_io_align(chunk, extent_root->sectorsize);
+	btrfs_set_stack_chunk_io_width(chunk, extent_root->sectorsize);
+	btrfs_set_stack_chunk_sector_size(chunk, extent_root->sectorsize);
+
+	ret = btrfs_insert_item(trans, chunk_root, &key, chunk,
+				btrfs_chunk_item_size(num_stripes));
+	BUG_ON(ret);
+	kfree(chunk);	/* copied into the tree; no error path below leaks it */
+	*start = key.objectid;
+
+	em = alloc_extent_map(GFP_NOFS);
+	if (!em)
+		return -ENOMEM;
+	map = kmalloc(sizeof(*map), GFP_NOFS);
+	if (!map) {
+		free_extent_map(em);
+		return -ENOMEM;
+	}
+
+	em->bdev = (struct block_device *)map;	/* bdev carries the map_lookup */
+	em->start = key.objectid;
+	em->len = key.offset;
+	em->block_start = 0;
+
+	map->physical = physical;
+	map->dev = device;	/* never NULL, see the BUG_ON in the loop */
+
+	em_tree = &extent_root->fs_info->mapping_tree.map_tree;
+	spin_lock(&em_tree->lock);
+	ret = add_extent_mapping(em_tree, em);
+	BUG_ON(ret);
+	spin_unlock(&em_tree->lock);
+	free_extent_map(em);
+	return ret;
+}
+
+void btrfs_mapping_init(struct btrfs_mapping_tree *tree)
+{
+	extent_map_tree_init(&tree->map_tree, GFP_NOFS); /* embedded em tree */
+}
+
+/* empty the mapping tree, freeing every extent map and its map_lookup */
+void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
+{
+	struct extent_map_tree *em_tree = &tree->map_tree;
+	struct extent_map *em;
+
+	for (;;) {
+		spin_lock(&em_tree->lock);
+		em = lookup_extent_mapping(em_tree, 0, (u64)-1);
+		if (em)
+			remove_extent_mapping(em_tree, em);
+		spin_unlock(&em_tree->lock);
+		if (!em)
+			return;
+		kfree(em->bdev);	/* bdev holds the map_lookup */
+		free_extent_map(em);	/* once for us */
+		free_extent_map(em);	/* once for the tree */
+	}
+}
+
+/* map logical to (dev, phys); *length returns the bytes left in the mapping */
+int btrfs_map_block(struct btrfs_mapping_tree *map_tree,
+		    u64 logical, u64 *phys, u64 *length,
+		    struct btrfs_device **dev)
+{
+	struct extent_map_tree *em_tree = &map_tree->map_tree;
+	struct map_lookup *lookup;
+	struct extent_map *em;
+	u64 delta;
+
+	spin_lock(&em_tree->lock);
+	em = lookup_extent_mapping(em_tree, logical, *length);
+	BUG_ON(!em);
+	BUG_ON(em->start > logical || em->start + em->len < logical);
+
+	lookup = (struct map_lookup *)em->bdev;
+	delta = logical - em->start;
+	*phys = lookup->physical + delta;
+	*length = em->len - delta;
+	*dev = lookup->dev;
+	free_extent_map(em);
+	spin_unlock(&em_tree->lock);
+	return 0;
+}
+
+/* map a bio from logical to physical addresses and submit it to the device */
+int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio)
+{
+	struct btrfs_mapping_tree *map_tree;
+	struct btrfs_device *dev;
+	u64 logical = (u64)bio->bi_sector << 9;	/* sector_t may be 32 bit */
+	u64 physical;
+	u64 length = 0;
+	u64 map_length;
+	struct bio_vec *bvec;
+	int i;
+
+	bio_for_each_segment(bvec, bio, i) {
+		length += bvec->bv_len;
+	}
+	map_tree = &root->fs_info->mapping_tree;
+	map_length = length;
+	btrfs_map_block(map_tree, logical, &physical, &map_length, &dev);
+	BUG_ON(map_length < length);	/* the bio is not split across mappings */
+	bio->bi_sector = physical >> 9;
+	bio->bi_bdev = dev->bdev;
+	submit_bio(rw, bio);
+	return 0;
+}
+
+/* return the device on the fs_info device list with this devid, or NULL */
+struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid)
+{
+	struct list_head *head = &root->fs_info->devices;
+	struct btrfs_device *found;
+	struct list_head *pos;
+
+	for (pos = head->next; pos != head; pos = pos->next) {
+		found = list_entry(pos, struct btrfs_device, dev_list);
+		if (found->devid == devid)
+			return found;
+	}
+	return NULL;
+}
+
+/*
+ * add a mapping for one chunk item to the mapping tree, unless the logical
+ * start of the chunk is already mapped.  only the first stripe is used
+ */
+static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
+			  struct extent_buffer *leaf,
+			  struct btrfs_chunk *chunk)
+{
+	struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
+	struct map_lookup *map;
+	struct extent_map *em;
+	u64 logical;
+	u64 length;
+	u64 devid;
+	int ret;
+
+	logical = key->objectid;
+	length = key->offset;
+	spin_lock(&map_tree->map_tree.lock);
+	em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
+
+	/* already mapped? */
+	if (em && em->start <= logical && em->start + em->len > logical) {
+		free_extent_map(em);
+		spin_unlock(&map_tree->map_tree.lock);
+		return 0;
+	} else if (em) {
+		free_extent_map(em);
+	}
+	spin_unlock(&map_tree->map_tree.lock);
+
+	em = alloc_extent_map(GFP_NOFS);
+	if (!em)
+		return -ENOMEM;
+	map = kmalloc(sizeof(*map), GFP_NOFS);	/* the only allocation of map */
+	if (!map) {
+		free_extent_map(em);
+		return -ENOMEM;
+	}
+
+	em->bdev = (struct block_device *)map;	/* bdev carries the map_lookup */
+	em->start = logical;
+	em->len = length;
+	em->block_start = 0;
+
+	map->physical = btrfs_stripe_offset_nr(leaf, chunk, 0);
+	devid = btrfs_stripe_devid_nr(leaf, chunk, 0);
+	map->dev = btrfs_find_device(root, devid);
+	if (!map->dev) {
+		kfree(map);
+		free_extent_map(em);
+		return -EIO;
+	}
+
+	spin_lock(&map_tree->map_tree.lock);
+	ret = add_extent_mapping(&map_tree->map_tree, em);
+	BUG_ON(ret);
+	spin_unlock(&map_tree->map_tree.lock);
+	free_extent_map(em);
+
+	return 0;
+}
+
+/* copy a dev item out of leaf into device; device->name is kmalloc'ed here */
+static int fill_device_from_item(struct extent_buffer *leaf,
+				 struct btrfs_dev_item *dev_item,
+				 struct btrfs_device *device)
+{
+	unsigned long offset;
+	char *buf;
+
+	device->devid = btrfs_device_id(leaf, dev_item);
+	device->total_bytes = btrfs_device_total_bytes(leaf, dev_item);
+	device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
+	device->type = btrfs_device_type(leaf, dev_item);
+	device->io_align = btrfs_device_io_align(leaf, dev_item);
+	device->io_width = btrfs_device_io_width(leaf, dev_item);
+	device->sector_size = btrfs_device_sector_size(leaf, dev_item);
+	device->rdev = btrfs_device_rdev(leaf, dev_item);
+	device->partition = btrfs_device_partition(leaf, dev_item);
+	device->name_len = btrfs_device_name_len(leaf, dev_item);
+	offset = (unsigned long)btrfs_device_uuid(dev_item);
+	read_extent_buffer(leaf, device->uuid, offset, BTRFS_DEV_UUID_SIZE);
+
+	/* one extra byte so the copied name can be nul terminated */
+	buf = kmalloc(device->name_len + 1, GFP_NOFS);
+	if (!buf)
+		return -ENOMEM;
+	offset = (unsigned long)btrfs_device_name(dev_item);
+	read_extent_buffer(leaf, buf, offset, device->name_len);
+	buf[device->name_len] = '\0';
+	device->name = buf;
+	return 0;
+}
+
+/* add an in-memory device for a dev item unless its devid is already known */
+static int read_one_dev(struct btrfs_root *root, struct btrfs_key *key,
+			struct extent_buffer *leaf,
+			struct btrfs_dev_item *dev_item)
+{
+	struct btrfs_device *device;
+	u64 devid;
+	int ret;
+
+	devid = btrfs_device_id(leaf, dev_item);
+	if (btrfs_find_device(root, devid))
+		return 0;
+
+	device = kmalloc(sizeof(*device), GFP_NOFS);
+	if (!device)
+		return -ENOMEM;
+
+	/* on failure device->name was never set, nothing else to undo */
+	ret = fill_device_from_item(leaf, dev_item, device);
+	if (ret) {
+		kfree(device);
+		return ret;
+	}
+	device->dev_root = root->fs_info->dev_root;
+	/* every device uses the bdev of the mounted super block for now */
+	device->bdev = root->fs_info->sb->s_bdev;
+	list_add(&device->dev_list, &root->fs_info->devices);
+	memcpy(&device->dev_key, key, sizeof(*key));
+	return 0;
+}
+
+int btrfs_read_sys_array(struct btrfs_root *root)
+{
+	struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
+	struct extent_buffer *sb = root->fs_info->sb_buffer;
+	struct btrfs_disk_key *disk_key;
+	struct btrfs_dev_item *dev_item;
+	struct btrfs_chunk *chunk;
+	struct btrfs_key key;
+	u32 num_stripes;
+	u32 array_size;
+	u32 len = 0;
+	u8 *ptr;		/* cursor into super_copy->sys_chunk_array */
+	unsigned long sb_ptr;	/* matching offset, used with the sb accessors */
+	u32 cur;		/* bytes of the array consumed so far */
+	int ret;
+	int dev_only = 1;	/* 1 during the device pass, 0 for chunks */
+
+	array_size = btrfs_super_sys_array_size(super_copy);
+
+	/*
+	 * walk the (disk key, item) pairs packed in the super block
+	 * sys_chunk_array.  the loop runs twice, device items first and
+	 * chunks second, so device structs exist for every chunk
+	 */
+again:
+	ptr = super_copy->sys_chunk_array;
+	sb_ptr = offsetof(struct btrfs_super_block, sys_chunk_array);
+	cur = 0;
+
+	while (cur < array_size) {
+		disk_key = (struct btrfs_disk_key *)ptr;
+		btrfs_disk_key_to_cpu(&key, disk_key);
+
+		len = sizeof(*disk_key);	/* step over the key to the item */
+		ptr += len;
+		sb_ptr += len;
+		cur += len;
+
+		if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID &&
+		    key.type == BTRFS_DEV_ITEM_KEY) {
+			dev_item = (struct btrfs_dev_item *)sb_ptr;
+			if (dev_only) {
+				ret = read_one_dev(root, &key, sb, dev_item);
+				BUG_ON(ret);
+			}
+			len = sizeof(*dev_item);
+			len += btrfs_device_name_len(sb, dev_item);
+		} else if (key.type == BTRFS_CHUNK_ITEM_KEY) {
+
+			chunk = (struct btrfs_chunk *)sb_ptr;
+			if (!dev_only) {
+				ret = read_one_chunk(root, &key, sb, chunk);
+				BUG_ON(ret);
+			}
+			num_stripes = btrfs_chunk_num_stripes(sb, chunk);
+			len = btrfs_chunk_item_size(num_stripes);
+		} else {
+			BUG();	/* only dev items and chunk items are handled */
+		}
+		ptr += len;	/* step over the item to the next key */
+		sb_ptr += len;
+		cur += len;
+	}
+	if (dev_only == 1) {
+		dev_only = 0;
+		goto again;
+	}
+	return 0;
+}
+
+/*
+ * read all of the device items and then all of the chunk items, so that
+ * chunk mappings can reference every device that was found
+ */
+int btrfs_read_chunk_tree(struct btrfs_root *root)
+{
+	struct btrfs_path *path;
+	struct extent_buffer *leaf;
+	struct btrfs_key key, found_key;
+	int ret;
+	int slot;
+
+	root = root->fs_info->chunk_root;
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
+	key.offset = 0;
+	key.type = 0;
+again:
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret < 0)
+		goto error;
+	while(1) {
+		leaf = path->nodes[0];
+		slot = path->slots[0];
+		if (slot >= btrfs_header_nritems(leaf)) {
+			ret = btrfs_next_leaf(root, path);
+			if (ret == 0)
+				continue;
+			if (ret < 0)
+				goto error;
+			break;
+		}
+		btrfs_item_key_to_cpu(leaf, &found_key, slot);
+		if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
+			if (found_key.objectid != BTRFS_DEV_ITEMS_OBJECTID)
+				break;
+			if (found_key.type == BTRFS_DEV_ITEM_KEY) {
+				struct btrfs_dev_item *dev_item;
+				dev_item = btrfs_item_ptr(leaf, slot,
+						  struct btrfs_dev_item);
+				ret = read_one_dev(root, &found_key, leaf,
+						   dev_item);
+				BUG_ON(ret);
+			}
+		} else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
+			struct btrfs_chunk *chunk;
+			chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
+			ret = read_one_chunk(root, &found_key, leaf, chunk);
+			if (ret)
+				goto error;
+		}
+		path->slots[0]++;
+	}
+	if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
+		key.objectid = 0;	/* second pass: restart from the first key */
+		btrfs_release_path(root, path);
+		goto again;
+	}
+	ret = 0;
+error:
+	btrfs_free_path(path);	/* freed on the error paths as well */
+	return ret;
+}
+
-- 
cgit v1.2.3


From 6324fbf334f4586325057197da7752f4ffa409d3 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 24 Mar 2008 15:01:59 -0400
Subject: Btrfs: Dynamic chunk and block group allocation

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       |  12 ++-
 fs/btrfs/disk-io.c     |   2 +-
 fs/btrfs/extent-tree.c | 208 ++++++++++++++++++++++++++++++++++++++++++++-----
 fs/btrfs/inode.c       |   9 ++-
 fs/btrfs/volumes.c     |  87 +++++++++++++--------
 fs/btrfs/volumes.h     |   2 +-
 6 files changed, 263 insertions(+), 57 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 1453d995fef6..a877105f5c47 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -431,9 +431,19 @@ struct btrfs_block_group_item {
 	__le64 flags;
 } __attribute__ ((__packed__));
 
+struct btrfs_space_info {
+	u64 flags;		/* block group flags this entry accounts for */
+	u64 total_bytes;	/* summed size of the block groups */
+	u64 bytes_used;		/* summed used bytes of the block groups */
+	u64 bytes_pinned;	/* summed pinned bytes of the block groups */
+	int full;		/* set once chunk allocation returned -ENOSPC */
+	struct list_head list;	/* entry in fs_info->space_info */
+};
+
 struct btrfs_block_group_cache {
 	struct btrfs_key key;
 	struct btrfs_block_group_item item;
+	struct btrfs_space_info *space_info;
 	u64 pinned;
 	u64 flags;
 	int cached;
@@ -490,7 +500,7 @@ struct btrfs_fs_info {
 	struct list_head dirty_cowonly_roots;
 
 	struct list_head devices;
-	struct list_head *last_device;
+	struct list_head space_info;
 	spinlock_t delalloc_lock;
 	spinlock_t new_trans_lock;
 	u64 delalloc_bytes;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 8e37fa120cc8..2a239ae49f78 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -700,8 +700,8 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 	fs_info->dev_root = dev_root;
 	INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
 	INIT_LIST_HEAD(&fs_info->devices);
+	INIT_LIST_HEAD(&fs_info->space_info);
 	btrfs_mapping_init(&fs_info->mapping_tree);
-	fs_info->last_device = &fs_info->devices;
 	fs_info->sb = sb;
 	fs_info->throttles = 0;
 	fs_info->mount_opt = 0;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 2cd957d6e8d8..15082b1087be 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -36,6 +36,10 @@ static int finish_current_insert(struct btrfs_trans_handle *trans, struct
 				 btrfs_root *extent_root);
 static int del_pending_extents(struct btrfs_trans_handle *trans, struct
 			       btrfs_root *extent_root);
+int btrfs_make_block_group(struct btrfs_trans_handle *trans,
+			   struct btrfs_root *root, u64 bytes_used,
+			   u64 type, u64 chunk_tree, u64 chunk_objectid,
+			   u64 size);
 
 
 static int cache_block_group(struct btrfs_root *root,
@@ -168,16 +172,7 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group(struct
 
 static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
 {
-	if ((bits & BLOCK_GROUP_DATA) &&
-	    (cache->flags & BTRFS_BLOCK_GROUP_DATA))
-		return 1;
-	if ((bits & BLOCK_GROUP_METADATA) &&
-	     (cache->flags & BTRFS_BLOCK_GROUP_METADATA))
-		return 1;
-	if ((bits & BLOCK_GROUP_SYSTEM) &&
-	     (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM))
-		return 1;
-	return 0;
+	return (cache->flags & bits);
 }
 
 static int noinline find_search_start(struct btrfs_root *root,
@@ -276,6 +271,18 @@ static u64 div_factor(u64 num, int factor)
 	return num;
 }
 
+/* translate BTRFS_BLOCK_GROUP_* type flags into BLOCK_GROUP_* state bits */
+static int block_group_state_bits(u64 flags)
+{
+	int state = 0;
+
+	state |= (flags & BTRFS_BLOCK_GROUP_DATA) ? BLOCK_GROUP_DATA : 0;
+	state |= (flags & BTRFS_BLOCK_GROUP_METADATA) ?
+		BLOCK_GROUP_METADATA : 0;
+	state |= (flags & BTRFS_BLOCK_GROUP_SYSTEM) ? BLOCK_GROUP_SYSTEM : 0;
+	return state;
+}
+
 struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
 						 struct btrfs_block_group_cache
 						 *hint, u64 search_start,
@@ -304,7 +311,7 @@ struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
 	if (!owner)
 		factor = 8;
 
-	bit = data;
+	bit = block_group_state_bits(data);
 
 	if (search_start && search_start < total_fs_bytes) {
 		struct btrfs_block_group_cache *shint;
@@ -358,10 +365,15 @@ again:
 			free_check = cache->key.offset;
 		else
 			free_check = div_factor(cache->key.offset, factor);
+
 		if (used + cache->pinned < free_check) {
 			found_group = cache;
 			goto found;
 		}
+		if (full_search) {
+			printk("failed on cache %Lu used %Lu total %Lu\n",
+			       cache->key.objectid, used, cache->key.offset);
+		}
 		cond_resched();
 	}
 	if (!full_search) {
@@ -983,6 +995,58 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 	return werr;
 }
 
+/* return the space_info whose flags match exactly, or NULL if there is none */
+static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
+						  u64 flags)
+{
+	struct btrfs_space_info *sinfo;
+	struct list_head *pos;
+
+	list_for_each(pos, &info->space_info) {
+		sinfo = list_entry(pos, struct btrfs_space_info, list);
+		if (sinfo->flags == flags)
+			return sinfo;
+	}
+	return NULL;
+}
+
+/* add a chunk and block group once used + pinned + alloc_bytes hits thresh */
+static int do_chunk_alloc(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *extent_root, u64 alloc_bytes,
+			  u64 flags)
+{
+	struct btrfs_fs_info *info = extent_root->fs_info;
+	struct btrfs_space_info *sinfo = __find_space_info(info, flags);
+	u64 wanted;
+	u64 chunk_start;
+	u64 chunk_bytes;
+	int ret;
+
+	BUG_ON(!sinfo);
+	if (sinfo->full)
+		return 0;
+
+	/* nothing to do while this allocation stays below the threshold */
+	wanted = sinfo->bytes_used + sinfo->bytes_pinned + alloc_bytes;
+	if (wanted < div_factor(sinfo->total_bytes, 7))
+		return 0;
+
+	ret = btrfs_alloc_chunk(trans, extent_root, &chunk_start,
+				&chunk_bytes, flags);
+	if (ret == -ENOSPC) {
+		printk("space info full %Lu\n", flags);
+		sinfo->full = 1;
+		return 0;
+	}
+	BUG_ON(ret);
+
+	ret = btrfs_make_block_group(trans, extent_root, 0, flags,
+				     info->chunk_root->root_key.objectid,
+				     chunk_start, chunk_bytes);
+	BUG_ON(ret);
+	return 0;
+}
+
 static int update_block_group(struct btrfs_trans_handle *trans,
 			      struct btrfs_root *root,
 			      u64 bytenr, u64 num_bytes, int alloc,
@@ -1012,8 +1076,10 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 		num_bytes = min(total, cache->key.offset - byte_in_group);
 		if (alloc) {
 			old_val += num_bytes;
+			cache->space_info->bytes_used += num_bytes;
 		} else {
 			old_val -= num_bytes;
+			cache->space_info->bytes_used -= num_bytes;
 			if (mark_free) {
 				set_extent_dirty(&info->free_space_cache,
 						 bytenr, bytenr + num_bytes - 1,
@@ -1026,6 +1092,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 	}
 	return 0;
 }
+
 static int update_pinned_extents(struct btrfs_root *root,
 				u64 bytenr, u64 num, int pin)
 {
@@ -1047,9 +1114,11 @@ static int update_pinned_extents(struct btrfs_root *root,
 			  (bytenr - cache->key.objectid));
 		if (pin) {
 			cache->pinned += len;
+			cache->space_info->bytes_pinned += len;
 			fs_info->total_pinned += len;
 		} else {
 			cache->pinned -= len;
+			cache->space_info->bytes_pinned -= len;
 			fs_info->total_pinned -= len;
 		}
 		bytenr += len;
@@ -1472,7 +1541,7 @@ check_failed:
 		goto new_group;
 	}
 
-	if (!(data & BLOCK_GROUP_DATA)) {
+	if (!(data & BTRFS_BLOCK_GROUP_DATA)) {
 		block_group = btrfs_lookup_block_group(info, ins->objectid);
 		if (block_group)
 			trans->block_group = block_group;
@@ -1532,12 +1601,25 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 	struct btrfs_path *path;
 	struct btrfs_key keys[2];
 
-	if (data)
-		data = BLOCK_GROUP_DATA;
-	else if (root == root->fs_info->chunk_root)
-		data = BLOCK_GROUP_SYSTEM;
-	else
-		data = BLOCK_GROUP_METADATA;
+	if (data) {
+		data = BTRFS_BLOCK_GROUP_DATA;
+	} else if (root == root->fs_info->chunk_root) {
+		data = BTRFS_BLOCK_GROUP_SYSTEM;
+	} else {
+		data = BTRFS_BLOCK_GROUP_METADATA;
+	}
+
+	if (root->ref_cows) {
+		if (data != BTRFS_BLOCK_GROUP_METADATA) {
+			ret = do_chunk_alloc(trans, root->fs_info->extent_root,
+					     num_bytes,
+					     BTRFS_BLOCK_GROUP_METADATA);
+			BUG_ON(ret);
+		}
+		ret = do_chunk_alloc(trans, root->fs_info->extent_root,
+				     num_bytes, data);
+		BUG_ON(ret);
+	}
 
 	new_hint = max(hint_byte, root->fs_info->alloc_start);
 	if (new_hint < btrfs_super_total_bytes(&info->super_copy))
@@ -2490,6 +2572,34 @@ error:
 	return ret;
 }
 
+/* add the byte counts to the space_info for flags, creating it if needed */
+static int update_space_info(struct btrfs_fs_info *info, u64 flags,
+			     u64 total_bytes, u64 bytes_used,
+			     struct btrfs_space_info **space_info)
+{
+	struct btrfs_space_info *sinfo = __find_space_info(info, flags);
+
+	if (!sinfo) {
+		sinfo = kmalloc(sizeof(*sinfo), GFP_NOFS);
+		if (!sinfo)
+			return -ENOMEM;
+
+		list_add(&sinfo->list, &info->space_info);
+		sinfo->flags = flags;
+		sinfo->total_bytes = total_bytes;
+		sinfo->bytes_used = bytes_used;
+		sinfo->bytes_pinned = 0;
+		sinfo->full = 0;
+	} else {
+		sinfo->total_bytes += total_bytes;
+		sinfo->bytes_used += bytes_used;
+		WARN_ON(sinfo->total_bytes < sinfo->bytes_used);
+	}
+
+	*space_info = sinfo;
+	return 0;
+}
+
 int btrfs_read_block_groups(struct btrfs_root *root)
 {
 	struct btrfs_path *path;
@@ -2497,6 +2607,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 	int bit;
 	struct btrfs_block_group_cache *cache;
 	struct btrfs_fs_info *info = root->fs_info;
+	struct btrfs_space_info *space_info;
 	struct extent_io_tree *block_group_cache;
 	struct btrfs_key key;
 	struct btrfs_key found_key;
@@ -2547,6 +2658,12 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 			bit = BLOCK_GROUP_METADATA;
 		}
 
+		ret = update_space_info(info, cache->flags, found_key.offset,
+					btrfs_block_group_used(&cache->item),
+					&space_info);
+		BUG_ON(ret);
+		cache->space_info = space_info;
+
 		/* use EXTENT_LOCKED to prevent merging */
 		set_extent_bits(block_group_cache, found_key.objectid,
 				found_key.objectid + found_key.offset - 1,
@@ -2563,3 +2680,58 @@ error:
 	btrfs_free_path(path);
 	return ret;
 }
+/* build a block group cache entry for a new chunk and insert its item */
+int btrfs_make_block_group(struct btrfs_trans_handle *trans,
+			   struct btrfs_root *root, u64 bytes_used,
+			   u64 type, u64 chunk_tree, u64 chunk_objectid,
+			   u64 size)
+{
+	int ret;
+	int bit = 0;
+	struct btrfs_root *extent_root;
+	struct btrfs_block_group_cache *cache;
+	struct extent_io_tree *block_group_cache;
+
+	extent_root = root->fs_info->extent_root;
+	block_group_cache = &root->fs_info->block_group_cache;
+	/* every failure that is checked below is a BUG_ON; 0 is the only return */
+	cache = kmalloc(sizeof(*cache), GFP_NOFS);
+	BUG_ON(!cache);
+	cache->key.objectid = chunk_objectid;	/* key is (chunk_objectid, size) */
+	cache->key.offset = size;
+	cache->cached = 0;
+	cache->pinned = 0;
+	btrfs_set_key_type(&cache->key, BTRFS_BLOCK_GROUP_ITEM_KEY);
+	memset(&cache->item, 0, sizeof(cache->item));
+	btrfs_set_block_group_used(&cache->item, bytes_used);
+	btrfs_set_block_group_chunk_tree(&cache->item, chunk_tree);
+	btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
+	cache->flags = type;
+	btrfs_set_block_group_flags(&cache->item, type);
+	/* add this group's size and usage to the space_info matching its flags */
+	ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
+				&cache->space_info);
+	BUG_ON(ret);
+	/* pick the extent bit for this type; bit stays 0 if no type flag matches */
+	if (type & BTRFS_BLOCK_GROUP_DATA) {
+		bit = BLOCK_GROUP_DATA;
+	} else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
+		bit = BLOCK_GROUP_SYSTEM;
+	} else if (type & BTRFS_BLOCK_GROUP_METADATA) {
+		bit = BLOCK_GROUP_METADATA;
+	}
+	set_extent_bits(block_group_cache, chunk_objectid,
+			chunk_objectid + size - 1,
+			bit | EXTENT_LOCKED, GFP_NOFS);
+	set_state_private(block_group_cache, chunk_objectid,
+			  (unsigned long)cache);	/* stash the cache pointer */
+	/* write the block group item into the extent root under cache->key */
+	ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item,
+				sizeof(cache->item));
+	BUG_ON(ret);
+	/* NOTE(review): any result of finish_current_insert is discarded here */
+	finish_current_insert(trans, extent_root);
+	ret = del_pending_extents(trans, extent_root);
+	BUG_ON(ret);
+	return 0;
+}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 17063cd2cb73..109576b57f69 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1527,6 +1527,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 {
 	struct inode *inode;
 	struct btrfs_inode_item *inode_item;
+	struct btrfs_block_group_cache *new_inode_group;
 	struct btrfs_key *location;
 	struct btrfs_path *path;
 	struct btrfs_inode_ref *ref;
@@ -1553,9 +1554,13 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 		owner = 0;
 	else
 		owner = 1;
-	group = btrfs_find_block_group(root, group, 0,
+	new_inode_group = btrfs_find_block_group(root, group, 0,
 				       BTRFS_BLOCK_GROUP_METADATA, owner);
-	BTRFS_I(inode)->block_group = group;
+	if (!new_inode_group) {
+		printk("find_block group failed\n");
+		new_inode_group = group;
+	}
+	BTRFS_I(inode)->block_group = new_inode_group;
 	BTRFS_I(inode)->flags = 0;
 
 	key[0].objectid = objectid;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 90a8d45dc6d7..a52a13f365d6 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -131,7 +131,7 @@ check_pending:
 	btrfs_release_path(root, path);
 	BUG_ON(*start < search_start);
 
-	if (*start + num_bytes >= search_end) {
+	if (*start + num_bytes > search_end) {
 		ret = -ENOSPC;
 		goto error;
 	}
@@ -159,8 +159,9 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
 		return -ENOMEM;
 
 	ret = find_free_dev_extent(trans, device, path, num_bytes, start);
-	if (ret)
+	if (ret) {
 		goto err;
+	}
 
 	key.objectid = device->devid;
 	key.offset = *start;
@@ -214,22 +215,6 @@ error:
 	return ret;
 }
 
-static struct btrfs_device *next_device(struct list_head *head,
-					struct list_head *last)
-{
-	struct list_head *next = last->next;
-	struct btrfs_device *dev;
-
-	if (list_empty(head))
-		return NULL;
-
-	if (next == head)
-		next = next->next;
-
-	dev = list_entry(next, struct btrfs_device, dev_list);
-	return dev;
-}
-
 static int find_next_devid(struct btrfs_root *root, struct btrfs_path *path,
 			   u64 *objectid)
 {
@@ -397,31 +382,63 @@ int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
 
 int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 		      struct btrfs_root *extent_root, u64 *start,
-		      u64 *num_bytes, u32 type)
+		      u64 *num_bytes, u64 type)
 {
 	u64 dev_offset;
 	struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
 	struct btrfs_stripe *stripes;
 	struct btrfs_device *device = NULL;
 	struct btrfs_chunk *chunk;
+	struct list_head private_devs;
 	struct list_head *dev_list = &extent_root->fs_info->devices;
-	struct list_head *last_dev = extent_root->fs_info->last_device;
+	struct list_head *cur;
 	struct extent_map_tree *em_tree;
 	struct map_lookup *map;
 	struct extent_map *em;
 	u64 physical;
 	u64 calc_size = 1024 * 1024 * 1024;
-	int num_stripes;
+	u64 avail;
+	u64 max_avail = 0;
+	int num_stripes = 1;
+	int looped = 0;
 	int ret;
-	int index = 0;
+	int index;
 	struct btrfs_key key;
 
+	if (list_empty(dev_list))
+		return -ENOSPC;
+again:
+	INIT_LIST_HEAD(&private_devs);
+	cur = dev_list->next;
+	index = 0;
+	/* build a private list of devices we will allocate from */
+	while(index < num_stripes) {
+		device = list_entry(cur, struct btrfs_device, dev_list);
+		avail = device->total_bytes - device->bytes_used;
+		cur = cur->next;
+		if (avail > max_avail)
+			max_avail = avail;
+		if (avail >= calc_size) {
+			list_move_tail(&device->dev_list, &private_devs);
+			index++;
+		}
+		if (cur == dev_list)
+			break;
+	}
+	if (index < num_stripes) {
+		list_splice(&private_devs, dev_list);
+		if (!looped && max_avail > 0) {
+			looped = 1;
+			calc_size = max_avail;
+			goto again;
+		}
+		return -ENOSPC;
+	}
 
 	ret = find_next_chunk(chunk_root, &key.objectid);
 	if (ret)
 		return ret;
 
-	num_stripes = 1;
 	chunk = kmalloc(btrfs_chunk_item_size(num_stripes), GFP_NOFS);
 	if (!chunk)
 		return -ENOMEM;
@@ -429,11 +446,12 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	stripes = &chunk->stripe;
 
 	*num_bytes = calc_size;
+	index = 0;
 	while(index < num_stripes) {
-		device = next_device(dev_list, last_dev);
-		BUG_ON(!device);
-		last_dev = &device->dev_list;
-		extent_root->fs_info->last_device = last_dev;
+		BUG_ON(list_empty(&private_devs));
+		cur = private_devs.next;
+		device = list_entry(cur, struct btrfs_device, dev_list);
+		list_move_tail(&device->dev_list, dev_list);
 
 		ret = btrfs_alloc_dev_extent(trans, device,
 					     key.objectid,
@@ -449,6 +467,7 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 		physical = dev_offset;
 		index++;
 	}
+	BUG_ON(!list_empty(&private_devs));
 
 	/* key.objectid was set above */
 	key.offset = *num_bytes;
@@ -692,17 +711,17 @@ static int read_one_dev(struct btrfs_root *root, struct btrfs_key *key,
 	int ret;
 
 	devid = btrfs_device_id(leaf, dev_item);
-	if (btrfs_find_device(root, devid))
-		return 0;
-
-	device = kmalloc(sizeof(*device), GFP_NOFS);
-	if (!device)
-		return -ENOMEM;
+	device = btrfs_find_device(root, devid);
+	if (!device) {
+		device = kmalloc(sizeof(*device), GFP_NOFS);
+		if (!device)
+			return -ENOMEM;
+		list_add(&device->dev_list, &root->fs_info->devices);
+	}
 
 	fill_device_from_item(leaf, dev_item, device);
 	device->dev_root = root->fs_info->dev_root;
 	device->bdev = root->fs_info->sb->s_bdev;
-	list_add(&device->dev_list, &root->fs_info->devices);
 	memcpy(&device->dev_key, key, sizeof(*key));
 	ret = 0;
 #if 0
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 4a47dcb00a73..4ab9891878a8 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -71,7 +71,7 @@ int btrfs_read_sys_array(struct btrfs_root *root);
 int btrfs_read_chunk_tree(struct btrfs_root *root);
 int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 		      struct btrfs_root *extent_root, u64 *start,
-		      u64 *num_bytes, u32 type);
+		      u64 *num_bytes, u64 type);
 void btrfs_mapping_init(struct btrfs_mapping_tree *tree);
 void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree);
 int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio);
-- 
cgit v1.2.3


From 0d81ba5dbedef0c3970d6aa318aa84920943e6e3 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 24 Mar 2008 15:02:07 -0400
Subject: Btrfs: Move device information into the super block so it can be
 scanned

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h      | 21 ++----------------
 fs/btrfs/disk-io.c    |  4 +++-
 fs/btrfs/print-tree.c | 15 ++-----------
 fs/btrfs/volumes.c    | 61 +++++++++++++--------------------------------------
 fs/btrfs/volumes.h    | 13 +----------
 5 files changed, 23 insertions(+), 91 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 285fb7e46106..96a493217860 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -150,21 +150,11 @@ struct btrfs_dev_item {
 	/* minimal io size for this device */
 	__le32 sector_size;
 
-	/* the kernel device number */
-	__le64 rdev;
-
 	/* type and info about this device */
 	__le64 type;
 
-	/* partition number, 0 for whole dev */
-	__le32 partition;
-
-	/* length of the name data at the end of the item */
-	__le16 name_len;
-
-	/* physical drive uuid (or lvm uuid) */
+	/* btrfs generated uuid for this device */
 	u8 uuid[BTRFS_DEV_UUID_SIZE];
-	/* name goes here */
 } __attribute__ ((__packed__));
 
 struct btrfs_stripe {
@@ -255,6 +245,7 @@ struct btrfs_super_block {
 	__le32 sys_chunk_array_size;
 	u8 root_level;
 	u8 chunk_root_level;
+	struct btrfs_dev_item dev_item;
 	u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
 } __attribute__ ((__packed__));
 
@@ -685,20 +676,12 @@ BTRFS_SETGET_FUNCS(device_io_align, struct btrfs_dev_item, io_align, 32);
 BTRFS_SETGET_FUNCS(device_io_width, struct btrfs_dev_item, io_width, 32);
 BTRFS_SETGET_FUNCS(device_sector_size, struct btrfs_dev_item, sector_size, 32);
 BTRFS_SETGET_FUNCS(device_id, struct btrfs_dev_item, devid, 64);
-BTRFS_SETGET_FUNCS(device_rdev, struct btrfs_dev_item, rdev, 64);
-BTRFS_SETGET_FUNCS(device_partition, struct btrfs_dev_item, partition, 32);
-BTRFS_SETGET_FUNCS(device_name_len, struct btrfs_dev_item, name_len, 16);
 
 static inline char *btrfs_device_uuid(struct btrfs_dev_item *d)
 {
 	return (char *)d + offsetof(struct btrfs_dev_item, uuid);
 }
 
-static inline char *btrfs_device_name(struct btrfs_dev_item *d)
-{
-	return (char *)(d + 1);
-}
-
 BTRFS_SETGET_FUNCS(chunk_owner, struct btrfs_chunk, owner, 64);
 BTRFS_SETGET_FUNCS(chunk_stripe_len, struct btrfs_chunk, stripe_len, 64);
 BTRFS_SETGET_FUNCS(chunk_io_align, struct btrfs_chunk, io_align, 32);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 2a239ae49f78..26185d46712c 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -370,7 +370,6 @@ static int close_all_devices(struct btrfs_fs_info *fs_info)
 		next = list->next;
 		list_del(next);
 		device = list_entry(next, struct btrfs_device, dev_list);
-		kfree(device->name);
 		kfree(device);
 	}
 	return 0;
@@ -800,6 +799,9 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 	}
 
 	mutex_lock(&fs_info->fs_mutex);
+	ret = btrfs_read_super_device(tree_root, fs_info->sb_buffer);
+	BUG_ON(ret);
+
 	ret = btrfs_read_sys_array(tree_root);
 	BUG_ON(ret);
 
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 9c1335dad40c..ee0de112cf5a 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -37,22 +37,11 @@ static void print_chunk(struct extent_buffer *eb, struct btrfs_chunk *chunk)
 static void print_dev_item(struct extent_buffer *eb,
 			   struct btrfs_dev_item *dev_item)
 {
-	char *name;
-	int name_len;
-
-	name_len = btrfs_device_name_len(eb, dev_item);
-	name = kmalloc(name_len, GFP_NOFS);
-	if (name) {
-		read_extent_buffer(eb, name,
-				   (unsigned long)btrfs_device_name(dev_item),
-				   name_len);
-	}
-	printk("\t\tdev item name %.*s devid %llu "
-	       "total_bytes %llu bytes used %Lu\n", name_len, name,
+	printk("\t\tdev item devid %llu "
+	       "total_bytes %llu bytes used %Lu\n",
 	       (unsigned long long)btrfs_device_id(eb, dev_item),
 	       (unsigned long long)btrfs_device_total_bytes(eb, dev_item),
 	       (unsigned long long)btrfs_device_bytes_used(eb, dev_item));
-	kfree(name);
 }
 void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
 {
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index a52a13f365d6..ae22d01ecf54 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -278,7 +278,7 @@ int btrfs_add_device(struct btrfs_trans_handle *trans,
 	key.offset = free_devid;
 
 	ret = btrfs_insert_empty_item(trans, root, path, &key,
-				      sizeof(*dev_item) + device->name_len);
+				      sizeof(*dev_item));
 	if (ret)
 		goto out;
 
@@ -290,15 +290,9 @@ int btrfs_add_device(struct btrfs_trans_handle *trans,
 	btrfs_set_device_io_align(leaf, dev_item, device->io_align);
 	btrfs_set_device_io_width(leaf, dev_item, device->io_width);
 	btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
-	btrfs_set_device_rdev(leaf, dev_item, device->rdev);
-	btrfs_set_device_partition(leaf, dev_item, device->partition);
-	btrfs_set_device_name_len(leaf, dev_item, device->name_len);
 	btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes);
 	btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
 
-	ptr = (unsigned long)btrfs_device_name(dev_item);
-	write_extent_buffer(leaf, device->name, ptr, device->name_len);
-
 	ptr = (unsigned long)btrfs_device_uuid(dev_item);
 	write_extent_buffer(leaf, device->uuid, ptr, BTRFS_DEV_UUID_SIZE);
 	btrfs_mark_buffer_dirty(leaf);
@@ -345,8 +339,6 @@ int btrfs_update_device(struct btrfs_trans_handle *trans,
 	btrfs_set_device_io_align(leaf, dev_item, device->io_align);
 	btrfs_set_device_io_width(leaf, dev_item, device->io_width);
 	btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
-	btrfs_set_device_rdev(leaf, dev_item, device->rdev);
-	btrfs_set_device_partition(leaf, dev_item, device->partition);
 	btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes);
 	btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
 	btrfs_mark_buffer_dirty(leaf);
@@ -676,7 +668,6 @@ static int fill_device_from_item(struct extent_buffer *leaf,
 				 struct btrfs_device *device)
 {
 	unsigned long ptr;
-	char *name;
 
 	device->devid = btrfs_device_id(leaf, dev_item);
 	device->total_bytes = btrfs_device_total_bytes(leaf, dev_item);
@@ -685,24 +676,14 @@ static int fill_device_from_item(struct extent_buffer *leaf,
 	device->io_align = btrfs_device_io_align(leaf, dev_item);
 	device->io_width = btrfs_device_io_width(leaf, dev_item);
 	device->sector_size = btrfs_device_sector_size(leaf, dev_item);
-	device->rdev = btrfs_device_rdev(leaf, dev_item);
-	device->partition = btrfs_device_partition(leaf, dev_item);
-	device->name_len = btrfs_device_name_len(leaf, dev_item);
 
 	ptr = (unsigned long)btrfs_device_uuid(dev_item);
 	read_extent_buffer(leaf, device->uuid, ptr, BTRFS_DEV_UUID_SIZE);
 
-	name = kmalloc(device->name_len + 1, GFP_NOFS);
-	if (!name)
-		return -ENOMEM;
-	device->name = name;
-	ptr = (unsigned long)btrfs_device_name(dev_item);
-	read_extent_buffer(leaf, name, ptr, device->name_len);
-	name[device->name_len] = '\0';
 	return 0;
 }
 
-static int read_one_dev(struct btrfs_root *root, struct btrfs_key *key,
+static int read_one_dev(struct btrfs_root *root,
 			struct extent_buffer *leaf,
 			struct btrfs_dev_item *dev_item)
 {
@@ -722,7 +703,6 @@ static int read_one_dev(struct btrfs_root *root, struct btrfs_key *key,
 	fill_device_from_item(leaf, dev_item, device);
 	device->dev_root = root->fs_info->dev_root;
 	device->bdev = root->fs_info->sb->s_bdev;
-	memcpy(&device->dev_key, key, sizeof(*key));
 	ret = 0;
 #if 0
 	ret = btrfs_open_device(device);
@@ -733,12 +713,20 @@ static int read_one_dev(struct btrfs_root *root, struct btrfs_key *key,
 	return ret;
 }
 
+int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf)
+{
+	struct btrfs_dev_item *dev_item;
+	/* not a real address: the dev_item's byte offset in the super block */
+	dev_item = (struct btrfs_dev_item *)offsetof(struct btrfs_super_block,
+						     dev_item);
+	return read_one_dev(root, buf, dev_item);	/* open_ctree passes fs_info->sb_buffer as buf */
+}
+
 int btrfs_read_sys_array(struct btrfs_root *root)
 {
 	struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
 	struct extent_buffer *sb = root->fs_info->sb_buffer;
 	struct btrfs_disk_key *disk_key;
-	struct btrfs_dev_item *dev_item;
 	struct btrfs_chunk *chunk;
 	struct btrfs_key key;
 	u32 num_stripes;
@@ -748,7 +736,6 @@ int btrfs_read_sys_array(struct btrfs_root *root)
 	unsigned long sb_ptr;
 	u32 cur;
 	int ret;
-	int dev_only = 1;
 
 	array_size = btrfs_super_sys_array_size(super_copy);
 
@@ -757,7 +744,6 @@ int btrfs_read_sys_array(struct btrfs_root *root)
 	 * once for all of the chunks.  This way there are device
 	 * structs filled in for every chunk
 	 */
-again:
 	ptr = super_copy->sys_chunk_array;
 	sb_ptr = offsetof(struct btrfs_super_block, sys_chunk_array);
 	cur = 0;
@@ -771,22 +757,10 @@ again:
 		sb_ptr += len;
 		cur += len;
 
-		if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID &&
-		    key.type == BTRFS_DEV_ITEM_KEY) {
-			dev_item = (struct btrfs_dev_item *)sb_ptr;
-			if (dev_only) {
-				ret = read_one_dev(root, &key, sb, dev_item);
-				BUG_ON(ret);
-			}
-			len = sizeof(*dev_item);
-			len += btrfs_device_name_len(sb, dev_item);
-		} else if (key.type == BTRFS_CHUNK_ITEM_KEY) {
-
+		if (key.type == BTRFS_CHUNK_ITEM_KEY) {
 			chunk = (struct btrfs_chunk *)sb_ptr;
-			if (!dev_only) {
-				ret = read_one_chunk(root, &key, sb, chunk);
-				BUG_ON(ret);
-			}
+			ret = read_one_chunk(root, &key, sb, chunk);
+			BUG_ON(ret);
 			num_stripes = btrfs_chunk_num_stripes(sb, chunk);
 			len = btrfs_chunk_item_size(num_stripes);
 		} else {
@@ -796,10 +770,6 @@ again:
 		sb_ptr += len;
 		cur += len;
 	}
-	if (dev_only == 1) {
-		dev_only = 0;
-		goto again;
-	}
 	return 0;
 }
 
@@ -846,8 +816,7 @@ again:
 				struct btrfs_dev_item *dev_item;
 				dev_item = btrfs_item_ptr(leaf, slot,
 						  struct btrfs_dev_item);
-				ret = read_one_dev(root, &found_key, leaf,
-						   dev_item);
+				ret = read_one_dev(root, leaf, dev_item);
 				BUG_ON(ret);
 			}
 		} else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 4ab9891878a8..77fa6efd79cf 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -21,7 +21,6 @@
 struct btrfs_device {
 	struct list_head dev_list;
 	struct btrfs_root *dev_root;
-	struct btrfs_key dev_key;
 
 	struct block_device *bdev;
 
@@ -43,22 +42,11 @@ struct btrfs_device {
 	/* minimal io size for this device */
 	u32 sector_size;
 
-	/* the kernel device number */
-	u64 rdev;
-
 	/* type and info about this device */
 	u64 type;
 
-	/* partition number, 0 for whole dev */
-	int partition;
-
-	/* length of the name data at the end of the item */
-	int name_len;
-
 	/* physical drive uuid (or lvm uuid) */
 	u8 uuid[BTRFS_DEV_UUID_SIZE];
-
-	char *name;
 };
 
 int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
@@ -75,4 +63,5 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 void btrfs_mapping_init(struct btrfs_mapping_tree *tree);
 void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree);
 int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio);
+int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf);
 #endif
-- 
cgit v1.2.3


From 239b14b32dc39232ebf9cce29ff77c4c564355fd Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 24 Mar 2008 15:02:07 -0400
Subject: Btrfs: Bring back mount -o ssd optimizations

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       |  3 +++
 fs/btrfs/disk-io.c     |  2 ++
 fs/btrfs/extent-tree.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++++--
 fs/btrfs/extent_io.c   |  2 ++
 fs/btrfs/extent_io.h   |  2 ++
 fs/btrfs/inode.c       | 29 +++++++++++++++++++++++++
 fs/btrfs/volumes.c     |  5 +++++
 fs/btrfs/volumes.h     |  3 +++
 8 files changed, 103 insertions(+), 2 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 96a493217860..acf22ad6115c 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1405,6 +1405,9 @@ int btrfs_csum_truncate(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root, struct btrfs_path *path,
 			u64 isize);
 /* inode.c */
+int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
+			 size_t size, struct bio *bio);
+
 static inline void dec_i_blocks(struct inode *inode, u64 dec)
 {
 	dec = dec >> 9;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 26185d46712c..4890151cd68d 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1103,4 +1103,6 @@ int btrfs_read_buffer(struct extent_buffer *buf)
 static struct extent_io_ops btree_extent_io_ops = {
 	.writepage_io_hook = btree_writepage_io_hook,
 	.submit_bio_hook = btree_submit_bio_hook,
+	/* note we're sharing with inode.c for the merge bio hook */
+	.merge_bio_hook = btrfs_merge_bio_hook,
 };
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 14eb8fc87015..e9ef644ff56f 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1473,13 +1473,31 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans,
 	struct btrfs_root * root = orig_root->fs_info->extent_root;
 	struct btrfs_fs_info *info = root->fs_info;
 	u64 total_needed = num_bytes;
+	u64 *last_ptr = NULL;
 	struct btrfs_block_group_cache *block_group;
 	int full_scan = 0;
 	int wrapped = 0;
+	int empty_cluster = 2 * 1024 * 1024;
 
 	WARN_ON(num_bytes < root->sectorsize);
 	btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
 
+	if (data & BTRFS_BLOCK_GROUP_METADATA) {
+		last_ptr = &root->fs_info->last_alloc;
+	}
+
+	if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) {
+		last_ptr = &root->fs_info->last_data_alloc;
+	}
+
+	if (last_ptr) {
+		if (*last_ptr)
+			hint_byte = *last_ptr;
+		else {
+			empty_size += empty_cluster;
+		}
+	}
+
 	if (search_end == (u64)-1)
 		search_end = btrfs_super_total_bytes(&info->super_copy);
 
@@ -1489,11 +1507,14 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans,
 			hint_byte = search_start;
 		block_group = btrfs_find_block_group(root, block_group,
 						     hint_byte, data, 1);
+		if (last_ptr && *last_ptr == 0 && block_group)
+			hint_byte = block_group->key.objectid;
 	} else {
 		block_group = btrfs_find_block_group(root,
 						     trans->block_group,
 						     search_start, data, 1);
 	}
+	search_start = max(search_start, hint_byte);
 
 	total_needed += empty_size;
 
@@ -1506,9 +1527,36 @@ check_failed:
 	}
 	ret = find_search_start(root, &block_group, &search_start,
 				total_needed, data);
+	if (ret == -ENOSPC && last_ptr && *last_ptr) {
+		*last_ptr = 0;
+		block_group = btrfs_lookup_block_group(info,
+						       orig_search_start);
+		search_start = orig_search_start;
+		ret = find_search_start(root, &block_group, &search_start,
+					total_needed, data);
+	}
+	if (ret == -ENOSPC)
+		goto enospc;
 	if (ret)
 		goto error;
 
+	if (last_ptr && *last_ptr && search_start != *last_ptr) {
+		*last_ptr = 0;
+		if (!empty_size) {
+			empty_size += empty_cluster;
+			total_needed += empty_size;
+		}
+		block_group = btrfs_lookup_block_group(info,
+						       orig_search_start);
+		search_start = orig_search_start;
+		ret = find_search_start(root, &block_group,
+					&search_start, total_needed, data);
+		if (ret == -ENOSPC)
+			goto enospc;
+		if (ret)
+			goto error;
+	}
+
 	search_start = stripe_align(root, search_start);
 	ins->objectid = search_start;
 	ins->offset = num_bytes;
@@ -1547,6 +1595,13 @@ check_failed:
 			trans->block_group = block_group;
 	}
 	ins->offset = num_bytes;
+	if (last_ptr) {
+		*last_ptr = ins->objectid + ins->offset;
+		if (*last_ptr ==
+		    btrfs_super_total_bytes(&root->fs_info->super_copy)) {
+			*last_ptr = 0;
+		}
+	}
 	return 0;
 
 new_group:
@@ -1612,12 +1667,12 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 	if (root->ref_cows) {
 		if (data != BTRFS_BLOCK_GROUP_METADATA) {
 			ret = do_chunk_alloc(trans, root->fs_info->extent_root,
-					     num_bytes,
+					     2 * 1024 * 1024,
 					     BTRFS_BLOCK_GROUP_METADATA);
 			BUG_ON(ret);
 		}
 		ret = do_chunk_alloc(trans, root->fs_info->extent_root,
-				     num_bytes, data);
+				     num_bytes + 2 * 1024 * 1024, data);
 		BUG_ON(ret);
 	}
 
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 7e3a1ebde9fc..6dab664529c1 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1730,6 +1730,8 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
 	if (bio_ret && *bio_ret) {
 		bio = *bio_ret;
 		if (bio->bi_sector + (bio->bi_size >> 9) != sector ||
+		    (tree->ops && tree->ops->merge_bio_hook &&
+		     tree->ops->merge_bio_hook(page, offset, size, bio)) ||
 		    bio_add_page(bio, page, size, offset) < size) {
 			ret = submit_one_bio(rw, bio);
 			bio = NULL;
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 0dca89328f98..8b5319db2516 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -29,6 +29,8 @@ struct extent_io_ops {
 	int (*fill_delalloc)(struct inode *inode, u64 start, u64 end);
 	int (*writepage_io_hook)(struct page *page, u64 start, u64 end);
 	int (*submit_bio_hook)(struct inode *inode, int rw, struct bio *bio);
+	int (*merge_bio_hook)(struct page *page, unsigned long offset,
+			      size_t size, struct bio *bio);
 	int (*readpage_io_hook)(struct page *page, u64 start, u64 end);
 	int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end,
 				    struct extent_state *state);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 109576b57f69..5140d6801846 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -296,6 +296,34 @@ int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
 	return 0;
 }
 
+int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
+			 size_t size, struct bio *bio)
+{
+	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
+	struct btrfs_mapping_tree *map_tree;
+	struct btrfs_device *dev;
+	u64 logical = bio->bi_sector << 9;	/* bio start sector as a byte address */
+	u64 physical;
+	u64 length = 0;
+	u64 map_length;
+	struct bio_vec *bvec;
+	int i;
+	int ret;
+	/* returns 1 when the mapped length is too short for the bio plus size */
+	bio_for_each_segment(bvec, bio, i) {
+		length += bvec->bv_len;	/* bytes already queued in the bio */
+	}
+	map_tree = &root->fs_info->mapping_tree;
+	map_length = length;	/* presumably overwritten by btrfs_map_block -- confirm */
+	ret = btrfs_map_block(map_tree, logical, &physical, &map_length, &dev);	/* NOTE(review): ret is never checked */
+	if (map_length < length + size) {
+		printk("merge bio hook logical %Lu bio len %Lu physical %Lu "
+		       "len %Lu\n", logical, length, physical, map_length);
+		return 1;	/* nonzero: submit_extent_page submits the bio, page not added */
+	}
+	return 0;
+}
+
 int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -3033,6 +3061,7 @@ static struct file_operations btrfs_dir_file_operations = {
 static struct extent_io_ops btrfs_extent_io_ops = {
 	.fill_delalloc = run_delalloc_range,
 	.submit_bio_hook = btrfs_submit_bio_hook,
+	.merge_bio_hook = btrfs_merge_bio_hook,
 	.readpage_io_hook = btrfs_readpage_io_hook,
 	.readpage_end_io_hook = btrfs_readpage_end_io_hook,
 	.set_bit_hook = btrfs_set_bit_hook,
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index ae22d01ecf54..16fb6bbe6e28 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -578,6 +578,11 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio)
 	map_tree = &root->fs_info->mapping_tree;
 	map_length = length;
 	ret = btrfs_map_block(map_tree, logical, &physical, &map_length, &dev);
+	if (map_length < length) {
+		printk("mapping failed logical %Lu bio len %Lu physical %Lu "
+		       "len %Lu\n", logical, length, physical, map_length);
+		BUG();
+	}
 	BUG_ON(map_length < length);
 	bio->bi_sector = physical >> 9;
 	bio->bi_bdev = dev->bdev;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 77fa6efd79cf..20259128152e 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -64,4 +64,7 @@ void btrfs_mapping_init(struct btrfs_mapping_tree *tree);
 void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree);
 int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio);
 int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf);
+int btrfs_map_block(struct btrfs_mapping_tree *map_tree,
+		    u64 logical, u64 *phys, u64 *length,
+		    struct btrfs_device **dev);
 #endif
-- 
cgit v1.2.3


From 8a4b83cc8bd75fca29ac68615896d9e92820e7c2 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 24 Mar 2008 15:02:07 -0400
Subject: Btrfs: Add support for device scanning and detection ioctls

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h   |  21 ++++-
 fs/btrfs/disk-io.c |  24 ++++--
 fs/btrfs/disk-io.h |   4 +-
 fs/btrfs/ioctl.h   |   6 +-
 fs/btrfs/super.c   |  61 ++++++++++----
 fs/btrfs/volumes.c | 236 +++++++++++++++++++++++++++++++++++++++++++++++++----
 fs/btrfs/volumes.h |  25 ++++++
 7 files changed, 333 insertions(+), 44 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index acf22ad6115c..7556f8319c60 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -37,7 +37,7 @@ extern struct kmem_cache *btrfs_transaction_cachep;
 extern struct kmem_cache *btrfs_bit_radix_cachep;
 extern struct kmem_cache *btrfs_path_cachep;
 
-#define BTRFS_MAGIC "_B4RfS_M"
+#define BTRFS_MAGIC "_B5RfS_M"
 
 #define BTRFS_MAX_LEVEL 8
 
@@ -238,6 +238,7 @@ struct btrfs_super_block {
 	__le64 total_bytes;
 	__le64 bytes_used;
 	__le64 root_dir_objectid;
+	__le64 num_devices;
 	__le32 sectorsize;
 	__le32 nodesize;
 	__le32 leafsize;
@@ -440,6 +441,7 @@ struct btrfs_block_group_cache {
 };
 
 struct btrfs_device;
+struct btrfs_fs_devices;
 struct btrfs_fs_info {
 	u8 fsid[BTRFS_FSID_SIZE];
 	struct btrfs_root *extent_root;
@@ -489,7 +491,7 @@ struct btrfs_fs_info {
 	u64 total_pinned;
 	struct list_head dirty_cowonly_roots;
 
-	struct list_head devices;
+	struct btrfs_fs_devices *fs_devices;
 	struct list_head space_info;
 	spinlock_t delalloc_lock;
 	spinlock_t new_trans_lock;
@@ -677,6 +679,19 @@ BTRFS_SETGET_FUNCS(device_io_width, struct btrfs_dev_item, io_width, 32);
 BTRFS_SETGET_FUNCS(device_sector_size, struct btrfs_dev_item, sector_size, 32);
 BTRFS_SETGET_FUNCS(device_id, struct btrfs_dev_item, devid, 64);
 
+BTRFS_SETGET_STACK_FUNCS(stack_device_type, struct btrfs_dev_item, type, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_device_total_bytes, struct btrfs_dev_item,
+			 total_bytes, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_device_bytes_used, struct btrfs_dev_item,
+			 bytes_used, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_device_io_align, struct btrfs_dev_item,
+			 io_align, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_device_io_width, struct btrfs_dev_item,
+			 io_width, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_device_sector_size, struct btrfs_dev_item,
+			 sector_size, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_device_id, struct btrfs_dev_item, devid, 64);
+
 static inline char *btrfs_device_uuid(struct btrfs_dev_item *d)
 {
 	return (char *)d + offsetof(struct btrfs_dev_item, uuid);
@@ -1106,6 +1121,8 @@ BTRFS_SETGET_STACK_FUNCS(super_stripesize, struct btrfs_super_block,
 			 stripesize, 32);
 BTRFS_SETGET_STACK_FUNCS(super_root_dir, struct btrfs_super_block,
 			 root_dir_objectid, 64);
+BTRFS_SETGET_STACK_FUNCS(super_num_devices, struct btrfs_super_block,
+			 num_devices, 64);
 
 static inline unsigned long btrfs_leaf_data(struct extent_buffer *l)
 {
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 4890151cd68d..f971a29e4f20 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -365,12 +365,12 @@ static int close_all_devices(struct btrfs_fs_info *fs_info)
 	struct list_head *next;
 	struct btrfs_device *device;
 
-	list = &fs_info->devices;
-	while(!list_empty(list)) {
-		next = list->next;
-		list_del(next);
+	list = &fs_info->fs_devices->devices;
+	list_for_each(next, list) {
 		device = list_entry(next, struct btrfs_device, dev_list);
-		kfree(device);
+		if (device->bdev && device->bdev != fs_info->sb->s_bdev)
+			close_bdev_excl(device->bdev);
+		device->bdev = NULL;
 	}
 	return 0;
 }
@@ -655,7 +655,8 @@ static int add_hasher(struct btrfs_fs_info *info, char *type) {
 	return 0;
 }
 #endif
-struct btrfs_root *open_ctree(struct super_block *sb)
+struct btrfs_root *open_ctree(struct super_block *sb,
+			      struct btrfs_fs_devices *fs_devices)
 {
 	u32 sectorsize;
 	u32 nodesize;
@@ -697,8 +698,8 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 	fs_info->extent_root = extent_root;
 	fs_info->chunk_root = chunk_root;
 	fs_info->dev_root = dev_root;
+	fs_info->fs_devices = fs_devices;
 	INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
-	INIT_LIST_HEAD(&fs_info->devices);
 	INIT_LIST_HEAD(&fs_info->space_info);
 	btrfs_mapping_init(&fs_info->mapping_tree);
 	fs_info->sb = sb;
@@ -779,6 +780,12 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 	if (!btrfs_super_root(disk_super))
 		goto fail_sb_buffer;
 
+	if (btrfs_super_num_devices(disk_super) != fs_devices->num_devices) {
+		printk("Btrfs: wanted %llu devices, but found %llu\n",
+		       (unsigned long long)btrfs_super_num_devices(disk_super),
+		       (unsigned long long)fs_devices->num_devices);
+		goto fail_sb_buffer;
+	}
 	nodesize = btrfs_super_nodesize(disk_super);
 	leafsize = btrfs_super_leafsize(disk_super);
 	sectorsize = btrfs_super_sectorsize(disk_super);
@@ -799,8 +806,6 @@ struct btrfs_root *open_ctree(struct super_block *sb)
 	}
 
 	mutex_lock(&fs_info->fs_mutex);
-	ret = btrfs_read_super_device(tree_root, fs_info->sb_buffer);
-	BUG_ON(ret);
 
 	ret = btrfs_read_sys_array(tree_root);
 	BUG_ON(ret);
@@ -859,6 +864,7 @@ fail_sb_buffer:
 fail_iput:
 	iput(fs_info->btree_inode);
 fail:
+	close_all_devices(fs_info);
 	kfree(extent_root);
 	kfree(tree_root);
 	kfree(fs_info);
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 206cb48638f7..b7cbc58a5553 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -21,6 +21,7 @@
 
 #define BTRFS_SUPER_INFO_OFFSET (16 * 1024)
 struct btrfs_device;
+struct btrfs_fs_devices;
 
 struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
 				      u32 blocksize);
@@ -29,7 +30,8 @@ struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
 						   u64 bytenr, u32 blocksize);
 int clean_tree_block(struct btrfs_trans_handle *trans,
 		     struct btrfs_root *root, struct extent_buffer *buf);
-struct btrfs_root *open_ctree(struct super_block *sb);
+struct btrfs_root *open_ctree(struct super_block *sb,
+			      struct btrfs_fs_devices *fs_devices);
 int close_ctree(struct btrfs_root *root);
 int write_ctree_super(struct btrfs_trans_handle *trans,
 		      struct btrfs_root *root);
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index 8c6290665d49..4551e82013c8 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -22,8 +22,10 @@
 
 #define BTRFS_IOCTL_MAGIC 0x94
 #define BTRFS_VOL_NAME_MAX 255
+#define BTRFS_PATH_NAME_MAX 4095
+
 struct btrfs_ioctl_vol_args {
-	char name[BTRFS_VOL_NAME_MAX + 1];
+	char name[BTRFS_PATH_NAME_MAX + 1];
 };
 
 #define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \
@@ -32,4 +34,6 @@ struct btrfs_ioctl_vol_args {
 				   struct btrfs_ioctl_vol_args)
 #define BTRFS_IOC_RESIZE _IOW(BTRFS_IOCTL_MAGIC, 3, \
 				   struct btrfs_ioctl_vol_args)
+#define BTRFS_IOC_SCAN_DEV _IOW(BTRFS_IOCTL_MAGIC, 4, \
+				   struct btrfs_ioctl_vol_args)
 #endif
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 67ed216df475..9624923a33dc 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -44,6 +44,7 @@
 #include "ioctl.h"
 #include "print-tree.h"
 #include "xattr.h"
+#include "volumes.h"
 
 #define BTRFS_SUPER_MAGIC 0x9123683E
 
@@ -216,7 +217,9 @@ static int parse_options (char * options,
 	return 1;
 }
 
-static int btrfs_fill_super(struct super_block * sb, void * data, int silent)
+static int btrfs_fill_super(struct super_block * sb,
+			    struct btrfs_fs_devices *fs_devices,
+			    void * data, int silent)
 {
 	struct inode * inode;
 	struct dentry * root_dentry;
@@ -231,7 +234,7 @@ static int btrfs_fill_super(struct super_block * sb, void * data, int silent)
 	sb->s_xattr = btrfs_xattr_handlers;
 	sb->s_time_gran = 1;
 
-	tree_root = open_ctree(sb);
+	tree_root = open_ctree(sb, fs_devices);
 
 	if (!tree_root || IS_ERR(tree_root)) {
 		printk("btrfs: open_ctree failed\n");
@@ -334,18 +337,23 @@ static int test_bdev_super(struct super_block *s, void *data)
 
 int btrfs_get_sb_bdev(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *data,
-	int (*fill_super)(struct super_block *, void *, int),
 	struct vfsmount *mnt, const char *subvol)
 {
 	struct block_device *bdev = NULL;
 	struct super_block *s;
 	struct dentry *root;
+	struct btrfs_fs_devices *fs_devices = NULL;
 	int error = 0;
 
-	bdev = open_bdev_excl(dev_name, flags, fs_type);
-	if (IS_ERR(bdev))
-		return PTR_ERR(bdev);
+	error = btrfs_scan_one_device(dev_name, flags, fs_type, &fs_devices);
+	if (error)
+		return error;
 
+	error = btrfs_open_devices(fs_devices, flags, fs_type);
+	if (error)
+		return error;
+
+	bdev = fs_devices->lowest_bdev;
 	/*
 	 * once the super is inserted into the list by sget, s_umount
 	 * will protect the lockfs code from trying to start a snapshot
@@ -372,7 +380,8 @@ int btrfs_get_sb_bdev(struct file_system_type *fs_type,
 		s->s_flags = flags;
 		strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
 		sb_set_blocksize(s, block_size(bdev));
-		error = fill_super(s, data, flags & MS_SILENT ? 1 : 0);
+		error = btrfs_fill_super(s, fs_devices, data,
+					 flags & MS_SILENT ? 1 : 0);
 		if (error) {
 			up_write(&s->s_umount);
 			deactivate_super(s);
@@ -408,7 +417,7 @@ int btrfs_get_sb_bdev(struct file_system_type *fs_type,
 error_s:
 	error = PTR_ERR(s);
 error_bdev:
-	close_bdev_excl(bdev);
+	btrfs_close_devices(fs_devices);
 error:
 	return error;
 }
@@ -421,8 +430,7 @@ static int btrfs_get_sb(struct file_system_type *fs_type,
 	char *subvol_name = NULL;
 
 	parse_options((char *)data, NULL, &subvol_name);
-	ret = btrfs_get_sb_bdev(fs_type, flags, dev_name, data,
-			btrfs_fill_super, mnt,
+	ret = btrfs_get_sb_bdev(fs_type, flags, dev_name, data, mnt,
 			subvol_name ? subvol_name : "default");
 	if (subvol_name)
 		kfree(subvol_name);
@@ -445,13 +453,6 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	return 0;
 }
 
-static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
-				unsigned long arg)
-{
-	printk("btrfs control ioctl %d\n", cmd);
-	return 0;
-}
-
 static struct file_system_type btrfs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "btrfs",
@@ -460,6 +461,31 @@ static struct file_system_type btrfs_fs_type = {
 	.fs_flags	= FS_REQUIRES_DEV,
 };
 
+static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
+				unsigned long arg)
+{
+	struct btrfs_ioctl_vol_args *vol;
+	struct btrfs_fs_devices *fs_devices;
+	int ret;
+	int len;
+
+	vol = kmalloc(sizeof(*vol), GFP_KERNEL);
+	if (copy_from_user(vol, (void __user *)arg, sizeof(*vol))) {
+		ret = -EFAULT;
+		goto out;
+	}
+	len = strnlen(vol->name, BTRFS_PATH_NAME_MAX);
+	switch (cmd) {
+	case BTRFS_IOC_SCAN_DEV:
+		ret = btrfs_scan_one_device(vol->name, MS_RDONLY,
+					    &btrfs_fs_type, &fs_devices);
+		break;
+	}
+out:
+	kfree(vol);
+	return 0;
+}
+
 static void btrfs_write_super_lockfs(struct super_block *sb)
 {
 	struct btrfs_root *root = btrfs_sb(sb);
@@ -567,6 +593,7 @@ static void __exit exit_btrfs_fs(void)
 	btrfs_interface_exit();
 	unregister_filesystem(&btrfs_fs_type);
 	btrfs_exit_sysfs();
+	btrfs_cleanup_fs_uuids();
 }
 
 module_init(init_btrfs_fs)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 16fb6bbe6e28..263f01cc3db4 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -17,6 +17,7 @@
  */
 #include <linux/sched.h>
 #include <linux/bio.h>
+#include <linux/buffer_head.h>
 #include "ctree.h"
 #include "extent_map.h"
 #include "disk-io.h"
@@ -28,6 +29,215 @@ struct map_lookup {
 	struct btrfs_device *dev;
 	u64 physical;
 };
+static DEFINE_MUTEX(uuid_mutex);
+static LIST_HEAD(fs_uuids);
+
+int btrfs_cleanup_fs_uuids(void)
+{
+	struct btrfs_fs_devices *fs_devices;
+	struct list_head *uuid_cur;
+	struct list_head *devices_cur;
+	struct btrfs_device *dev;
+
+	list_for_each(uuid_cur, &fs_uuids) {
+		fs_devices = list_entry(uuid_cur, struct btrfs_fs_devices,
+					list);
+		while(!list_empty(&fs_devices->devices)) {
+			devices_cur = fs_devices->devices.next;
+			dev = list_entry(devices_cur, struct btrfs_device,
+					 dev_list);
+			printk("uuid cleanup finds %s\n", dev->name);
+			if (dev->bdev) {
+				printk("closing\n");
+				close_bdev_excl(dev->bdev);
+			}
+			list_del(&dev->dev_list);
+			kfree(dev);
+		}
+	}
+	return 0;
+}
+
+static struct btrfs_device *__find_device(struct list_head *head, u64 devid)
+{
+	struct btrfs_device *dev;
+	struct list_head *cur;
+
+	list_for_each(cur, head) {
+		dev = list_entry(cur, struct btrfs_device, dev_list);
+		if (dev->devid == devid)
+			return dev;
+	}
+	return NULL;
+}
+
+static struct btrfs_fs_devices *find_fsid(u8 *fsid)
+{
+	struct list_head *cur;
+	struct btrfs_fs_devices *fs_devices;
+
+	list_for_each(cur, &fs_uuids) {
+		fs_devices = list_entry(cur, struct btrfs_fs_devices, list);
+		if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
+			return fs_devices;
+	}
+	return NULL;
+}
+
+static int device_list_add(const char *path,
+			   struct btrfs_super_block *disk_super,
+			   u64 devid, struct btrfs_fs_devices **fs_devices_ret)
+{
+	struct btrfs_device *device;
+	struct btrfs_fs_devices *fs_devices;
+	u64 found_transid = btrfs_super_generation(disk_super);
+
+	fs_devices = find_fsid(disk_super->fsid);
+	if (!fs_devices) {
+		fs_devices = kmalloc(sizeof(*fs_devices), GFP_NOFS);
+		if (!fs_devices)
+			return -ENOMEM;
+		INIT_LIST_HEAD(&fs_devices->devices);
+		list_add(&fs_devices->list, &fs_uuids);
+		memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
+		fs_devices->latest_devid = devid;
+		fs_devices->latest_trans = found_transid;
+		fs_devices->lowest_devid = (u64)-1;
+		fs_devices->num_devices = 0;
+		device = NULL;
+	} else {
+		device = __find_device(&fs_devices->devices, devid);
+	}
+	if (!device) {
+		device = kzalloc(sizeof(*device), GFP_NOFS);
+		if (!device) {
+			/* we can safely leave the fs_devices entry around */
+			return -ENOMEM;
+		}
+		device->devid = devid;
+		device->name = kstrdup(path, GFP_NOFS);
+		if (!device->name) {
+			kfree(device);
+			return -ENOMEM;
+		}
+		list_add(&device->dev_list, &fs_devices->devices);
+		fs_devices->num_devices++;
+	}
+
+	if (found_transid > fs_devices->latest_trans) {
+		fs_devices->latest_devid = devid;
+		fs_devices->latest_trans = found_transid;
+	}
+	if (fs_devices->lowest_devid > devid) {
+		fs_devices->lowest_devid = devid;
+		printk("lowest devid now %Lu\n", devid);
+	}
+	*fs_devices_ret = fs_devices;
+	return 0;
+}
+
+int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
+{
+	struct list_head *head = &fs_devices->devices;
+	struct list_head *cur;
+	struct btrfs_device *device;
+
+	mutex_lock(&uuid_mutex);
+	list_for_each(cur, head) {
+		device = list_entry(cur, struct btrfs_device, dev_list);
+		if (device->bdev) {
+			close_bdev_excl(device->bdev);
+			printk("close devices closes %s\n", device->name);
+		}
+		device->bdev = NULL;
+	}
+	mutex_unlock(&uuid_mutex);
+	return 0;
+}
+
+int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
+		       int flags, void *holder)
+{
+	struct block_device *bdev;
+	struct list_head *head = &fs_devices->devices;
+	struct list_head *cur;
+	struct btrfs_device *device;
+	int ret;
+
+	mutex_lock(&uuid_mutex);
+	list_for_each(cur, head) {
+		device = list_entry(cur, struct btrfs_device, dev_list);
+		bdev = open_bdev_excl(device->name, flags, holder);
+printk("opening %s devid %Lu\n", device->name, device->devid);
+		if (IS_ERR(bdev)) {
+			printk("open %s failed\n", device->name);
+			ret = PTR_ERR(bdev);
+			goto fail;
+		}
+		if (device->devid == fs_devices->latest_devid)
+			fs_devices->latest_bdev = bdev;
+		if (device->devid == fs_devices->lowest_devid) {
+			fs_devices->lowest_bdev = bdev;
+printk("lowest bdev %s\n", device->name);
+		}
+		device->bdev = bdev;
+	}
+	mutex_unlock(&uuid_mutex);
+	return 0;
+fail:
+	mutex_unlock(&uuid_mutex);
+	btrfs_close_devices(fs_devices);
+	return ret;
+}
+
+int btrfs_scan_one_device(const char *path, int flags, void *holder,
+			  struct btrfs_fs_devices **fs_devices_ret)
+{
+	struct btrfs_super_block *disk_super;
+	struct block_device *bdev;
+	struct buffer_head *bh;
+	int ret;
+	u64 devid;
+
+	mutex_lock(&uuid_mutex);
+
+	printk("scan one opens %s\n", path);
+	bdev = open_bdev_excl(path, flags, holder);
+
+	if (IS_ERR(bdev)) {
+		printk("open failed\n");
+		ret = PTR_ERR(bdev);
+		goto error;
+	}
+
+	ret = set_blocksize(bdev, 4096);
+	if (ret)
+		goto error_close;
+	bh = __bread(bdev, BTRFS_SUPER_INFO_OFFSET / 4096, 4096);
+	if (!bh) {
+		ret = -EIO;
+		goto error_close;
+	}
+	disk_super = (struct btrfs_super_block *)bh->b_data;
+	if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC,
+	    sizeof(disk_super->magic))) {
+		printk("no btrfs found on %s\n", path);
+		ret = -ENOENT;
+		goto error_brelse;
+	}
+	devid = le64_to_cpu(disk_super->dev_item.devid);
+	printk("found device %Lu on %s\n", devid, path);
+	ret = device_list_add(path, disk_super, devid, fs_devices_ret);
+
+error_brelse:
+	brelse(bh);
+error_close:
+	close_bdev_excl(bdev);
+	printk("scan one closes bdev %s\n", path);
+error:
+	mutex_unlock(&uuid_mutex);
+	return ret;
+}
 
 /*
  * this uses a pretty simple search, the expectation is that it is
@@ -56,6 +266,10 @@ static int find_free_dev_extent(struct btrfs_trans_handle *trans,
 
 	/* FIXME use last free of some kind */
 
+	/* we don't want to overwrite the superblock on the drive,
+	 * so we make sure to start at an offset of at least 1MB
+	 */
+	search_start = max((u64)1024 * 1024, search_start);
 	key.objectid = device->devid;
 	key.offset = search_start;
 	key.type = BTRFS_DEV_EXTENT_KEY;
@@ -285,6 +499,7 @@ int btrfs_add_device(struct btrfs_trans_handle *trans,
 	leaf = path->nodes[0];
 	dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
 
+	device->devid = free_devid;
 	btrfs_set_device_id(leaf, dev_item, device->devid);
 	btrfs_set_device_type(leaf, dev_item, device->type);
 	btrfs_set_device_io_align(leaf, dev_item, device->io_align);
@@ -382,7 +597,7 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	struct btrfs_device *device = NULL;
 	struct btrfs_chunk *chunk;
 	struct list_head private_devs;
-	struct list_head *dev_list = &extent_root->fs_info->devices;
+	struct list_head *dev_list = &extent_root->fs_info->fs_devices->devices;
 	struct list_head *cur;
 	struct extent_map_tree *em_tree;
 	struct map_lookup *map;
@@ -449,7 +664,7 @@ again:
 					     key.objectid,
 					     calc_size, &dev_offset);
 		BUG_ON(ret);
-
+printk("alloc chunk size %Lu from dev %Lu\n", calc_size, device->devid);
 		device->bytes_used += calc_size;
 		ret = btrfs_update_device(trans, device);
 		BUG_ON(ret);
@@ -592,17 +807,9 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio)
 
 struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid)
 {
-	struct btrfs_device *dev;
-	struct list_head *cur = root->fs_info->devices.next;
-	struct list_head *head = &root->fs_info->devices;
+	struct list_head *head = &root->fs_info->fs_devices->devices;
 
-	while(cur != head) {
-		dev = list_entry(cur, struct btrfs_device, dev_list);
-		if (dev->devid == devid)
-			return dev;
-		cur = cur->next;
-	}
-	return NULL;
+	return __find_device(head, devid);
 }
 
 static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
@@ -699,15 +906,16 @@ static int read_one_dev(struct btrfs_root *root,
 	devid = btrfs_device_id(leaf, dev_item);
 	device = btrfs_find_device(root, devid);
 	if (!device) {
+		printk("warning devid %Lu not found already\n", devid);
 		device = kmalloc(sizeof(*device), GFP_NOFS);
 		if (!device)
 			return -ENOMEM;
-		list_add(&device->dev_list, &root->fs_info->devices);
+		list_add(&device->dev_list,
+			 &root->fs_info->fs_devices->devices);
 	}
 
 	fill_device_from_item(leaf, dev_item, device);
 	device->dev_root = root->fs_info->dev_root;
-	device->bdev = root->fs_info->sb->s_bdev;
 	ret = 0;
 #if 0
 	ret = btrfs_open_device(device);
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 20259128152e..12f297eb0559 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -24,6 +24,8 @@ struct btrfs_device {
 
 	struct block_device *bdev;
 
+	char *name;
+
 	/* the internal btrfs device id */
 	u64 devid;
 
@@ -49,6 +51,20 @@ struct btrfs_device {
 	u8 uuid[BTRFS_DEV_UUID_SIZE];
 };
 
+struct btrfs_fs_devices {
+	u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
+
+	/* the device with this id has the most recent copy of the super */
+	u64 latest_devid;
+	u64 latest_trans;
+	u64 lowest_devid;
+	u64 num_devices;
+	struct block_device *latest_bdev;
+	struct block_device *lowest_bdev;
+	struct list_head devices;
+	struct list_head list;
+};
+
 int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
 			   struct btrfs_device *device,
 			   u64 owner, u64 num_bytes, u64 *start);
@@ -67,4 +83,13 @@ int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf);
 int btrfs_map_block(struct btrfs_mapping_tree *map_tree,
 		    u64 logical, u64 *phys, u64 *length,
 		    struct btrfs_device **dev);
+int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
+		       int flags, void *holder);
+int btrfs_scan_one_device(const char *path, int flags, void *holder,
+			  struct btrfs_fs_devices **fs_devices_ret);
+int btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
+int btrfs_add_device(struct btrfs_trans_handle *trans,
+		     struct btrfs_root *root,
+		     struct btrfs_device *device);
+int btrfs_cleanup_fs_uuids(void);
 #endif
-- 
cgit v1.2.3


From 593060d756e0c2382d59cf00d5f3b9e3a336c408 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 25 Mar 2008 16:50:33 -0400
Subject: Btrfs: Implement raid0 when multiple devices are present

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       |   3 ++
 fs/btrfs/disk-io.c     |   2 +
 fs/btrfs/extent-tree.c | 102 +++++++++++++++++++++++++-------------
 fs/btrfs/inode.c       |   2 -
 fs/btrfs/volumes.c     | 130 +++++++++++++++++++++++++++++++++++++------------
 5 files changed, 173 insertions(+), 66 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 7556f8319c60..f43dfe7ffe6d 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -413,6 +413,7 @@ struct btrfs_csum_item {
 #define BTRFS_BLOCK_GROUP_DATA     (1 << 0)
 #define BTRFS_BLOCK_GROUP_SYSTEM   (1 << 1)
 #define BTRFS_BLOCK_GROUP_METADATA (1 << 2)
+#define BTRFS_BLOCK_GROUP_RAID0    (1 << 3)
 
 
 struct btrfs_block_group_item {
@@ -498,6 +499,8 @@ struct btrfs_fs_info {
 	u64 delalloc_bytes;
 	u64 last_alloc;
 	u64 last_data_alloc;
+	int extra_data_alloc_bits;
+	int extra_alloc_bits;
 };
 
 /*
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index f971a29e4f20..d8474796dea2 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -736,6 +736,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	fs_info->total_pinned = 0;
 	fs_info->last_alloc = 0;
 	fs_info->last_data_alloc = 0;
+	fs_info->extra_alloc_bits = 0;
+	fs_info->extra_data_alloc_bits = 0;
 
 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
 	INIT_WORK(&fs_info->trans_work, btrfs_transaction_cleaner, fs_info);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index e9ef644ff56f..c226656f29b7 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -172,7 +172,7 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group(struct
 
 static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
 {
-	return (cache->flags & bits);
+	return (cache->flags & bits) == bits;
 }
 
 static int noinline find_search_start(struct btrfs_root *root,
@@ -1010,6 +1010,35 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
 
 }
 
+static int update_space_info(struct btrfs_fs_info *info, u64 flags,
+			     u64 total_bytes, u64 bytes_used,
+			     struct btrfs_space_info **space_info)
+{
+	struct btrfs_space_info *found;
+
+	found = __find_space_info(info, flags);
+	if (found) {
+		found->total_bytes += total_bytes;
+		found->bytes_used += bytes_used;
+		WARN_ON(found->total_bytes < found->bytes_used);
+		*space_info = found;
+		return 0;
+	}
+	found = kmalloc(sizeof(*found), GFP_NOFS);
+	if (!found)
+		return -ENOMEM;
+
+	list_add(&found->list, &info->space_info);
+	found->flags = flags;
+	found->total_bytes = total_bytes;
+	found->bytes_used = bytes_used;
+	found->bytes_pinned = 0;
+	found->full = 0;
+	*space_info = found;
+	return 0;
+}
+
+
 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *extent_root, u64 alloc_bytes,
 			  u64 flags)
@@ -1021,6 +1050,11 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
 	int ret;
 
 	space_info = __find_space_info(extent_root->fs_info, flags);
+	if (!space_info) {
+		ret = update_space_info(extent_root->fs_info, flags,
+					0, 0, &space_info);
+		BUG_ON(ret);
+	}
 	BUG_ON(!space_info);
 
 	if (space_info->full)
@@ -1044,6 +1078,17 @@ printk("space info full %Lu\n", flags);
 		     extent_root->fs_info->chunk_root->root_key.objectid,
 		     start, num_bytes);
 	BUG_ON(ret);
+
+	if (flags & BTRFS_BLOCK_GROUP_RAID0) {
+		if (flags & BTRFS_BLOCK_GROUP_DATA) {
+			extent_root->fs_info->extra_data_alloc_bits =
+				BTRFS_BLOCK_GROUP_RAID0;
+		}
+		if (flags & BTRFS_BLOCK_GROUP_METADATA) {
+			extent_root->fs_info->extra_alloc_bits =
+				BTRFS_BLOCK_GROUP_RAID0;
+		}
+	}
 	return 0;
 }
 
@@ -1655,24 +1700,31 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 	struct btrfs_extent_ref *ref;
 	struct btrfs_path *path;
 	struct btrfs_key keys[2];
+	int extra_chunk_alloc_bits = 0;
 
 	if (data) {
-		data = BTRFS_BLOCK_GROUP_DATA;
+		data = BTRFS_BLOCK_GROUP_DATA | info->extra_data_alloc_bits;
 	} else if (root == root->fs_info->chunk_root) {
 		data = BTRFS_BLOCK_GROUP_SYSTEM;
 	} else {
-		data = BTRFS_BLOCK_GROUP_METADATA;
+		data = BTRFS_BLOCK_GROUP_METADATA | info->extra_alloc_bits;
 	}
+	if (btrfs_super_num_devices(&info->super_copy) > 1 &&
+	    !(data & BTRFS_BLOCK_GROUP_SYSTEM))
+		extra_chunk_alloc_bits = BTRFS_BLOCK_GROUP_RAID0;
 
 	if (root->ref_cows) {
-		if (data != BTRFS_BLOCK_GROUP_METADATA) {
+		if (!(data & BTRFS_BLOCK_GROUP_METADATA)) {
 			ret = do_chunk_alloc(trans, root->fs_info->extent_root,
 					     2 * 1024 * 1024,
-					     BTRFS_BLOCK_GROUP_METADATA);
+					     BTRFS_BLOCK_GROUP_METADATA |
+					     info->extra_alloc_bits |
+					     extra_chunk_alloc_bits);
 			BUG_ON(ret);
 		}
 		ret = do_chunk_alloc(trans, root->fs_info->extent_root,
-				     num_bytes + 2 * 1024 * 1024, data);
+				     num_bytes + 2 * 1024 * 1024, data |
+				     extra_chunk_alloc_bits);
 		BUG_ON(ret);
 	}
 
@@ -2627,34 +2679,6 @@ error:
 	return ret;
 }
 
-static int update_space_info(struct btrfs_fs_info *info, u64 flags,
-			     u64 total_bytes, u64 bytes_used,
-			     struct btrfs_space_info **space_info)
-{
-	struct btrfs_space_info *found;
-
-	found = __find_space_info(info, flags);
-	if (found) {
-		found->total_bytes += total_bytes;
-		found->bytes_used += bytes_used;
-		WARN_ON(found->total_bytes < found->bytes_used);
-		*space_info = found;
-		return 0;
-	}
-	found = kmalloc(sizeof(*found), GFP_NOFS);
-	if (!found)
-		return -ENOMEM;
-
-	list_add(&found->list, &info->space_info);
-	found->flags = flags;
-	found->total_bytes = total_bytes;
-	found->bytes_used = bytes_used;
-	found->bytes_pinned = 0;
-	found->full = 0;
-	*space_info = found;
-	return 0;
-}
-
 int btrfs_read_block_groups(struct btrfs_root *root)
 {
 	struct btrfs_path *path;
@@ -2712,6 +2736,16 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 		} else if (cache->flags & BTRFS_BLOCK_GROUP_METADATA) {
 			bit = BLOCK_GROUP_METADATA;
 		}
+		if (cache->flags & BTRFS_BLOCK_GROUP_RAID0) {
+			if (cache->flags & BTRFS_BLOCK_GROUP_DATA) {
+				info->extra_data_alloc_bits =
+					BTRFS_BLOCK_GROUP_RAID0;
+			}
+			if (cache->flags & BTRFS_BLOCK_GROUP_METADATA) {
+				info->extra_alloc_bits =
+					BTRFS_BLOCK_GROUP_RAID0;
+			}
+		}
 
 		ret = update_space_info(info, cache->flags, found_key.offset,
 					btrfs_block_group_used(&cache->item),
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 5140d6801846..db60d85598ce 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -317,8 +317,6 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
 	map_length = length;
 	ret = btrfs_map_block(map_tree, logical, &physical, &map_length, &dev);
 	if (map_length < length + size) {
-		printk("merge bio hook logical %Lu bio len %Lu physical %Lu "
-		       "len %Lu\n", logical, length, physical, map_length);
 		return 1;
 	}
 	return 0;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 263f01cc3db4..d8fce32a3bbc 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -18,6 +18,7 @@
 #include <linux/sched.h>
 #include <linux/bio.h>
 #include <linux/buffer_head.h>
+#include <asm/div64.h>
 #include "ctree.h"
 #include "extent_map.h"
 #include "disk-io.h"
@@ -25,10 +26,24 @@
 #include "print-tree.h"
 #include "volumes.h"
 
-struct map_lookup {
+struct stripe {
 	struct btrfs_device *dev;
 	u64 physical;
 };
+
+struct map_lookup {
+	u64 type;
+	int io_align;
+	int io_width;
+	int stripe_len;
+	int sector_size;
+	int num_stripes;
+	struct stripe stripes[];
+};
+
+#define map_lookup_size(n) (sizeof(struct map_lookup) + \
+			    (sizeof(struct stripe) * (n)))
+
 static DEFINE_MUTEX(uuid_mutex);
 static LIST_HEAD(fs_uuids);
 
@@ -592,6 +607,7 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 		      u64 *num_bytes, u64 type)
 {
 	u64 dev_offset;
+	struct btrfs_fs_info *info = extent_root->fs_info;
 	struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
 	struct btrfs_stripe *stripes;
 	struct btrfs_device *device = NULL;
@@ -610,10 +626,18 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	int looped = 0;
 	int ret;
 	int index;
+	int stripe_len = 64 * 1024;
 	struct btrfs_key key;
 
 	if (list_empty(dev_list))
 		return -ENOSPC;
+
+	if (type & BTRFS_BLOCK_GROUP_RAID0)
+		num_stripes = btrfs_super_num_devices(&info->super_copy);
+	if (type & BTRFS_BLOCK_GROUP_DATA)
+		stripe_len = 64 * 1024;
+	if (type & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM))
+		stripe_len = 32 * 1024;
 again:
 	INIT_LIST_HEAD(&private_devs);
 	cur = dev_list->next;
@@ -650,9 +674,15 @@ again:
 	if (!chunk)
 		return -ENOMEM;
 
+	map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
+	if (!map) {
+		kfree(chunk);
+		return -ENOMEM;
+	}
+
 	stripes = &chunk->stripe;
 
-	*num_bytes = calc_size;
+	*num_bytes = calc_size * num_stripes;
 	index = 0;
 	while(index < num_stripes) {
 		BUG_ON(list_empty(&private_devs));
@@ -669,6 +699,8 @@ printk("alloc chunk size %Lu from dev %Lu\n", calc_size, device->devid);
 		ret = btrfs_update_device(trans, device);
 		BUG_ON(ret);
 
+		map->stripes[index].dev = device;
+		map->stripes[index].physical = dev_offset;
 		btrfs_set_stack_stripe_devid(stripes + index, device->devid);
 		btrfs_set_stack_stripe_offset(stripes + index, dev_offset);
 		physical = dev_offset;
@@ -680,12 +712,18 @@ printk("alloc chunk size %Lu from dev %Lu\n", calc_size, device->devid);
 	key.offset = *num_bytes;
 	key.type = BTRFS_CHUNK_ITEM_KEY;
 	btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
-	btrfs_set_stack_chunk_stripe_len(chunk, 64 * 1024);
+	btrfs_set_stack_chunk_stripe_len(chunk, stripe_len);
 	btrfs_set_stack_chunk_type(chunk, type);
 	btrfs_set_stack_chunk_num_stripes(chunk, num_stripes);
-	btrfs_set_stack_chunk_io_align(chunk, extent_root->sectorsize);
-	btrfs_set_stack_chunk_io_width(chunk, extent_root->sectorsize);
+	btrfs_set_stack_chunk_io_align(chunk, stripe_len);
+	btrfs_set_stack_chunk_io_width(chunk, stripe_len);
 	btrfs_set_stack_chunk_sector_size(chunk, extent_root->sectorsize);
+	map->sector_size = extent_root->sectorsize;
+	map->stripe_len = stripe_len;
+	map->io_align = stripe_len;
+	map->io_width = stripe_len;
+	map->type = type;
+	map->num_stripes = num_stripes;
 
 	ret = btrfs_insert_item(trans, chunk_root, &key, chunk,
 				btrfs_chunk_item_size(num_stripes));
@@ -695,25 +733,11 @@ printk("alloc chunk size %Lu from dev %Lu\n", calc_size, device->devid);
 	em = alloc_extent_map(GFP_NOFS);
 	if (!em)
 		return -ENOMEM;
-	map = kmalloc(sizeof(*map), GFP_NOFS);
-	if (!map) {
-		free_extent_map(em);
-		return -ENOMEM;
-	}
-
 	em->bdev = (struct block_device *)map;
 	em->start = key.objectid;
 	em->len = key.offset;
 	em->block_start = 0;
 
-	map->physical = physical;
-	map->dev = device;
-
-	if (!map->dev) {
-		kfree(map);
-		free_extent_map(em);
-		return -EIO;
-	}
 	kfree(chunk);
 
 	em_tree = &extent_root->fs_info->mapping_tree.map_tree;
@@ -758,6 +782,9 @@ int btrfs_map_block(struct btrfs_mapping_tree *map_tree,
 	struct map_lookup *map;
 	struct extent_map_tree *em_tree = &map_tree->map_tree;
 	u64 offset;
+	u64 stripe_offset;
+	u64 stripe_nr;
+	int stripe_index;
 
 
 	spin_lock(&em_tree->lock);
@@ -767,9 +794,40 @@ int btrfs_map_block(struct btrfs_mapping_tree *map_tree,
 	BUG_ON(em->start > logical || em->start + em->len < logical);
 	map = (struct map_lookup *)em->bdev;
 	offset = logical - em->start;
-	*phys = map->physical + offset;
-	*length = em->len - offset;
-	*dev = map->dev;
+
+	stripe_nr = offset;
+	/*
+	 * stripe_nr counts the total number of stripes we have to stride
+	 * to get to this block
+	 */
+	do_div(stripe_nr, map->stripe_len);
+
+	stripe_offset = stripe_nr * map->stripe_len;
+	BUG_ON(offset < stripe_offset);
+
+	/* stripe_offset is the offset of this block in its stripe */
+	stripe_offset = offset - stripe_offset;
+
+	/*
+	 * after this do_div call, stripe_nr is the number of stripes
+	 * on this device we have to walk to find the data, and
+	 * stripe_index is the number of our device in the stripe array
+	 */
+	stripe_index = do_div(stripe_nr, map->num_stripes);
+
+	BUG_ON(stripe_index >= map->num_stripes);
+
+	*phys = map->stripes[stripe_index].physical + stripe_offset +
+		stripe_nr * map->stripe_len;
+
+	if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
+		/* we limit the length of each bio to what fits in a stripe */
+		*length = min_t(u64, em->len - offset,
+			      map->stripe_len - stripe_offset);
+	} else {
+		*length = em->len - offset;
+	}
+	*dev = map->stripes[stripe_index].dev;
 	free_extent_map(em);
 	spin_unlock(&em_tree->lock);
 	return 0;
@@ -822,7 +880,9 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
 	u64 logical;
 	u64 length;
 	u64 devid;
+	int num_stripes;
 	int ret;
+	int i;
 
 	logical = key->objectid;
 	length = key->offset;
@@ -846,7 +906,8 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
 	em = alloc_extent_map(GFP_NOFS);
 	if (!em)
 		return -ENOMEM;
-	map = kmalloc(sizeof(*map), GFP_NOFS);
+	num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
+	map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
 	if (!map) {
 		free_extent_map(em);
 		return -ENOMEM;
@@ -857,13 +918,22 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
 	em->len = length;
 	em->block_start = 0;
 
-	map->physical = btrfs_stripe_offset_nr(leaf, chunk, 0);
-	devid = btrfs_stripe_devid_nr(leaf, chunk, 0);
-	map->dev = btrfs_find_device(root, devid);
-	if (!map->dev) {
-		kfree(map);
-		free_extent_map(em);
-		return -EIO;
+	map->num_stripes = num_stripes;
+	map->io_width = btrfs_chunk_io_width(leaf, chunk);
+	map->io_align = btrfs_chunk_io_align(leaf, chunk);
+	map->sector_size = btrfs_chunk_sector_size(leaf, chunk);
+	map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
+	map->type = btrfs_chunk_type(leaf, chunk);
+	for (i = 0; i < num_stripes; i++) {
+		map->stripes[i].physical =
+			btrfs_stripe_offset_nr(leaf, chunk, i);
+		devid = btrfs_stripe_devid_nr(leaf, chunk, i);
+		map->stripes[i].dev = btrfs_find_device(root, devid);
+		if (!map->stripes[i].dev) {
+			kfree(map);
+			free_extent_map(em);
+			return -EIO;
+		}
 	}
 
 	spin_lock(&map_tree->map_tree.lock);
-- 
cgit v1.2.3


From e58ca0203d32869a01540a293df40ddc480dc378 Mon Sep 17 00:00:00 2001
From: Yan <yanzheng@21cn.com>
Date: Tue, 1 Apr 2008 11:21:34 -0400
Subject: Fix btrfs_fill_super to return -EINVAL when no FS found

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c | 3 +--
 fs/btrfs/super.c   | 4 ++--
 fs/btrfs/volumes.c | 2 +-
 3 files changed, 4 insertions(+), 5 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 709e33d08bb5..5547607681f4 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -728,9 +728,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	struct btrfs_root *dev_root = kmalloc(sizeof(struct btrfs_root),
 					      GFP_NOFS);
 	int ret;
-	int err = -EIO;
+	int err = -EINVAL;
 	struct btrfs_super_block *disk_super;
-
 	if (!extent_root || !tree_root || !fs_info) {
 		err = -ENOMEM;
 		goto fail;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 9624923a33dc..892864906880 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -236,9 +236,9 @@ static int btrfs_fill_super(struct super_block * sb,
 
 	tree_root = open_ctree(sb, fs_devices);
 
-	if (!tree_root || IS_ERR(tree_root)) {
+	if (IS_ERR(tree_root)) {
 		printk("btrfs: open_ctree failed\n");
-		return -EIO;
+		return PTR_ERR(tree_root);
 	}
 	sb->s_fs_info = tree_root;
 	disk_super = &tree_root->fs_info->super_copy;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index d8fce32a3bbc..169be0f7285b 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -237,7 +237,7 @@ int btrfs_scan_one_device(const char *path, int flags, void *holder,
 	if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC,
 	    sizeof(disk_super->magic))) {
 		printk("no btrfs found on %s\n", path);
-		ret = -ENOENT;
+		ret = -EINVAL;
 		goto error_brelse;
 	}
 	devid = le64_to_cpu(disk_super->dev_item.devid);
-- 
cgit v1.2.3


From 8790d502e4401a4a3a4175b83a3a47e8d595c771 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 3 Apr 2008 16:29:03 -0400
Subject: Btrfs: Add support for mirroring across drives

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       |   9 ++-
 fs/btrfs/disk-io.c     |  20 +++----
 fs/btrfs/extent-tree.c |  93 +++++++++++++++--------------
 fs/btrfs/inode.c       |   4 +-
 fs/btrfs/volumes.c     | 154 ++++++++++++++++++++++++++++++++++++++++---------
 fs/btrfs/volumes.h     |  11 ++--
 6 files changed, 198 insertions(+), 93 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 67d533cf8f47..0a207861472e 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -418,6 +418,7 @@ struct btrfs_csum_item {
 #define BTRFS_BLOCK_GROUP_SYSTEM   (1 << 1)
 #define BTRFS_BLOCK_GROUP_METADATA (1 << 2)
 #define BTRFS_BLOCK_GROUP_RAID0    (1 << 3)
+#define BTRFS_BLOCK_GROUP_RAID1    (1 << 4)
 
 
 struct btrfs_block_group_item {
@@ -504,8 +505,12 @@ struct btrfs_fs_info {
 	u64 delalloc_bytes;
 	u64 last_alloc;
 	u64 last_data_alloc;
-	int extra_data_alloc_bits;
-	int extra_alloc_bits;
+	int avail_data_alloc_bits;
+	int avail_metadata_alloc_bits;
+	int avail_system_alloc_bits;
+	int data_alloc_profile;
+	int metadata_alloc_profile;
+	int system_alloc_profile;
 };
 
 /*
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index e40fb318ad99..ff75ad586767 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -735,7 +735,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 						 GFP_NOFS);
 	struct btrfs_root *tree_root = kmalloc(sizeof(struct btrfs_root),
 					       GFP_NOFS);
-	struct btrfs_fs_info *fs_info = kmalloc(sizeof(*fs_info),
+	struct btrfs_fs_info *fs_info = kzalloc(sizeof(*fs_info),
 						GFP_NOFS);
 	struct btrfs_root *chunk_root = kmalloc(sizeof(struct btrfs_root),
 						GFP_NOFS);
@@ -744,6 +744,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	int ret;
 	int err = -EINVAL;
 	struct btrfs_super_block *disk_super;
+
 	if (!extent_root || !tree_root || !fs_info) {
 		err = -ENOMEM;
 		goto fail;
@@ -756,11 +757,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	spin_lock_init(&fs_info->delalloc_lock);
 	spin_lock_init(&fs_info->new_trans_lock);
 
-	memset(&fs_info->super_kobj, 0, sizeof(fs_info->super_kobj));
 	init_completion(&fs_info->kobj_unregister);
 	sb_set_blocksize(sb, 4096);
-	fs_info->running_transaction = NULL;
-	fs_info->last_trans_committed = 0;
 	fs_info->tree_root = tree_root;
 	fs_info->extent_root = extent_root;
 	fs_info->chunk_root = chunk_root;
@@ -770,11 +768,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	INIT_LIST_HEAD(&fs_info->space_info);
 	btrfs_mapping_init(&fs_info->mapping_tree);
 	fs_info->sb = sb;
-	fs_info->throttles = 0;
-	fs_info->mount_opt = 0;
 	fs_info->max_extent = (u64)-1;
 	fs_info->max_inline = 8192 * 1024;
-	fs_info->delalloc_bytes = 0;
 	setup_bdi(fs_info, &fs_info->bdi);
 	fs_info->btree_inode = new_inode(sb);
 	fs_info->btree_inode->i_ino = 1;
@@ -802,12 +797,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	extent_io_tree_init(&fs_info->extent_ins,
 			     fs_info->btree_inode->i_mapping, GFP_NOFS);
 	fs_info->do_barriers = 1;
-	fs_info->closing = 0;
-	fs_info->total_pinned = 0;
-	fs_info->last_alloc = 0;
-	fs_info->last_data_alloc = 0;
-	fs_info->extra_alloc_bits = 0;
-	fs_info->extra_data_alloc_bits = 0;
 
 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
 	INIT_WORK(&fs_info->trans_work, btrfs_transaction_cleaner, fs_info);
@@ -923,6 +912,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	btrfs_read_block_groups(extent_root);
 
 	fs_info->generation = btrfs_super_generation(disk_super) + 1;
+	if (btrfs_super_num_devices(disk_super) > 0) {
+		fs_info->data_alloc_profile = BTRFS_BLOCK_GROUP_RAID0;
+		fs_info->metadata_alloc_profile = BTRFS_BLOCK_GROUP_RAID1;
+		fs_info->system_alloc_profile = BTRFS_BLOCK_GROUP_RAID0;
+	}
 	mutex_unlock(&fs_info->fs_mutex);
 	return tree_root;
 
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index a34c289aec21..4ab98d8b73fa 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -230,9 +230,13 @@ again:
 			goto new_group;
 		if (start + num  > total_fs_bytes)
 			goto new_group;
+		if (!block_group_bits(cache, data)) {
+			printk("block group bits don't match %Lu %Lu\n", cache->flags, data);
+		}
 		*start_ret = start;
 		return 0;
-	} out:
+	}
+out:
 	cache = btrfs_lookup_block_group(root->fs_info, search_start);
 	if (!cache) {
 		printk("Unable to find block group for %Lu\n", search_start);
@@ -365,14 +369,17 @@ again:
 		if (cache->key.objectid > total_fs_bytes)
 			break;
 
-		if (full_search)
-			free_check = cache->key.offset;
-		else
-			free_check = div_factor(cache->key.offset, factor);
+		if (block_group_bits(cache, data)) {
+			if (full_search)
+				free_check = cache->key.offset;
+			else
+				free_check = div_factor(cache->key.offset,
+							factor);
 
-		if (used + cache->pinned < free_check) {
-			found_group = cache;
-			goto found;
+			if (used + cache->pinned < free_check) {
+				found_group = cache;
+				goto found;
+			}
 		}
 		cond_resched();
 	}
@@ -1038,6 +1045,19 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
 	return 0;
 }
 
+static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
+{
+	u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 |
+				   BTRFS_BLOCK_GROUP_RAID1);
+	if (extra_flags) {
+		if (flags & BTRFS_BLOCK_GROUP_DATA)
+			fs_info->avail_data_alloc_bits |= extra_flags;
+		if (flags & BTRFS_BLOCK_GROUP_METADATA)
+			fs_info->avail_metadata_alloc_bits |= extra_flags;
+		if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
+			fs_info->avail_system_alloc_bits |= extra_flags;
+	}
+}
 
 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *extent_root, u64 alloc_bytes,
@@ -1060,7 +1080,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
 	if (space_info->full)
 		return 0;
 
-	thresh = div_factor(space_info->total_bytes, 7);
+	thresh = div_factor(space_info->total_bytes, 6);
 	if ((space_info->bytes_used + space_info->bytes_pinned + alloc_bytes) <
 	    thresh)
 		return 0;
@@ -1079,16 +1099,7 @@ printk("space info full %Lu\n", flags);
 		     start, num_bytes);
 	BUG_ON(ret);
 
-	if (flags & BTRFS_BLOCK_GROUP_RAID0) {
-		if (flags & BTRFS_BLOCK_GROUP_DATA) {
-			extent_root->fs_info->extra_data_alloc_bits =
-				BTRFS_BLOCK_GROUP_RAID0;
-		}
-		if (flags & BTRFS_BLOCK_GROUP_METADATA) {
-			extent_root->fs_info->extra_alloc_bits =
-				BTRFS_BLOCK_GROUP_RAID0;
-		}
-	}
+	set_avail_alloc_bits(extent_root->fs_info, flags);
 	return 0;
 }
 
@@ -1529,6 +1540,7 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans,
 
 	if (data & BTRFS_BLOCK_GROUP_METADATA) {
 		last_ptr = &root->fs_info->last_alloc;
+		empty_cluster = 256 * 1024;
 	}
 
 	if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) {
@@ -1693,6 +1705,7 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 	u64 root_used;
 	u64 search_start = 0;
 	u64 new_hint;
+	u64 alloc_profile;
 	u32 sizes[2];
 	struct btrfs_fs_info *info = root->fs_info;
 	struct btrfs_root *extent_root = info->extent_root;
@@ -1700,31 +1713,32 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 	struct btrfs_extent_ref *ref;
 	struct btrfs_path *path;
 	struct btrfs_key keys[2];
-	int extra_chunk_alloc_bits = 0;
 
 	if (data) {
-		data = BTRFS_BLOCK_GROUP_DATA | info->extra_data_alloc_bits;
+		alloc_profile = info->avail_data_alloc_bits &
+			        info->data_alloc_profile;
+		data = BTRFS_BLOCK_GROUP_DATA | alloc_profile;
 	} else if (root == root->fs_info->chunk_root) {
-		data = BTRFS_BLOCK_GROUP_SYSTEM;
+		alloc_profile = info->avail_system_alloc_bits &
+			        info->system_alloc_profile;
+		data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile;
 	} else {
-		data = BTRFS_BLOCK_GROUP_METADATA | info->extra_alloc_bits;
+		alloc_profile = info->avail_metadata_alloc_bits &
+			        info->metadata_alloc_profile;
+		data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
 	}
-	if (btrfs_super_num_devices(&info->super_copy) > 1 &&
-	    !(data & BTRFS_BLOCK_GROUP_SYSTEM))
-		extra_chunk_alloc_bits = BTRFS_BLOCK_GROUP_RAID0;
 
 	if (root->ref_cows) {
 		if (!(data & BTRFS_BLOCK_GROUP_METADATA)) {
 			ret = do_chunk_alloc(trans, root->fs_info->extent_root,
 					     2 * 1024 * 1024,
 					     BTRFS_BLOCK_GROUP_METADATA |
-					     info->extra_alloc_bits |
-					     extra_chunk_alloc_bits);
+					     (info->metadata_alloc_profile &
+					      info->avail_metadata_alloc_bits));
 			BUG_ON(ret);
 		}
 		ret = do_chunk_alloc(trans, root->fs_info->extent_root,
-				     num_bytes + 2 * 1024 * 1024, data |
-				     extra_chunk_alloc_bits);
+				     num_bytes + 2 * 1024 * 1024, data);
 		BUG_ON(ret);
 	}
 
@@ -2046,12 +2060,12 @@ static int noinline walk_down_tree(struct btrfs_trans_handle *trans,
 		if (!next || !btrfs_buffer_uptodate(next)) {
 			free_extent_buffer(next);
 			reada_walk_down(root, cur, path->slots[*level]);
+
+			mutex_unlock(&root->fs_info->fs_mutex);
 			next = read_tree_block(root, bytenr, blocksize);
+			mutex_lock(&root->fs_info->fs_mutex);
 
-			/* we used to drop the lock above, keep the
-			 * code to double check so that we won't forget
-			 * when we drop the lock again in the future
-			 */
+			/* we've dropped the lock, double check */
 			ret = lookup_extent_ref(trans, root, bytenr,
 						blocksize, &refs);
 			BUG_ON(ret);
@@ -2739,16 +2753,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 		} else if (cache->flags & BTRFS_BLOCK_GROUP_METADATA) {
 			bit = BLOCK_GROUP_METADATA;
 		}
-		if (cache->flags & BTRFS_BLOCK_GROUP_RAID0) {
-			if (cache->flags & BTRFS_BLOCK_GROUP_DATA) {
-				info->extra_data_alloc_bits =
-					BTRFS_BLOCK_GROUP_RAID0;
-			}
-			if (cache->flags & BTRFS_BLOCK_GROUP_METADATA) {
-				info->extra_alloc_bits =
-					BTRFS_BLOCK_GROUP_RAID0;
-			}
-		}
+		set_avail_alloc_bits(info, cache->flags);
 
 		ret = update_space_info(info, cache->flags, found_key.offset,
 					btrfs_block_group_used(&cache->item),
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 0fa7cf227f1a..a8ae68c6fbb8 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -306,6 +306,7 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
 	u64 physical;
 	u64 length = 0;
 	u64 map_length;
+	int total_devs;
 	struct bio_vec *bvec;
 	int i;
 	int ret;
@@ -315,7 +316,8 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
 	}
 	map_tree = &root->fs_info->mapping_tree;
 	map_length = length;
-	ret = btrfs_map_block(map_tree, logical, &physical, &map_length, &dev);
+	ret = btrfs_map_block(map_tree, READ, 0, logical, &physical,
+			      &map_length, &dev, &total_devs);
 	if (map_length < length + size) {
 		return 1;
 	}
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 169be0f7285b..bc3c0b97588e 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -31,6 +31,13 @@ struct stripe {
 	u64 physical;
 };
 
+struct multi_bio {
+	atomic_t stripes;
+	bio_end_io_t *end_io;
+	void *private;
+	int error;
+};
+
 struct map_lookup {
 	u64 type;
 	int io_align;
@@ -632,12 +639,12 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	if (list_empty(dev_list))
 		return -ENOSPC;
 
-	if (type & BTRFS_BLOCK_GROUP_RAID0)
+	if (type & (BTRFS_BLOCK_GROUP_RAID0))
 		num_stripes = btrfs_super_num_devices(&info->super_copy);
-	if (type & BTRFS_BLOCK_GROUP_DATA)
-		stripe_len = 64 * 1024;
-	if (type & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM))
-		stripe_len = 32 * 1024;
+	if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
+		num_stripes = min_t(u64, 2,
+				  btrfs_super_num_devices(&info->super_copy));
+	}
 again:
 	INIT_LIST_HEAD(&private_devs);
 	cur = dev_list->next;
@@ -682,7 +689,11 @@ again:
 
 	stripes = &chunk->stripe;
 
-	*num_bytes = calc_size * num_stripes;
+	if (type & BTRFS_BLOCK_GROUP_RAID1)
+		*num_bytes = calc_size;
+	else
+		*num_bytes = calc_size * num_stripes;
+
 	index = 0;
 	while(index < num_stripes) {
 		BUG_ON(list_empty(&private_devs));
@@ -694,7 +705,7 @@ again:
 					     key.objectid,
 					     calc_size, &dev_offset);
 		BUG_ON(ret);
-printk("alloc chunk size %Lu from dev %Lu\n", calc_size, device->devid);
+printk("alloc chunk start %Lu size %Lu from dev %Lu type %Lu\n", key.objectid, calc_size, device->devid, type);
 		device->bytes_used += calc_size;
 		ret = btrfs_update_device(trans, device);
 		BUG_ON(ret);
@@ -774,9 +785,9 @@ void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
 	}
 }
 
-int btrfs_map_block(struct btrfs_mapping_tree *map_tree,
-		    u64 logical, u64 *phys, u64 *length,
-		    struct btrfs_device **dev)
+int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
+		    int dev_nr, u64 logical, u64 *phys, u64 *length,
+		    struct btrfs_device **dev, int *total_devs)
 {
 	struct extent_map *em;
 	struct map_lookup *map;
@@ -808,19 +819,39 @@ int btrfs_map_block(struct btrfs_mapping_tree *map_tree,
 	/* stripe_offset is the offset of this block in its stripe*/
 	stripe_offset = offset - stripe_offset;
 
-	/*
-	 * after this do_div call, stripe_nr is the number of stripes
-	 * on this device we have to walk to find the data, and
-	 * stripe_index is the number of our device in the stripe array
-	 */
-	stripe_index = do_div(stripe_nr, map->num_stripes);
-
+	if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
+		stripe_index = dev_nr;
+		if (rw & (1 << BIO_RW))
+			*total_devs = map->num_stripes;
+		else {
+			int i;
+			u64 least = (u64)-1;
+			struct btrfs_device *cur;
+
+			for (i = 0; i < map->num_stripes; i++) {
+				cur = map->stripes[i].dev;
+				spin_lock(&cur->io_lock);
+				if (cur->total_ios < least) {
+					least = cur->total_ios;
+					stripe_index = i;
+				}
+				spin_unlock(&cur->io_lock);
+			}
+			*total_devs = 1;
+		}
+	} else {
+		/*
+		 * after this do_div call, stripe_nr is the number of stripes
+		 * on this device we have to walk to find the data, and
+		 * stripe_index is the number of our device in the stripe array
+		 */
+		stripe_index = do_div(stripe_nr, map->num_stripes);
+	}
 	BUG_ON(stripe_index >= map->num_stripes);
-
 	*phys = map->stripes[stripe_index].physical + stripe_offset +
 		stripe_nr * map->stripe_len;
 
-	if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
+	if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1)) {
 		/* we limit the length of each bio to what fits in a stripe */
 		*length = min_t(u64, em->len - offset,
 			      map->stripe_len - stripe_offset);
@@ -833,33 +864,98 @@ int btrfs_map_block(struct btrfs_mapping_tree *map_tree,
 	return 0;
 }
 
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
+static void end_bio_multi_stripe(struct bio *bio, int err)
+#else
+static int end_bio_multi_stripe(struct bio *bio,
+				   unsigned int bytes_done, int err)
+#endif
+{
+	struct multi_bio *multi = bio->bi_private;
+
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
+	if (bio->bi_size)
+		return 1;
+#endif
+	if (err)
+		multi->error = err;
+
+	if (atomic_dec_and_test(&multi->stripes)) {
+		bio->bi_private = multi->private;
+		bio->bi_end_io = multi->end_io;
+
+		if (!err && multi->error)
+			err = multi->error;
+		kfree(multi);
+
+		bio_endio(bio, err);
+	} else {
+		bio_put(bio);
+	}
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
+	return 0;
+#endif
+}
+
 int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio)
 {
 	struct btrfs_mapping_tree *map_tree;
 	struct btrfs_device *dev;
+	struct bio *first_bio = bio;
 	u64 logical = bio->bi_sector << 9;
 	u64 physical;
 	u64 length = 0;
 	u64 map_length;
 	struct bio_vec *bvec;
+	struct multi_bio *multi = NULL;
 	int i;
 	int ret;
+	int dev_nr = 0;
+	int total_devs = 1;
 
 	bio_for_each_segment(bvec, bio, i) {
 		length += bvec->bv_len;
 	}
+
 	map_tree = &root->fs_info->mapping_tree;
 	map_length = length;
-	ret = btrfs_map_block(map_tree, logical, &physical, &map_length, &dev);
-	if (map_length < length) {
-		printk("mapping failed logical %Lu bio len %Lu physical %Lu "
-		       "len %Lu\n", logical, length, physical, map_length);
-		BUG();
+	while(dev_nr < total_devs) {
+		ret = btrfs_map_block(map_tree, rw, dev_nr, logical,
+				      &physical, &map_length, &dev,
+				      &total_devs);
+		if (map_length < length) {
+			printk("mapping failed logical %Lu bio len %Lu physical %Lu "
+			       "len %Lu\n", logical, length, physical, map_length);
+			BUG();
+		}
+		BUG_ON(map_length < length);
+		if (total_devs > 1) {
+			if (!multi) {
+				multi = kmalloc(sizeof(*multi), GFP_NOFS);
+				atomic_set(&multi->stripes, 1);
+				multi->end_io = bio->bi_end_io;
+				multi->private = first_bio->bi_private;
+				multi->error = 0;
+			} else {
+				atomic_inc(&multi->stripes);
+			}
+			if (dev_nr < total_devs - 1) {
+				bio = bio_clone(first_bio, GFP_NOFS);
+				BUG_ON(!bio);
+			} else {
+				bio = first_bio;
+			}
+			bio->bi_private = multi;
+			bio->bi_end_io = end_bio_multi_stripe;
+		}
+		bio->bi_sector = physical >> 9;
+		bio->bi_bdev = dev->bdev;
+		spin_lock(&dev->io_lock);
+		dev->total_ios++;
+		spin_unlock(&dev->io_lock);
+		submit_bio(rw, bio);
+		dev_nr++;
 	}
-	BUG_ON(map_length < length);
-	bio->bi_sector = physical >> 9;
-	bio->bi_bdev = dev->bdev;
-	submit_bio(rw, bio);
 	return 0;
 }
 
@@ -982,6 +1078,8 @@ static int read_one_dev(struct btrfs_root *root,
 			return -ENOMEM;
 		list_add(&device->dev_list,
 			 &root->fs_info->fs_devices->devices);
+		device->total_ios = 0;
+		spin_lock_init(&device->io_lock);
 	}
 
 	fill_device_from_item(leaf, dev_item, device);
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 12f297eb0559..35dec3efd78c 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -18,12 +18,16 @@
 
 #ifndef __BTRFS_VOLUMES_
 #define __BTRFS_VOLUMES_
+
 struct btrfs_device {
 	struct list_head dev_list;
 	struct btrfs_root *dev_root;
+	spinlock_t io_lock;
 
 	struct block_device *bdev;
 
+	u64 total_ios;
+
 	char *name;
 
 	/* the internal btrfs device id */
@@ -68,9 +72,9 @@ struct btrfs_fs_devices {
 int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
 			   struct btrfs_device *device,
 			   u64 owner, u64 num_bytes, u64 *start);
-int btrfs_map_block(struct btrfs_mapping_tree *map_tree,
+int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, int stripe_nr,
 		    u64 logical, u64 *phys, u64 *length,
-		    struct btrfs_device **dev);
+		    struct btrfs_device **dev, int *total_stripes);
 int btrfs_read_sys_array(struct btrfs_root *root);
 int btrfs_read_chunk_tree(struct btrfs_root *root);
 int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
@@ -80,9 +84,6 @@ void btrfs_mapping_init(struct btrfs_mapping_tree *tree);
 void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree);
 int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio);
 int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf);
-int btrfs_map_block(struct btrfs_mapping_tree *map_tree,
-		    u64 logical, u64 *phys, u64 *length,
-		    struct btrfs_device **dev);
 int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 		       int flags, void *holder);
 int btrfs_scan_one_device(const char *path, int flags, void *holder,
-- 
cgit v1.2.3


From 611f0e00a27fe0e5a571194a12443ecdc99a43ef Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 3 Apr 2008 16:29:03 -0400
Subject: Btrfs: Add support for duplicate blocks on a single spindle

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       |  1 +
 fs/btrfs/disk-io.c     |  8 +++++---
 fs/btrfs/extent-tree.c |  5 +++--
 fs/btrfs/volumes.c     | 32 ++++++++++++++++++++++++++++----
 4 files changed, 37 insertions(+), 9 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 0a207861472e..72deae63ec28 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -419,6 +419,7 @@ struct btrfs_csum_item {
 #define BTRFS_BLOCK_GROUP_METADATA (1 << 2)
 #define BTRFS_BLOCK_GROUP_RAID0    (1 << 3)
 #define BTRFS_BLOCK_GROUP_RAID1    (1 << 4)
+#define BTRFS_BLOCK_GROUP_DUP	   (1 << 5)
 
 
 struct btrfs_block_group_item {
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index ff75ad586767..42522232fde4 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -913,9 +913,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 
 	fs_info->generation = btrfs_super_generation(disk_super) + 1;
 	if (btrfs_super_num_devices(disk_super) > 0) {
-		fs_info->data_alloc_profile = BTRFS_BLOCK_GROUP_RAID0;
-		fs_info->metadata_alloc_profile = BTRFS_BLOCK_GROUP_RAID1;
-		fs_info->system_alloc_profile = BTRFS_BLOCK_GROUP_RAID0;
+		fs_info->data_alloc_profile = BTRFS_BLOCK_GROUP_RAID0 |
+			BTRFS_BLOCK_GROUP_RAID1;
+		fs_info->metadata_alloc_profile = BTRFS_BLOCK_GROUP_RAID1 |
+			BTRFS_BLOCK_GROUP_DUP;
+		fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
 	}
 	mutex_unlock(&fs_info->fs_mutex);
 	return tree_root;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 4ab98d8b73fa..1885ec4280c8 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -231,7 +231,7 @@ again:
 		if (start + num  > total_fs_bytes)
 			goto new_group;
 		if (!block_group_bits(cache, data)) {
-			printk("block group bits don't match %Lu %Lu\n", cache->flags, data);
+			printk("block group bits don't match %Lu %d\n", cache->flags, data);
 		}
 		*start_ret = start;
 		return 0;
@@ -1048,7 +1048,8 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
 {
 	u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 |
-				   BTRFS_BLOCK_GROUP_RAID1);
+				   BTRFS_BLOCK_GROUP_RAID1 |
+				   BTRFS_BLOCK_GROUP_DUP);
 	if (extra_flags) {
 		if (flags & BTRFS_BLOCK_GROUP_DATA)
 			fs_info->avail_data_alloc_bits |= extra_flags;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index bc3c0b97588e..b9294e3c05f0 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -627,6 +627,7 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	struct extent_map *em;
 	u64 physical;
 	u64 calc_size = 1024 * 1024 * 1024;
+	u64 min_free = calc_size;
 	u64 avail;
 	u64 max_avail = 0;
 	int num_stripes = 1;
@@ -641,6 +642,8 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 
 	if (type & (BTRFS_BLOCK_GROUP_RAID0))
 		num_stripes = btrfs_super_num_devices(&info->super_copy);
+	if (type & (BTRFS_BLOCK_GROUP_DUP))
+		num_stripes = 2;
 	if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
 		num_stripes = min_t(u64, 2,
 				  btrfs_super_num_devices(&info->super_copy));
@@ -649,16 +652,23 @@ again:
 	INIT_LIST_HEAD(&private_devs);
 	cur = dev_list->next;
 	index = 0;
+
+	if (type & BTRFS_BLOCK_GROUP_DUP)
+		min_free = calc_size * 2;
+
 	/* build a private list of devices we will allocate from */
 	while(index < num_stripes) {
 		device = list_entry(cur, struct btrfs_device, dev_list);
+
 		avail = device->total_bytes - device->bytes_used;
 		cur = cur->next;
 		if (avail > max_avail)
 			max_avail = avail;
-		if (avail >= calc_size) {
+		if (avail >= min_free) {
 			list_move_tail(&device->dev_list, &private_devs);
 			index++;
+			if (type & BTRFS_BLOCK_GROUP_DUP)
+				index++;
 		}
 		if (cur == dev_list)
 			break;
@@ -689,17 +699,22 @@ again:
 
 	stripes = &chunk->stripe;
 
-	if (type & BTRFS_BLOCK_GROUP_RAID1)
+	if (type & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_DUP))
 		*num_bytes = calc_size;
 	else
 		*num_bytes = calc_size * num_stripes;
 
 	index = 0;
+printk("new chunk type %Lu start %Lu size %Lu\n", type, key.objectid, *num_bytes);
 	while(index < num_stripes) {
 		BUG_ON(list_empty(&private_devs));
 		cur = private_devs.next;
 		device = list_entry(cur, struct btrfs_device, dev_list);
-		list_move_tail(&device->dev_list, dev_list);
+
+		/* loop over this device again if we're doing a dup group */
+		if (!(type & BTRFS_BLOCK_GROUP_DUP) ||
+		    (index == num_stripes - 1))
+			list_move_tail(&device->dev_list, dev_list);
 
 		ret = btrfs_alloc_dev_extent(trans, device,
 					     key.objectid,
@@ -839,6 +854,14 @@ int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 			}
 			*total_devs = 1;
 		}
+	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
+		if (rw == WRITE) {
+			*total_devs = map->num_stripes;
+			stripe_index = dev_nr;
+		} else {
+			stripe_index = 0;
+			*total_devs = 1;
+		}
 	} else {
 		/*
 		 * after this do_div call, stripe_nr is the number of stripes
@@ -851,7 +874,8 @@ int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 	*phys = map->stripes[stripe_index].physical + stripe_offset +
 		stripe_nr * map->stripe_len;
 
-	if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1)) {
+	if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
+			 BTRFS_BLOCK_GROUP_DUP)) {
 		/* we limit the length of each bio to what fits in a stripe */
 		*length = min_t(u64, em->len - offset,
 			      map->stripe_len - stripe_offset);
-- 
cgit v1.2.3


From cea9e4452ebaf18dd0951e90dc84d82a5dee40b2 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 9 Apr 2008 16:28:12 -0400
Subject: Change btrfs_map_block to return a structure with mappings for all
 stripes

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c   |   4 +-
 fs/btrfs/inode.c   |   8 ++--
 fs/btrfs/volumes.c | 135 +++++++++++++++++++++++++++++------------------------
 fs/btrfs/volumes.h |  25 ++++++++--
 4 files changed, 103 insertions(+), 69 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 3c7891cf0d72..c92c6b0ee582 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -2025,8 +2025,10 @@ again:
 					 root->root_key.objectid,
 					 root_gen, disk_key.objectid, 0,
 					 l->start, 0);
-	if (IS_ERR(right))
+	if (IS_ERR(right)) {
+		BUG_ON(1);
 		return PTR_ERR(right);
+	}
 
 	memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
 	btrfs_set_header_bytenr(right, right->start);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index a8ae68c6fbb8..7ae677d8a6de 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -301,12 +301,9 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
 {
 	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
 	struct btrfs_mapping_tree *map_tree;
-	struct btrfs_device *dev;
 	u64 logical = bio->bi_sector << 9;
-	u64 physical;
 	u64 length = 0;
 	u64 map_length;
-	int total_devs;
 	struct bio_vec *bvec;
 	int i;
 	int ret;
@@ -316,8 +313,9 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
 	}
 	map_tree = &root->fs_info->mapping_tree;
 	map_length = length;
-	ret = btrfs_map_block(map_tree, READ, 0, logical, &physical,
-			      &map_length, &dev, &total_devs);
+	ret = btrfs_map_block(map_tree, READ, logical,
+			      &map_length, NULL);
+
 	if (map_length < length + size) {
 		return 1;
 	}
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index b9294e3c05f0..008d3640e8c2 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -26,18 +26,6 @@
 #include "print-tree.h"
 #include "volumes.h"
 
-struct stripe {
-	struct btrfs_device *dev;
-	u64 physical;
-};
-
-struct multi_bio {
-	atomic_t stripes;
-	bio_end_io_t *end_io;
-	void *private;
-	int error;
-};
-
 struct map_lookup {
 	u64 type;
 	int io_align;
@@ -45,11 +33,11 @@ struct map_lookup {
 	int stripe_len;
 	int sector_size;
 	int num_stripes;
-	struct stripe stripes[];
+	struct btrfs_bio_stripe stripes[];
 };
 
 #define map_lookup_size(n) (sizeof(struct map_lookup) + \
-			    (sizeof(struct stripe) * (n)))
+			    (sizeof(struct btrfs_bio_stripe) * (n)))
 
 static DEFINE_MUTEX(uuid_mutex);
 static LIST_HEAD(fs_uuids);
@@ -801,8 +789,8 @@ void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
 }
 
 int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
-		    int dev_nr, u64 logical, u64 *phys, u64 *length,
-		    struct btrfs_device **dev, int *total_devs)
+		    u64 logical, u64 *length,
+		    struct btrfs_multi_bio **multi_ret)
 {
 	struct extent_map *em;
 	struct map_lookup *map;
@@ -810,8 +798,21 @@ int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 	u64 offset;
 	u64 stripe_offset;
 	u64 stripe_nr;
+	int stripes_allocated = 8;
 	int stripe_index;
+	int i;
+	struct btrfs_multi_bio *multi = NULL;
 
+	if (multi_ret && !(rw & (1 << BIO_RW))) {
+		stripes_allocated = 1;
+	}
+again:
+	if (multi_ret) {
+		multi = kzalloc(btrfs_multi_bio_size(stripes_allocated),
+				GFP_NOFS);
+		if (!multi)
+			return -ENOMEM;
+	}
 
 	spin_lock(&em_tree->lock);
 	em = lookup_extent_mapping(em_tree, logical, *length);
@@ -821,6 +822,17 @@ int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 	map = (struct map_lookup *)em->bdev;
 	offset = logical - em->start;
 
+	/* if our multi bio struct is too small, back off and try again */
+	if (multi_ret && (rw & (1 << BIO_RW)) &&
+	    stripes_allocated < map->num_stripes &&
+	    ((map->type & BTRFS_BLOCK_GROUP_RAID1) ||
+	     (map->type & BTRFS_BLOCK_GROUP_DUP))) {
+		stripes_allocated = map->num_stripes;
+		spin_unlock(&em_tree->lock);
+		free_extent_map(em);
+		kfree(multi);
+		goto again;
+	}
 	stripe_nr = offset;
 	/*
 	 * stripe_nr counts the total number of stripes we have to stride
@@ -834,10 +846,22 @@ int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 	/* stripe_offset is the offset of this block in its stripe*/
 	stripe_offset = offset - stripe_offset;
 
+	if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
+			 BTRFS_BLOCK_GROUP_DUP)) {
+		/* we limit the length of each bio to what fits in a stripe */
+		*length = min_t(u64, em->len - offset,
+			      map->stripe_len - stripe_offset);
+	} else {
+		*length = em->len - offset;
+	}
+	if (!multi_ret)
+		goto out;
+
+	multi->num_stripes = 1;
+	stripe_index = 0;
 	if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
-		stripe_index = dev_nr;
 		if (rw & (1 << BIO_RW))
-			*total_devs = map->num_stripes;
+			multi->num_stripes = map->num_stripes;
 		else {
 			int i;
 			u64 least = (u64)-1;
@@ -852,16 +876,10 @@ int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 				}
 				spin_unlock(&cur->io_lock);
 			}
-			*total_devs = 1;
 		}
 	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
-		if (rw == WRITE) {
-			*total_devs = map->num_stripes;
-			stripe_index = dev_nr;
-		} else {
-			stripe_index = 0;
-			*total_devs = 1;
-		}
+		if (rw & (1 << BIO_RW))
+			multi->num_stripes = map->num_stripes;
 	} else {
 		/*
 		 * after this do_div call, stripe_nr is the number of stripes
@@ -871,18 +889,17 @@ int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 		stripe_index = do_div(stripe_nr, map->num_stripes);
 	}
 	BUG_ON(stripe_index >= map->num_stripes);
-	*phys = map->stripes[stripe_index].physical + stripe_offset +
-		stripe_nr * map->stripe_len;
-
-	if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
-			 BTRFS_BLOCK_GROUP_DUP)) {
-		/* we limit the length of each bio to what fits in a stripe */
-		*length = min_t(u64, em->len - offset,
-			      map->stripe_len - stripe_offset);
-	} else {
-		*length = em->len - offset;
+	BUG_ON(stripe_index != 0 && multi->num_stripes > 1);
+
+	for (i = 0; i < multi->num_stripes; i++) {
+		multi->stripes[i].physical =
+			map->stripes[stripe_index].physical + stripe_offset +
+			stripe_nr * map->stripe_len;
+		multi->stripes[i].dev = map->stripes[stripe_index].dev;
+		stripe_index++;
 	}
-	*dev = map->stripes[stripe_index].dev;
+	*multi_ret = multi;
+out:
 	free_extent_map(em);
 	spin_unlock(&em_tree->lock);
 	return 0;
@@ -895,7 +912,7 @@ static int end_bio_multi_stripe(struct bio *bio,
 				   unsigned int bytes_done, int err)
 #endif
 {
-	struct multi_bio *multi = bio->bi_private;
+	struct btrfs_multi_bio *multi = bio->bi_private;
 
 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
 	if (bio->bi_size)
@@ -904,7 +921,7 @@ static int end_bio_multi_stripe(struct bio *bio,
 	if (err)
 		multi->error = err;
 
-	if (atomic_dec_and_test(&multi->stripes)) {
+	if (atomic_dec_and_test(&multi->stripes_pending)) {
 		bio->bi_private = multi->private;
 		bio->bi_end_io = multi->end_io;
 
@@ -927,11 +944,10 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio)
 	struct btrfs_device *dev;
 	struct bio *first_bio = bio;
 	u64 logical = bio->bi_sector << 9;
-	u64 physical;
 	u64 length = 0;
 	u64 map_length;
 	struct bio_vec *bvec;
-	struct multi_bio *multi = NULL;
+	struct btrfs_multi_bio *multi = NULL;
 	int i;
 	int ret;
 	int dev_nr = 0;
@@ -943,26 +959,22 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio)
 
 	map_tree = &root->fs_info->mapping_tree;
 	map_length = length;
+
+	ret = btrfs_map_block(map_tree, rw, logical, &map_length, &multi);
+	BUG_ON(ret);
+
+	total_devs = multi->num_stripes;
+	if (map_length < length) {
+		printk("mapping failed logical %Lu bio len %Lu "
+		       "len %Lu\n", logical, length, map_length);
+		BUG();
+	}
+	multi->end_io = first_bio->bi_end_io;
+	multi->private = first_bio->bi_private;
+	atomic_set(&multi->stripes_pending, multi->num_stripes);
+
 	while(dev_nr < total_devs) {
-		ret = btrfs_map_block(map_tree, rw, dev_nr, logical,
-				      &physical, &map_length, &dev,
-				      &total_devs);
-		if (map_length < length) {
-			printk("mapping failed logical %Lu bio len %Lu physical %Lu "
-			       "len %Lu\n", logical, length, physical, map_length);
-			BUG();
-		}
-		BUG_ON(map_length < length);
 		if (total_devs > 1) {
-			if (!multi) {
-				multi = kmalloc(sizeof(*multi), GFP_NOFS);
-				atomic_set(&multi->stripes, 1);
-				multi->end_io = bio->bi_end_io;
-				multi->private = first_bio->bi_private;
-				multi->error = 0;
-			} else {
-				atomic_inc(&multi->stripes);
-			}
 			if (dev_nr < total_devs - 1) {
 				bio = bio_clone(first_bio, GFP_NOFS);
 				BUG_ON(!bio);
@@ -972,7 +984,8 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio)
 			bio->bi_private = multi;
 			bio->bi_end_io = end_bio_multi_stripe;
 		}
-		bio->bi_sector = physical >> 9;
+		bio->bi_sector = multi->stripes[dev_nr].physical >> 9;
+		dev = multi->stripes[dev_nr].dev;
 		bio->bi_bdev = dev->bdev;
 		spin_lock(&dev->io_lock);
 		dev->total_ios++;
@@ -980,6 +993,8 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio)
 		submit_bio(rw, bio);
 		dev_nr++;
 	}
+	if (total_devs == 1)
+		kfree(multi);
 	return 0;
 }
 
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 35dec3efd78c..10ca0104750c 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -19,6 +19,8 @@
 #ifndef __BTRFS_VOLUMES_
 #define __BTRFS_VOLUMES_
 
+#include <linux/bio.h>
+
 struct btrfs_device {
 	struct list_head dev_list;
 	struct btrfs_root *dev_root;
@@ -69,12 +71,29 @@ struct btrfs_fs_devices {
 	struct list_head list;
 };
 
+struct btrfs_bio_stripe {	/* one per-device target of a mapped range */
+	struct btrfs_device *dev;	/* device the bio is sent to */
+	u64 physical;			/* byte offset on dev (>> 9 gives bi_sector) */
+};
+
+struct btrfs_multi_bio {
+	atomic_t stripes_pending;	/* stripe bios still in flight */
+	bio_end_io_t *end_io;		/* caller's bi_end_io, restored at the end */
+	void *private;			/* caller's bi_private, restored at the end */
+	int error;			/* set to err by end_bio_multi_stripe() */
+	int num_stripes;		/* entries filled in stripes[] */
+	struct btrfs_bio_stripe stripes[];	/* see btrfs_multi_bio_size() */
+};
+
+#define btrfs_multi_bio_size(n) (sizeof(struct btrfs_multi_bio) + \
+			    (sizeof(struct btrfs_bio_stripe) * (n))) /* header + n stripes */
+
 int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
 			   struct btrfs_device *device,
 			   u64 owner, u64 num_bytes, u64 *start);
-int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, int stripe_nr,
-		    u64 logical, u64 *phys, u64 *length,
-		    struct btrfs_device **dev, int *total_stripes);
+int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
+		    u64 logical, u64 *length,
+		    struct btrfs_multi_bio **multi_ret);
 int btrfs_read_sys_array(struct btrfs_root *root);
 int btrfs_read_chunk_tree(struct btrfs_root *root);
 int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
-- 
cgit v1.2.3


From f188591e987e21b6f7f8864c66a02858b95b530e Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 9 Apr 2008 16:28:12 -0400
Subject: Btrfs: Retry metadata reads in the face of checksum failures

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c     | 11 +++++++-
 fs/btrfs/disk-io.c   | 72 ++++++++++++++++++++++++++++++++++++++--------------
 fs/btrfs/extent_io.c | 50 +++++++++++++++++++++---------------
 fs/btrfs/extent_io.h |  5 ++--
 fs/btrfs/inode.c     |  7 ++---
 fs/btrfs/volumes.c   | 39 +++++++++++++++++++++++++---
 fs/btrfs/volumes.h   |  6 +++--
 7 files changed, 138 insertions(+), 52 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index efce173a935c..ff4e9c6859d1 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -569,7 +569,16 @@ static int check_leaf(struct btrfs_root *root, struct btrfs_path *path,
 static int noinline check_block(struct btrfs_root *root,
 				struct btrfs_path *path, int level)
 {
-	return 0;
+	u64 found_start;
+	if (btrfs_header_level(path->nodes[level]) != level)
+	    printk("warning: bad level %Lu wanted %d found %d\n",
+		   path->nodes[level]->start, level,
+		   btrfs_header_level(path->nodes[level]));
+	found_start = btrfs_header_bytenr(path->nodes[level]);
+	if (found_start != path->nodes[level]->start) {
+	    printk("warning: bad bytentr %Lu found %Lu\n",
+		   path->nodes[level]->start, found_start);
+	}
 #if 0
 	struct extent_buffer *buf = path->nodes[level];
 
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 94ff87d0eae4..59bdf0474be3 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -156,7 +156,6 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
 			memcpy(&found, result, BTRFS_CRC32_SIZE);
 
 			read_extent_buffer(buf, &val, 0, BTRFS_CRC32_SIZE);
-			WARN_ON(1);
 			printk("btrfs: %s checksum verify failed on %llu "
 			       "wanted %X found %X from_this_trans %d "
 			       "level %d\n",
@@ -171,6 +170,40 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
 	return 0;
 }
 
+/*
+ * Read a tree block, stepping to the next mirror after each failed read.
+ */
+static int btree_read_extent_buffer_pages(struct btrfs_root *root,
+					  struct extent_buffer *eb,
+					  u64 start)
+{
+	struct extent_io_tree *io_tree;
+	int ret;
+	int num_copies = 0;
+	int mirror_num = 0;	/* 0 leaves the mirror choice to the mapping code */
+
+	io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
+	while (1) {
+		ret = read_extent_buffer_pages(io_tree, eb, start, 1,
+					       btree_get_extent, mirror_num);
+		if (!ret)
+			break;
+		num_copies = btrfs_num_copies(&root->fs_info->mapping_tree,
+					      eb->start, eb->len);
+printk("failed to read %Lu mirror %d total %d\n", eb->start, mirror_num, num_copies);
+		if (num_copies == 1) {
+printk("reading %Lu failed only one copy\n", eb->start);
+			break;
+		}
+		if (++mirror_num > num_copies) {	/* every mirror was tried */
+printk("bailing at mirror %d of %d\n", mirror_num, num_copies);
+			break;
+		}
+	}
+	if (!ret && mirror_num)
+printk("good read %Lu mirror %d total %d\n", eb->start, mirror_num, num_copies);
+	return ret;	/* 0, or the error from the last attempt */
+}
 
 int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
 {
@@ -180,6 +213,8 @@ int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
 	int found_level;
 	unsigned long len;
 	struct extent_buffer *eb;
+	int ret;
+
 	tree = &BTRFS_I(page->mapping->host)->io_tree;
 
 	if (page->private == EXTENT_PAGE_PRIVATE)
@@ -191,8 +226,8 @@ int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
 		WARN_ON(1);
 	}
 	eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
-	read_extent_buffer_pages(tree, eb, start + PAGE_CACHE_SIZE, 1,
-				 btree_get_extent);
+	ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE);
+	BUG_ON(ret);
 	btrfs_clear_buffer_defrag(eb);
 	found_start = btrfs_header_bytenr(eb);
 	if (found_start != start) {
@@ -240,7 +275,7 @@ int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 	unsigned long len;
 	struct extent_buffer *eb;
 	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
-	int ret;
+	int ret = 0;
 
 	tree = &BTRFS_I(page->mapping->host)->io_tree;
 	if (page->private == EXTENT_PAGE_PRIVATE)
@@ -252,25 +287,26 @@ int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 		WARN_ON(1);
 	}
 	eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
-	read_extent_buffer_pages(tree, eb, start + PAGE_CACHE_SIZE, 1,
-				 btree_get_extent);
+
 	btrfs_clear_buffer_defrag(eb);
 	found_start = btrfs_header_bytenr(eb);
 	if (found_start != start) {
-		printk("warning: eb start incorrect %Lu buffer %Lu len %lu\n",
-		       start, found_start, len);
-		WARN_ON(1);
+printk("bad start on %Lu found %Lu\n", eb->start, found_start);
+		ret = -EIO;
 		goto err;
 	}
 	if (eb->first_page != page) {
 		printk("bad first page %lu %lu\n", eb->first_page->index,
 		       page->index);
 		WARN_ON(1);
+		ret = -EIO;
 		goto err;
 	}
 	found_level = btrfs_header_level(eb);
 
 	ret = csum_tree_block(root, eb, 1);
+	if (ret)
+		ret = -EIO;
 
 	end = min_t(u64, eb->len, PAGE_CACHE_SIZE);
 	end = eb->start + end - 1;
@@ -278,7 +314,7 @@ int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 err:
 	free_extent_buffer(eb);
 out:
-	return 0;
+	return ret;
 }
 
 #if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
@@ -329,7 +365,8 @@ int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
 	return 0;
 }
 
-static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio)
+static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
+				 int mirror_num)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	u64 offset;
@@ -338,7 +375,7 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio)
 	offset = bio->bi_sector << 9;
 
 	if (rw & (1 << BIO_RW)) {
-		return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio);
+		return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num);
 	}
 
 	ret = btrfs_bio_wq_end_io(root->fs_info, bio, 1);
@@ -349,7 +386,7 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio)
 		submit_bio(rw, bio);
 		return 0;
 	}
-	return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio);
+	return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num);
 }
 
 static int btree_writepage(struct page *page, struct writeback_control *wbc)
@@ -459,7 +496,7 @@ int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize)
 	if (!buf)
 		return 0;
 	read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
-				 buf, 0, 0, btree_get_extent);
+				 buf, 0, 0, btree_get_extent, 0);
 	free_extent_buffer(buf);
 	return ret;
 }
@@ -522,8 +559,7 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
 	if (!buf)
 		return NULL;
 
-	ret = read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, buf, 0,
-				       1, btree_get_extent);
+	ret = btree_read_extent_buffer_pages(root, buf, 0);
 
 	if (ret == 0) {
 		buf->flags |= EXTENT_UPTODATE;
@@ -1366,10 +1402,8 @@ int btrfs_clear_buffer_defrag(struct extent_buffer *buf)
 int btrfs_read_buffer(struct extent_buffer *buf)
 {
 	struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
-	struct inode *btree_inode = root->fs_info->btree_inode;
 	int ret;
-	ret = read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
-					buf, 0, 1, btree_get_extent);
+	ret = btree_read_extent_buffer_pages(root, buf, 0);
 	if (ret == 0) {
 		buf->flags |= EXTENT_UPTODATE;
 	}
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index cfc383c17a3a..2f159375c878 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1546,7 +1546,7 @@ static int end_bio_extent_readpage(struct bio *bio,
 				    !(state->state & EXTENT_LOCKED))
 					state = NULL;
 			}
-			if (!state) {
+			if (!state && uptodate) {
 				spin_unlock_irqrestore(&tree->lock, flags);
 				set_extent_uptodate(tree, start, end,
 						    GFP_ATOMIC);
@@ -1567,8 +1567,10 @@ static int end_bio_extent_readpage(struct bio *bio,
 			} else {
 				state = NULL;
 			}
-			set_state_cb(tree, clear, EXTENT_UPTODATE);
-			clear->state |= EXTENT_UPTODATE;
+			if (uptodate) {
+				set_state_cb(tree, clear, EXTENT_UPTODATE);
+				clear->state |= EXTENT_UPTODATE;
+			}
 			clear_state_bit(tree, clear, EXTENT_LOCKED,
 					1, 0);
 			if (cur == start)
@@ -1685,7 +1687,7 @@ extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
 	return bio;
 }
 
-static int submit_one_bio(int rw, struct bio *bio)
+static int submit_one_bio(int rw, struct bio *bio, int mirror_num)
 {
 	u64 maxsector;
 	int ret = 0;
@@ -1722,7 +1724,8 @@ static int submit_one_bio(int rw, struct bio *bio)
 		WARN_ON(1);
 	}
 	if (tree->ops && tree->ops->submit_bio_hook)
-		tree->ops->submit_bio_hook(page->mapping->host, rw, bio);
+		tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
+					   mirror_num);
 	else
 		submit_bio(rw, bio);
 	if (bio_flagged(bio, BIO_EOPNOTSUPP))
@@ -1737,7 +1740,8 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
 			      struct block_device *bdev,
 			      struct bio **bio_ret,
 			      unsigned long max_pages,
-			      bio_end_io_t end_io_func)
+			      bio_end_io_t end_io_func,
+			      int mirror_num)
 {
 	int ret = 0;
 	struct bio *bio;
@@ -1749,7 +1753,7 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
 		    (tree->ops && tree->ops->merge_bio_hook &&
 		     tree->ops->merge_bio_hook(page, offset, size, bio)) ||
 		    bio_add_page(bio, page, size, offset) < size) {
-			ret = submit_one_bio(rw, bio);
+			ret = submit_one_bio(rw, bio, mirror_num);
 			bio = NULL;
 		} else {
 			return 0;
@@ -1769,7 +1773,7 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
 	if (bio_ret) {
 		*bio_ret = bio;
 	} else {
-		ret = submit_one_bio(rw, bio);
+		ret = submit_one_bio(rw, bio, mirror_num);
 	}
 
 	return ret;
@@ -1798,7 +1802,7 @@ void set_page_extent_head(struct page *page, unsigned long len)
 static int __extent_read_full_page(struct extent_io_tree *tree,
 				   struct page *page,
 				   get_extent_t *get_extent,
-				   struct bio **bio)
+				   struct bio **bio, int mirror_num)
 {
 	struct inode *inode = page->mapping->host;
 	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
@@ -1901,7 +1905,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
 			ret = submit_extent_page(READ, tree, page,
 					 sector, iosize, page_offset,
 					 bdev, bio, nr,
-					 end_bio_extent_readpage);
+					 end_bio_extent_readpage, mirror_num);
 		}
 		if (ret)
 			SetPageError(page);
@@ -1923,9 +1927,9 @@ int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
 	struct bio *bio = NULL;
 	int ret;
 
-	ret = __extent_read_full_page(tree, page, get_extent, &bio);
+	ret = __extent_read_full_page(tree, page, get_extent, &bio, 0);
 	if (bio)
-		submit_one_bio(READ, bio);
+		submit_one_bio(READ, bio, 0);
 	return ret;
 }
 EXPORT_SYMBOL(extent_read_full_page);
@@ -2077,7 +2081,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 			ret = submit_extent_page(WRITE, tree, page, sector,
 						 iosize, page_offset, bdev,
 						 &epd->bio, max_nr,
-						 end_bio_extent_writepage);
+						 end_bio_extent_writepage, 0);
 			if (ret)
 				SetPageError(page);
 		}
@@ -2244,7 +2248,7 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
 
 	write_cache_pages(mapping, &wbc_writepages, __extent_writepage, &epd);
 	if (epd.bio) {
-		submit_one_bio(WRITE, epd.bio);
+		submit_one_bio(WRITE, epd.bio, 0);
 	}
 	return ret;
 }
@@ -2265,7 +2269,7 @@ int extent_writepages(struct extent_io_tree *tree,
 
 	ret = write_cache_pages(mapping, wbc, __extent_writepage, &epd);
 	if (epd.bio) {
-		submit_one_bio(WRITE, epd.bio);
+		submit_one_bio(WRITE, epd.bio, 0);
 	}
 	return ret;
 }
@@ -2297,7 +2301,8 @@ int extent_readpages(struct extent_io_tree *tree,
 			page_cache_get(page);
 			if (!pagevec_add(&pvec, page))
 				__pagevec_lru_add(&pvec);
-			__extent_read_full_page(tree, page, get_extent, &bio);
+			__extent_read_full_page(tree, page, get_extent,
+						&bio, 0);
 		}
 		page_cache_release(page);
 	}
@@ -2305,7 +2310,7 @@ int extent_readpages(struct extent_io_tree *tree,
 		__pagevec_lru_add(&pvec);
 	BUG_ON(!list_empty(pages));
 	if (bio)
-		submit_one_bio(READ, bio);
+		submit_one_bio(READ, bio, 0);
 	return 0;
 }
 EXPORT_SYMBOL(extent_readpages);
@@ -2430,7 +2435,7 @@ int extent_prepare_write(struct extent_io_tree *tree,
 			ret = submit_extent_page(READ, tree, page,
 					 sector, iosize, page_offset, em->bdev,
 					 NULL, 1,
-					 end_bio_extent_preparewrite);
+					 end_bio_extent_preparewrite, 0);
 			iocount++;
 			block_start = block_start + iosize;
 		} else {
@@ -2696,6 +2701,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
 		mark_page_accessed(page0);
 		set_page_extent_mapped(page0);
 		set_page_extent_head(page0, len);
+		uptodate = PageUptodate(page0);
 	} else {
 		i = 0;
 	}
@@ -3006,7 +3012,7 @@ EXPORT_SYMBOL(extent_buffer_uptodate);
 int read_extent_buffer_pages(struct extent_io_tree *tree,
 			     struct extent_buffer *eb,
 			     u64 start, int wait,
-			     get_extent_t *get_extent)
+			     get_extent_t *get_extent, int mirror_num)
 {
 	unsigned long i;
 	unsigned long start_i;
@@ -3062,8 +3068,10 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
 		if (!PageUptodate(page)) {
 			if (start_i == 0)
 				inc_all_pages = 1;
+			ClearPageError(page);
 			err = __extent_read_full_page(tree, page,
-						      get_extent, &bio);
+						      get_extent, &bio,
+						      mirror_num);
 			if (err) {
 				ret = err;
 			}
@@ -3073,7 +3081,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
 	}
 
 	if (bio)
-		submit_one_bio(READ, bio);
+		submit_one_bio(READ, bio, mirror_num);
 
 	if (ret || !wait) {
 		return ret;
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 9d2991d1d3ce..8d6b8a14cc30 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -27,7 +27,8 @@ struct extent_state;
 struct extent_io_ops {
 	int (*fill_delalloc)(struct inode *inode, u64 start, u64 end);
 	int (*writepage_io_hook)(struct page *page, u64 start, u64 end);
-	int (*submit_bio_hook)(struct inode *inode, int rw, struct bio *bio);
+	int (*submit_bio_hook)(struct inode *inode, int rw, struct bio *bio,
+			       int mirror_num);
 	int (*merge_bio_hook)(struct page *page, unsigned long offset,
 			      size_t size, struct bio *bio);
 	int (*readpage_io_hook)(struct page *page, u64 start, u64 end);
@@ -172,7 +173,7 @@ struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
 void free_extent_buffer(struct extent_buffer *eb);
 int read_extent_buffer_pages(struct extent_io_tree *tree,
 			     struct extent_buffer *eb, u64 start, int wait,
-			     get_extent_t *get_extent);
+			     get_extent_t *get_extent, int mirror_num);
 
 static inline void extent_buffer_get(struct extent_buffer *eb)
 {
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index e1ef1acdb350..8c2d5d036bd6 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -314,7 +314,7 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
 	map_tree = &root->fs_info->mapping_tree;
 	map_length = length;
 	ret = btrfs_map_block(map_tree, READ, logical,
-			      &map_length, NULL);
+			      &map_length, NULL, 0);
 
 	if (map_length < length + size) {
 		return 1;
@@ -322,7 +322,8 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
 	return 0;
 }
 
-int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio)
+int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
+			  int mirror_num)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_trans_handle *trans;
@@ -347,7 +348,7 @@ int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio)
 	BUG_ON(ret);
 	mutex_unlock(&root->fs_info->fs_mutex);
 mapit:
-	return btrfs_map_bio(root, rw, bio);
+	return btrfs_map_bio(root, rw, bio, mirror_num);
 }
 
 int btrfs_readpage_io_hook(struct page *page, u64 start, u64 end)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 008d3640e8c2..3b927f698320 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -788,9 +788,31 @@ void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
 	}
 }
 
+/*
+ * Count the copies of 'logical': num_stripes for DUP/RAID1, otherwise 1.
+ */
+int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
+{
+	struct extent_map_tree *tree = &map_tree->map_tree;
+	struct map_lookup *lookup;
+	struct extent_map *em;
+	int copies = 1;
+
+	spin_lock(&tree->lock);
+	em = lookup_extent_mapping(tree, logical, len);
+	BUG_ON(!em);
+	BUG_ON(em->start > logical || em->start + em->len < logical);
+	lookup = (struct map_lookup *)em->bdev;
+	if (lookup->type & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_DUP))
+		copies = lookup->num_stripes;
+	free_extent_map(em);
+	spin_unlock(&tree->lock);
+	return copies;
+}
+
 int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 		    u64 logical, u64 *length,
-		    struct btrfs_multi_bio **multi_ret)
+		    struct btrfs_multi_bio **multi_ret, int mirror_num)
 {
 	struct extent_map *em;
 	struct map_lookup *map;
@@ -822,6 +844,9 @@ again:
 	map = (struct map_lookup *)em->bdev;
 	offset = logical - em->start;
 
+	if (mirror_num > map->num_stripes)
+		mirror_num = 0;
+
 	/* if our multi bio struct is too small, back off and try again */
 	if (multi_ret && (rw & (1 << BIO_RW)) &&
 	    stripes_allocated < map->num_stripes &&
@@ -862,7 +887,9 @@ again:
 	if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
 		if (rw & (1 << BIO_RW))
 			multi->num_stripes = map->num_stripes;
-		else {
+		else if (mirror_num) {
+			stripe_index = mirror_num - 1;
+		} else {
 			int i;
 			u64 least = (u64)-1;
 			struct btrfs_device *cur;
@@ -880,6 +907,8 @@ again:
 	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
 		if (rw & (1 << BIO_RW))
 			multi->num_stripes = map->num_stripes;
+		else if (mirror_num)
+			stripe_index = mirror_num - 1;
 	} else {
 		/*
 		 * after this do_div call, stripe_nr is the number of stripes
@@ -938,7 +967,8 @@ static int end_bio_multi_stripe(struct bio *bio,
 #endif
 }
 
-int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio)
+int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
+		  int mirror_num)
 {
 	struct btrfs_mapping_tree *map_tree;
 	struct btrfs_device *dev;
@@ -960,7 +990,8 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio)
 	map_tree = &root->fs_info->mapping_tree;
 	map_length = length;
 
-	ret = btrfs_map_block(map_tree, rw, logical, &map_length, &multi);
+	ret = btrfs_map_block(map_tree, rw, logical, &map_length, &multi,
+			      mirror_num);
 	BUG_ON(ret);
 
 	total_devs = multi->num_stripes;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 10ca0104750c..3d5d0a9cb827 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -93,7 +93,7 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
 			   u64 owner, u64 num_bytes, u64 *start);
 int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 		    u64 logical, u64 *length,
-		    struct btrfs_multi_bio **multi_ret);
+		    struct btrfs_multi_bio **multi_ret, int mirror_num);
 int btrfs_read_sys_array(struct btrfs_root *root);
 int btrfs_read_chunk_tree(struct btrfs_root *root);
 int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
@@ -101,7 +101,8 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 		      u64 *num_bytes, u64 type);
 void btrfs_mapping_init(struct btrfs_mapping_tree *tree);
 void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree);
-int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio);
+int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
+		  int mirror_num);
 int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf);
 int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 		       int flags, void *holder);
@@ -112,4 +113,5 @@ int btrfs_add_device(struct btrfs_trans_handle *trans,
 		     struct btrfs_root *root,
 		     struct btrfs_device *device);
 int btrfs_cleanup_fs_uuids(void);
+int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len);
 #endif
-- 
cgit v1.2.3


From f29844623de29a12358d7fba35d0959465b64adf Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 10 Apr 2008 16:19:33 -0400
Subject: Btrfs: Write out all super blocks on commit, and bring back proper
 barrier support

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c | 118 ++++++++++++++++++++++++++++++++++++++++++++++++++---
 fs/btrfs/disk-io.h |   1 +
 fs/btrfs/volumes.c |   8 ++--
 fs/btrfs/volumes.h |   3 ++
 4 files changed, 122 insertions(+), 8 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 59bdf0474be3..cf1de75f088a 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -382,7 +382,7 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 	BUG_ON(ret);
 
 	if (offset == BTRFS_SUPER_INFO_OFFSET) {
-		bio->bi_bdev = root->fs_info->sb->s_bdev;
+		bio->bi_bdev = root->fs_info->fs_devices->latest_bdev;
 		submit_bio(rw, bio);
 		return 0;
 	}
@@ -988,7 +988,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	spin_lock_init(&fs_info->new_trans_lock);
 
 	init_completion(&fs_info->kobj_unregister);
-	sb_set_blocksize(sb, 4096);
+	sb_set_blocksize(sb, BTRFS_SUPER_INFO_SIZE);
 	fs_info->tree_root = tree_root;
 	fs_info->extent_root = extent_root;
 	fs_info->chunk_root = chunk_root;
@@ -1169,14 +1169,121 @@ fail:
 	return ERR_PTR(err);
 }
 
+/* b_end_io for super writes: latch the result, unlock, drop the I/O ref */
+static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
+{
+	char devname[BDEVNAME_SIZE];
+
+	if (!uptodate) {
+		if (!buffer_eopnotsupp(bh) && printk_ratelimit())
+			printk(KERN_WARNING "lost page write due to "
+					"I/O error on %s\n",
+				       bdevname(bh->b_bdev, devname));
+		set_buffer_write_io_error(bh);
+		clear_buffer_uptodate(bh);
+	} else {
+		set_buffer_uptodate(bh);
+	}
+	unlock_buffer(bh);
+	put_bh(bh);
+}
+
+int write_all_supers(struct btrfs_root *root)
+{
+	struct list_head *cur;
+	struct list_head *head = &root->fs_info->fs_devices->devices;
+	struct btrfs_device *dev;
+	struct extent_buffer *sb;
+	struct btrfs_dev_item *dev_item;
+	struct buffer_head *bh;
+	int ret;
+	int do_barriers;
+	/* submit a super block write to every device, then wait for them all */
+	do_barriers = !btrfs_test_opt(root, NOBARRIER);
+	/* dev_item is a byte offset into sb (offsetof), not a real pointer */
+	sb = root->fs_info->sb_buffer;
+	dev_item = (struct btrfs_dev_item *)offsetof(struct btrfs_super_block,
+						      dev_item);
+	list_for_each(cur, head) {
+		dev = list_entry(cur, struct btrfs_device, dev_list);
+		btrfs_set_device_type(sb, dev_item, dev->type);
+		btrfs_set_device_id(sb, dev_item, dev->devid);
+		btrfs_set_device_total_bytes(sb, dev_item, dev->total_bytes);
+		btrfs_set_device_bytes_used(sb, dev_item, dev->bytes_used);
+		btrfs_set_device_io_align(sb, dev_item, dev->io_align);
+		btrfs_set_device_io_width(sb, dev_item, dev->io_width);
+		btrfs_set_device_sector_size(sb, dev_item, dev->sector_size);
+		write_extent_buffer(sb, dev->uuid,
+				    (unsigned long)btrfs_device_uuid(dev_item),
+				    BTRFS_DEV_UUID_SIZE);
+		/* NOTE(review): verify == 0 presumably stores a fresh csum in sb */
+		btrfs_set_header_flag(sb, BTRFS_HEADER_FLAG_WRITTEN);
+		csum_tree_block(root, sb, 0);
+		/* buffer head for this device's super block location */
+		bh = __getblk(dev->bdev, BTRFS_SUPER_INFO_OFFSET /
+			      root->fs_info->sb->s_blocksize,
+			      BTRFS_SUPER_INFO_SIZE);
+
+		read_extent_buffer(sb, bh->b_data, 0, BTRFS_SUPER_INFO_SIZE);
+		dev->pending_io = bh;	/* picked up again by the wait loop below */
+		/* this extra ref is put by btrfs_end_buffer_write_sync() */
+		get_bh(bh);
+		set_buffer_uptodate(bh);
+		lock_buffer(bh);
+		bh->b_end_io = btrfs_end_buffer_write_sync;
+
+		if (do_barriers && dev->barriers) {
+			ret = submit_bh(WRITE_BARRIER, bh);
+			if (ret == -EOPNOTSUPP) {	/* fall back to a plain write */
+				printk("btrfs: disabling barriers on dev %s\n",
+				       dev->name);
+				set_buffer_uptodate(bh);
+				dev->barriers = 0;
+				get_bh(bh);
+				lock_buffer(bh);
+				ret = submit_bh(WRITE, bh);
+			}
+		} else {
+			ret = submit_bh(WRITE, bh);
+		}
+		BUG_ON(ret);
+	}
+	/* second pass: wait for each write, retrying once without barriers */
+	list_for_each(cur, head) {
+		dev = list_entry(cur, struct btrfs_device, dev_list);
+		BUG_ON(!dev->pending_io);
+		bh = dev->pending_io;
+		wait_on_buffer(bh);
+		if (!buffer_uptodate(dev->pending_io)) {	/* end_io saw a failure */
+			if (do_barriers && dev->barriers) {
+				printk("btrfs: disabling barriers on dev %s\n",
+				       dev->name);
+				set_buffer_uptodate(bh);
+				get_bh(bh);
+				lock_buffer(bh);
+				dev->barriers = 0;
+				ret = submit_bh(WRITE, bh);
+				BUG_ON(ret);
+				wait_on_buffer(bh);
+				BUG_ON(!buffer_uptodate(bh));
+			} else {
+				BUG();	/* failed without barriers: no fallback left */
+			}
+
+		}
+		dev->pending_io = NULL;
+		brelse(bh);	/* NOTE(review): presumably drops the __getblk() ref */
+	}
+	return 0;
+}
+
 int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root
 		      *root)
 {
 	int ret;
-	struct extent_buffer *super = root->fs_info->sb_buffer;
-	struct inode *btree_inode = root->fs_info->btree_inode;
-	struct super_block *sb = root->fs_info->sb;
 
+	ret = write_all_supers(root);
+#if 0
 	if (!btrfs_test_opt(root, NOBARRIER))
 		blkdev_issue_flush(sb->s_bdev, NULL);
 	set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, super);
@@ -1184,6 +1291,7 @@ int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root
 				     super->start, super->len);
 	if (!btrfs_test_opt(root, NOBARRIER))
 		blkdev_issue_flush(sb->s_bdev, NULL);
+#endif
 	return ret;
 }
 
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 4fac0ccbf8f8..60b01902db79 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -20,6 +20,7 @@
 #define __DISKIO__
 
 #define BTRFS_SUPER_INFO_OFFSET (16 * 1024)
+#define BTRFS_SUPER_INFO_SIZE 4096
 struct btrfs_device;
 struct btrfs_fs_devices;
 
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 3b927f698320..07d43553141c 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -125,6 +125,7 @@ static int device_list_add(const char *path,
 			return -ENOMEM;
 		}
 		device->devid = devid;
+		device->barriers = 1;
 		device->name = kstrdup(path, GFP_NOFS);
 		if (!device->name) {
 			kfree(device);
@@ -208,6 +209,7 @@ int btrfs_scan_one_device(const char *path, int flags, void *holder,
 	struct buffer_head *bh;
 	int ret;
 	u64 devid;
+	u64 transid;
 
 	mutex_lock(&uuid_mutex);
 
@@ -236,14 +238,14 @@ int btrfs_scan_one_device(const char *path, int flags, void *holder,
 		goto error_brelse;
 	}
 	devid = le64_to_cpu(disk_super->dev_item.devid);
-	printk("found device %Lu on %s\n", devid, path);
+	transid = btrfs_super_generation(disk_super);
+	printk("found device %Lu transid %Lu on %s\n", devid, transid, path);
 	ret = device_list_add(path, disk_super, devid, fs_devices_ret);
 
 error_brelse:
 	brelse(bh);
 error_close:
 	close_bdev_excl(bdev);
-	printk("scan one closes bdev %s\n", path);
 error:
 	mutex_unlock(&uuid_mutex);
 	return ret;
@@ -1143,7 +1145,7 @@ static int read_one_dev(struct btrfs_root *root,
 	device = btrfs_find_device(root, devid);
 	if (!device) {
 		printk("warning devid %Lu not found already\n", devid);
-		device = kmalloc(sizeof(*device), GFP_NOFS);
+		device = kzalloc(sizeof(*device), GFP_NOFS);
 		if (!device)
 			return -ENOMEM;
 		list_add(&device->dev_list,
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 3d5d0a9cb827..89548837a1cc 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -21,9 +21,12 @@
 
 #include <linux/bio.h>
 
+struct buffer_head;
 struct btrfs_device {
 	struct list_head dev_list;
 	struct btrfs_root *dev_root;
+	struct buffer_head *pending_io;
+	int barriers;
 	spinlock_t io_lock;
 
 	struct block_device *bdev;
-- 
cgit v1.2.3


From 73f61b2a6459df982cb4faf0e5cf5ac8b153aaff Mon Sep 17 00:00:00 2001
From: Miguel <miguel.filipe@gmail.com>
Date: Fri, 11 Apr 2008 15:50:59 -0400
Subject: Btrfs: bio_endio support for linux 2.6.23 and older.

bio_endio() changed its prototype in Linux 2.6.24.  Support older kernels
by using the older prototype there.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c | 5 ++++-
 fs/btrfs/volumes.c | 4 ++++
 2 files changed, 8 insertions(+), 1 deletion(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 0bfcc31d94d7..aebe8c21ec80 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -949,11 +949,14 @@ void btrfs_end_io_csum(struct work_struct *work)
 		bio->bi_private = end_io_wq->private;
 		bio->bi_end_io = end_io_wq->end_io;
 		kfree(end_io_wq);
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
+		bio_endio(bio, bio->bi_size, err);
+#else
 		bio_endio(bio, error);
+#endif
 	}
 }
 
-
 struct btrfs_root *open_ctree(struct super_block *sb,
 			      struct btrfs_fs_devices *fs_devices)
 {
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 07d43553141c..82bc6cfc110f 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -960,7 +960,11 @@ static int end_bio_multi_stripe(struct bio *bio,
 			err = multi->error;
 		kfree(multi);
 
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
+		bio_endio(bio, bio->bi_size, err);
+#else
 		bio_endio(bio, err);
+#endif
 	} else {
 		bio_put(bio);
 	}
-- 
cgit v1.2.3


From b248a4152956cbae1b23f4c70ef5b51d6ea2ecfb Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 14 Apr 2008 09:48:18 -0400
Subject: Btrfs: A few updates for 2.6.18 and versions older than 2.6.25

This includes fixing a missing spinlock init call that caused an oops on
mount for most kernels other than 2.6.25.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c | 12 ++++++++++--
 fs/btrfs/file.c    |  8 +++++++-
 fs/btrfs/volumes.c | 15 +++++++--------
 3 files changed, 24 insertions(+), 11 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index aebe8c21ec80..9c94dddde704 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -850,7 +850,9 @@ void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
 
 static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
 {
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,23)
 	bdi_init(bdi);
+#endif
 	bdi->ra_pages	= default_backing_dev_info.ra_pages * 4;
 	bdi->state		= 0;
 	bdi->capabilities	= default_backing_dev_info.capabilities;
@@ -950,7 +952,7 @@ void btrfs_end_io_csum(struct work_struct *work)
 		bio->bi_end_io = end_io_wq->end_io;
 		kfree(end_io_wq);
 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
-		bio_endio(bio, bio->bi_size, err);
+		bio_endio(bio, bio->bi_size, error);
 #else
 		bio_endio(bio, error);
 #endif
@@ -1037,10 +1039,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 			     fs_info->btree_inode->i_mapping, GFP_NOFS);
 	fs_info->do_barriers = 1;
 
-	INIT_WORK(&fs_info->end_io_work, btrfs_end_io_csum);
 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
+	INIT_WORK(&fs_info->end_io_work, btrfs_end_io_csum, fs_info);
 	INIT_WORK(&fs_info->trans_work, btrfs_transaction_cleaner, fs_info);
 #else
+	INIT_WORK(&fs_info->end_io_work, btrfs_end_io_csum);
 	INIT_DELAYED_WORK(&fs_info->trans_work, btrfs_transaction_cleaner);
 #endif
 	BTRFS_I(fs_info->btree_inode)->root = tree_root;
@@ -1173,7 +1176,9 @@ fail:
 	close_all_devices(fs_info);
 	kfree(extent_root);
 	kfree(tree_root);
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,23)
 	bdi_destroy(&fs_info->bdi);
+#endif
 	kfree(fs_info);
 	return ERR_PTR(err);
 }
@@ -1407,7 +1412,10 @@ int close_ctree(struct btrfs_root *root)
 #endif
 	close_all_devices(fs_info);
 	btrfs_mapping_tree_free(&fs_info->mapping_tree);
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,23)
 	bdi_destroy(&fs_info->bdi);
+#endif
 
 	kfree(fs_info->extent_root);
 	kfree(fs_info->tree_root);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 5b9386452218..9fbda6552069 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -908,11 +908,17 @@ out_nolock:
 		if (err < 0)
 			num_written = err;
 	} else if (num_written > 0 && (file->f_flags & O_DIRECT)) {
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,23)
+		do_sync_file_range(file, start_pos,
+				      start_pos + num_written - 1,
+				      SYNC_FILE_RANGE_WRITE |
+				      SYNC_FILE_RANGE_WAIT_AFTER);
+#else
 		do_sync_mapping_range(inode->i_mapping, start_pos,
 				      start_pos + num_written - 1,
 				      SYNC_FILE_RANGE_WRITE |
 				      SYNC_FILE_RANGE_WAIT_AFTER);
-
+#endif
 		invalidate_mapping_pages(inode->i_mapping,
 		      start_pos >> PAGE_CACHE_SHIFT,
 		     (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 82bc6cfc110f..f81519f0e4a7 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -126,6 +126,7 @@ static int device_list_add(const char *path,
 		}
 		device->devid = devid;
 		device->barriers = 1;
+		spin_lock_init(&device->io_lock);
 		device->name = kstrdup(path, GFP_NOFS);
 		if (!device->name) {
 			kfree(device);
@@ -759,8 +760,8 @@ printk("alloc chunk start %Lu size %Lu from dev %Lu type %Lu\n", key.objectid, c
 	em_tree = &extent_root->fs_info->mapping_tree.map_tree;
 	spin_lock(&em_tree->lock);
 	ret = add_extent_mapping(em_tree, em);
-	BUG_ON(ret);
 	spin_unlock(&em_tree->lock);
+	BUG_ON(ret);
 	free_extent_map(em);
 	return ret;
 }
@@ -799,6 +800,7 @@ int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
 
 	spin_lock(&em_tree->lock);
 	em = lookup_extent_mapping(em_tree, logical, len);
+	spin_unlock(&em_tree->lock);
 	BUG_ON(!em);
 
 	BUG_ON(em->start > logical || em->start + em->len < logical);
@@ -808,7 +810,6 @@ int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
 	else
 		ret = 1;
 	free_extent_map(em);
-	spin_unlock(&em_tree->lock);
 	return ret;
 }
 
@@ -840,6 +841,7 @@ again:
 
 	spin_lock(&em_tree->lock);
 	em = lookup_extent_mapping(em_tree, logical, *length);
+	spin_unlock(&em_tree->lock);
 	BUG_ON(!em);
 
 	BUG_ON(em->start > logical || em->start + em->len < logical);
@@ -855,7 +857,6 @@ again:
 	    ((map->type & BTRFS_BLOCK_GROUP_RAID1) ||
 	     (map->type & BTRFS_BLOCK_GROUP_DUP))) {
 		stripes_allocated = map->num_stripes;
-		spin_unlock(&em_tree->lock);
 		free_extent_map(em);
 		kfree(multi);
 		goto again;
@@ -932,7 +933,6 @@ again:
 	*multi_ret = multi;
 out:
 	free_extent_map(em);
-	spin_unlock(&em_tree->lock);
 	return 0;
 }
 
@@ -1060,16 +1060,15 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
 	length = key->offset;
 	spin_lock(&map_tree->map_tree.lock);
 	em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
+	spin_unlock(&map_tree->map_tree.lock);
 
 	/* already mapped? */
 	if (em && em->start <= logical && em->start + em->len > logical) {
 		free_extent_map(em);
-		spin_unlock(&map_tree->map_tree.lock);
 		return 0;
 	} else if (em) {
 		free_extent_map(em);
 	}
-	spin_unlock(&map_tree->map_tree.lock);
 
 	map = kzalloc(sizeof(*map), GFP_NOFS);
 	if (!map)
@@ -1110,8 +1109,8 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
 
 	spin_lock(&map_tree->map_tree.lock);
 	ret = add_extent_mapping(&map_tree->map_tree, em);
-	BUG_ON(ret);
 	spin_unlock(&map_tree->map_tree.lock);
+	BUG_ON(ret);
 	free_extent_map(em);
 
 	return 0;
@@ -1154,7 +1153,7 @@ static int read_one_dev(struct btrfs_root *root,
 			return -ENOMEM;
 		list_add(&device->dev_list,
 			 &root->fs_info->fs_devices->devices);
-		device->total_ios = 0;
+		device->barriers = 1;
 		spin_lock_init(&device->io_lock);
 	}
 
-- 
cgit v1.2.3


From e17cade25ff8074101d653557a78df09c16ca276 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 15 Apr 2008 15:41:47 -0400
Subject: Btrfs: Add chunk uuids and update multi-device back references

Block headers now store the chunk tree uuid

Chunk items record the device uuid for each stripe

Device extent items record better back refs to the chunk tree

Block groups record better back refs to the chunk tree

The chunk tree format has also changed.  The objectid of BTRFS_CHUNK_ITEM_KEY
used to be the logical offset of the chunk.  Now it is a chunk tree id,
with the logical offset being stored in the offset field of the key.

This allows a single chunk tree to record multiple logical address spaces,
upping the number of bytes indexed by a chunk tree from 2^64 to
2^128.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c       | 12 ++++++++
 fs/btrfs/ctree.h       | 80 ++++++++++++++++++++++++++++++++++++++++++--------
 fs/btrfs/disk-io.c     |  6 +++-
 fs/btrfs/extent-tree.c | 22 +++++---------
 fs/btrfs/print-tree.c  | 20 +++++++++----
 fs/btrfs/volumes.c     | 76 +++++++++++++++++++++++++++++++----------------
 fs/btrfs/volumes.h     |  6 ++--
 7 files changed, 160 insertions(+), 62 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index e8bf6c221e4e..618e526c9046 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1382,6 +1382,11 @@ static int noinline insert_new_root(struct btrfs_trans_handle *trans,
 	write_extent_buffer(c, root->fs_info->fsid,
 			    (unsigned long)btrfs_header_fsid(c),
 			    BTRFS_FSID_SIZE);
+
+	write_extent_buffer(c, root->fs_info->chunk_tree_uuid,
+			    (unsigned long)btrfs_header_chunk_tree_uuid(c),
+			    BTRFS_UUID_SIZE);
+
 	btrfs_set_node_key(c, &lower_key, 0);
 	btrfs_set_node_blockptr(c, 0, lower->start);
 	lower_gen = btrfs_header_generation(lower);
@@ -1513,6 +1518,9 @@ static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
 	write_extent_buffer(split, root->fs_info->fsid,
 			    (unsigned long)btrfs_header_fsid(split),
 			    BTRFS_FSID_SIZE);
+	write_extent_buffer(split, root->fs_info->chunk_tree_uuid,
+			    (unsigned long)btrfs_header_chunk_tree_uuid(split),
+			    BTRFS_UUID_SIZE);
 
 	mid = (c_nritems + 1) / 2;
 
@@ -2043,6 +2051,10 @@ again:
 	write_extent_buffer(right, root->fs_info->fsid,
 			    (unsigned long)btrfs_header_fsid(right),
 			    BTRFS_FSID_SIZE);
+
+	write_extent_buffer(right, root->fs_info->chunk_tree_uuid,
+			    (unsigned long)btrfs_header_chunk_tree_uuid(right),
+			    BTRFS_UUID_SIZE);
 	if (mid <= slot) {
 		if (nritems == 1 ||
 		    leaf_space_used(l, mid, nritems - mid) + space_needed >
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 09d614fcafb1..82d67c3db8bc 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -70,6 +70,7 @@ extern struct kmem_cache *btrfs_path_cachep;
  * All files have objectids higher than this.
  */
 #define BTRFS_FIRST_FREE_OBJECTID 256ULL
+#define BTRFS_FIRST_CHUNK_TREE_OBJECTID 256ULL
 
 
 /*
@@ -131,7 +132,7 @@ struct btrfs_mapping_tree {
 	struct extent_map_tree map_tree;
 };
 
-#define BTRFS_DEV_UUID_SIZE 16
+#define BTRFS_UUID_SIZE 16
 struct btrfs_dev_item {
 	/* the internal btrfs device id */
 	__le64 devid;
@@ -154,17 +155,32 @@ struct btrfs_dev_item {
 	/* type and info about this device */
 	__le64 type;
 
+	/* grouping information for allocation decisions */
+	__le32 dev_group;
+
+	/* seek speed 0-100 where 100 is fastest */
+	u8 seek_speed;
+
+	/* bandwidth 0-100 where 100 is fastest */
+	u8 bandwidth;
+
 	/* btrfs generated uuid for this device */
-	u8 uuid[BTRFS_DEV_UUID_SIZE];
+	u8 uuid[BTRFS_UUID_SIZE];
 } __attribute__ ((__packed__));
 
 struct btrfs_stripe {
 	__le64 devid;
 	__le64 offset;
+	u8 dev_uuid[BTRFS_UUID_SIZE];
 } __attribute__ ((__packed__));
 
 struct btrfs_chunk {
+	/* size of this chunk in bytes */
+	__le64 length;
+
+	/* objectid of the root referencing this chunk */
 	__le64 owner;
+
 	__le64 stripe_len;
 	__le64 type;
 
@@ -199,10 +215,14 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes)
  * every tree block (leaf or node) starts with this header.
  */
 struct btrfs_header {
+	/* these first four must match the super block */
 	u8 csum[BTRFS_CSUM_SIZE];
 	u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
 	__le64 bytenr; /* which block this node is supposed to live in */
 	__le64 flags;
+
+	/* allowed to be different from the super from here on down */
+	u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
 	__le64 generation;
 	__le64 owner;
 	__le32 nritems;
@@ -235,6 +255,8 @@ struct btrfs_super_block {
 	u8 fsid[16];    /* FS specific uuid */
 	__le64 bytenr; /* this block number */
 	__le64 flags;
+
+	/* allowed to be different from the btrfs_header from here on down */
 	__le64 magic;
 	__le64 generation;
 	__le64 root;
@@ -323,14 +345,16 @@ struct btrfs_extent_ref {
 
 /* dev extents record free space on individual devices.  The owner
  * field points back to the chunk allocation mapping tree that allocated
- * the extent
+ * the extent.  The chunk tree uuid field is a way to double check the owner
  */
 struct btrfs_dev_extent {
-	__le64 owner;
+	__le64 chunk_tree;
+	__le64 chunk_objectid;
+	__le64 chunk_offset;
 	__le64 length;
+	u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
 } __attribute__ ((__packed__));
 
-
 struct btrfs_inode_ref {
 	__le16 name_len;
 	/* name goes here */
@@ -424,7 +448,6 @@ struct btrfs_csum_item {
 
 struct btrfs_block_group_item {
 	__le64 used;
-	__le64 chunk_tree;
 	__le64 chunk_objectid;
 	__le64 flags;
 } __attribute__ ((__packed__));
@@ -451,6 +474,7 @@ struct btrfs_device;
 struct btrfs_fs_devices;
 struct btrfs_fs_info {
 	u8 fsid[BTRFS_FSID_SIZE];
+	u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
 	struct btrfs_root *extent_root;
 	struct btrfs_root *tree_root;
 	struct btrfs_root *chunk_root;
@@ -697,6 +721,9 @@ BTRFS_SETGET_FUNCS(device_io_align, struct btrfs_dev_item, io_align, 32);
 BTRFS_SETGET_FUNCS(device_io_width, struct btrfs_dev_item, io_width, 32);
 BTRFS_SETGET_FUNCS(device_sector_size, struct btrfs_dev_item, sector_size, 32);
 BTRFS_SETGET_FUNCS(device_id, struct btrfs_dev_item, devid, 64);
+BTRFS_SETGET_FUNCS(device_group, struct btrfs_dev_item, dev_group, 32);
+BTRFS_SETGET_FUNCS(device_seek_speed, struct btrfs_dev_item, seek_speed, 8);
+BTRFS_SETGET_FUNCS(device_bandwidth, struct btrfs_dev_item, bandwidth, 8);
 
 BTRFS_SETGET_STACK_FUNCS(stack_device_type, struct btrfs_dev_item, type, 64);
 BTRFS_SETGET_STACK_FUNCS(stack_device_total_bytes, struct btrfs_dev_item,
@@ -710,12 +737,19 @@ BTRFS_SETGET_STACK_FUNCS(stack_device_io_width, struct btrfs_dev_item,
 BTRFS_SETGET_STACK_FUNCS(stack_device_sector_size, struct btrfs_dev_item,
 			 sector_size, 32);
 BTRFS_SETGET_STACK_FUNCS(stack_device_id, struct btrfs_dev_item, devid, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_device_group, struct btrfs_dev_item,
+			 dev_group, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_device_seek_speed, struct btrfs_dev_item,
+			 seek_speed, 8);
+BTRFS_SETGET_STACK_FUNCS(stack_device_bandwidth, struct btrfs_dev_item,
+			 bandwidth, 8);
 
 static inline char *btrfs_device_uuid(struct btrfs_dev_item *d)
 {
 	return (char *)d + offsetof(struct btrfs_dev_item, uuid);
 }
 
+BTRFS_SETGET_FUNCS(chunk_length, struct btrfs_chunk, length, 64);
 BTRFS_SETGET_FUNCS(chunk_owner, struct btrfs_chunk, owner, 64);
 BTRFS_SETGET_FUNCS(chunk_stripe_len, struct btrfs_chunk, stripe_len, 64);
 BTRFS_SETGET_FUNCS(chunk_io_align, struct btrfs_chunk, io_align, 32);
@@ -726,6 +760,12 @@ BTRFS_SETGET_FUNCS(chunk_num_stripes, struct btrfs_chunk, num_stripes, 16);
 BTRFS_SETGET_FUNCS(stripe_devid, struct btrfs_stripe, devid, 64);
 BTRFS_SETGET_FUNCS(stripe_offset, struct btrfs_stripe, offset, 64);
 
+static inline char *btrfs_stripe_dev_uuid(struct btrfs_stripe *s)
+{
+	return (char *)s + offsetof(struct btrfs_stripe, dev_uuid);
+}
+
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_length, struct btrfs_chunk, length, 64);
 BTRFS_SETGET_STACK_FUNCS(stack_chunk_owner, struct btrfs_chunk, owner, 64);
 BTRFS_SETGET_STACK_FUNCS(stack_chunk_stripe_len, struct btrfs_chunk,
 			 stripe_len, 64);
@@ -781,13 +821,10 @@ BTRFS_SETGET_STACK_FUNCS(block_group_used, struct btrfs_block_group_item,
 			 used, 64);
 BTRFS_SETGET_FUNCS(disk_block_group_used, struct btrfs_block_group_item,
 			 used, 64);
-BTRFS_SETGET_STACK_FUNCS(block_group_chunk_tree, struct btrfs_block_group_item,
-			 chunk_tree, 64);
-BTRFS_SETGET_FUNCS(disk_block_group_chunk_tree, struct btrfs_block_group_item,
-			 chunk_tree, 64);
 BTRFS_SETGET_STACK_FUNCS(block_group_chunk_objectid,
 			struct btrfs_block_group_item, chunk_objectid, 64);
-BTRFS_SETGET_FUNCS(disk_block_group_chunk_objecitd,
+
+BTRFS_SETGET_FUNCS(disk_block_group_chunk_objectid,
 		   struct btrfs_block_group_item, chunk_objectid, 64);
 BTRFS_SETGET_FUNCS(disk_block_group_flags,
 		   struct btrfs_block_group_item, flags, 64);
@@ -850,9 +887,20 @@ BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32);
 BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 32);
 
 /* struct btrfs_dev_extent */
-BTRFS_SETGET_FUNCS(dev_extent_owner, struct btrfs_dev_extent, owner, 64);
+BTRFS_SETGET_FUNCS(dev_extent_chunk_tree, struct btrfs_dev_extent,
+		   chunk_tree, 64);
+BTRFS_SETGET_FUNCS(dev_extent_chunk_objectid, struct btrfs_dev_extent,
+		   chunk_objectid, 64);
+BTRFS_SETGET_FUNCS(dev_extent_chunk_offset, struct btrfs_dev_extent,
+		   chunk_offset, 64);
 BTRFS_SETGET_FUNCS(dev_extent_length, struct btrfs_dev_extent, length, 64);
 
+static inline u8 *btrfs_dev_extent_chunk_tree_uuid(struct btrfs_dev_extent *dev)
+{
+	unsigned long ptr = offsetof(struct btrfs_dev_extent, chunk_tree_uuid);
+	return (u8 *)((unsigned long)dev + ptr);
+}
+
 /* struct btrfs_extent_ref */
 BTRFS_SETGET_FUNCS(ref_root, struct btrfs_extent_ref, root, 64);
 BTRFS_SETGET_FUNCS(ref_generation, struct btrfs_extent_ref, generation, 64);
@@ -1087,6 +1135,12 @@ static inline u8 *btrfs_header_fsid(struct extent_buffer *eb)
 	return (u8 *)ptr;
 }
 
+static inline u8 *btrfs_header_chunk_tree_uuid(struct extent_buffer *eb)
+{
+	unsigned long ptr = offsetof(struct btrfs_header, chunk_tree_uuid);
+	return (u8 *)ptr;
+}
+
 static inline u8 *btrfs_super_fsid(struct extent_buffer *eb)
 {
 	unsigned long ptr = offsetof(struct btrfs_super_block, fsid);
@@ -1311,7 +1365,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info);
 int btrfs_read_block_groups(struct btrfs_root *root);
 int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root, u64 bytes_used,
-			   u64 type, u64 chunk_tree, u64 chunk_objectid,
+			   u64 type, u64 chunk_objectid, u64 chunk_offset,
 			   u64 size);
 /* ctree.c */
 int btrfs_previous_item(struct btrfs_root *root,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 9c94dddde704..79c284c87286 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1125,6 +1125,10 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 					   blocksize);
 	BUG_ON(!chunk_root->node);
 
+	read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid,
+	         (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node),
+		 BTRFS_UUID_SIZE);
+
 	ret = btrfs_read_chunk_tree(chunk_root);
 	BUG_ON(ret);
 
@@ -1229,7 +1233,7 @@ int write_all_supers(struct btrfs_root *root)
 		btrfs_set_device_sector_size(sb, dev_item, dev->sector_size);
 		write_extent_buffer(sb, dev->uuid,
 				    (unsigned long)btrfs_device_uuid(dev_item),
-				    BTRFS_DEV_UUID_SIZE);
+				    BTRFS_UUID_SIZE);
 
 		btrfs_set_header_flag(sb, BTRFS_HEADER_FLAG_WRITTEN);
 		csum_tree_block(root, sb, 0);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index e49147e767df..71f045c63493 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -35,10 +35,6 @@ static int finish_current_insert(struct btrfs_trans_handle *trans, struct
 				 btrfs_root *extent_root);
 static int del_pending_extents(struct btrfs_trans_handle *trans, struct
 			       btrfs_root *extent_root);
-int btrfs_make_block_group(struct btrfs_trans_handle *trans,
-			   struct btrfs_root *root, u64 bytes_used,
-			   u64 type, u64 chunk_tree, u64 chunk_objectid,
-			   u64 size);
 
 
 static int cache_block_group(struct btrfs_root *root,
@@ -980,7 +976,6 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 		ret = get_state_private(block_group_cache, start, &ptr);
 		if (ret)
 			break;
-
 		cache = (struct btrfs_block_group_cache *)(unsigned long)ptr;
 		err = write_one_cache_group(trans, root,
 					    path, cache);
@@ -1094,8 +1089,7 @@ printk("space info full %Lu\n", flags);
 	BUG_ON(ret);
 
 	ret = btrfs_make_block_group(trans, extent_root, 0, flags,
-		     extent_root->fs_info->chunk_root->root_key.objectid,
-		     start, num_bytes);
+		     BTRFS_FIRST_CHUNK_TREE_OBJECTID, start, num_bytes);
 	BUG_ON(ret);
 
 	return 0;
@@ -2782,7 +2776,7 @@ error:
 
 int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root, u64 bytes_used,
-			   u64 type, u64 chunk_tree, u64 chunk_objectid,
+			   u64 type, u64 chunk_objectid, u64 chunk_offset,
 			   u64 size)
 {
 	int ret;
@@ -2796,14 +2790,14 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 
 	cache = kmalloc(sizeof(*cache), GFP_NOFS);
 	BUG_ON(!cache);
-	cache->key.objectid = chunk_objectid;
+	cache->key.objectid = chunk_offset;
 	cache->key.offset = size;
 	cache->cached = 0;
 	cache->pinned = 0;
+
 	btrfs_set_key_type(&cache->key, BTRFS_BLOCK_GROUP_ITEM_KEY);
 	memset(&cache->item, 0, sizeof(cache->item));
 	btrfs_set_block_group_used(&cache->item, bytes_used);
-	btrfs_set_block_group_chunk_tree(&cache->item, chunk_tree);
 	btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
 	cache->flags = type;
 	btrfs_set_block_group_flags(&cache->item, type);
@@ -2813,12 +2807,12 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 	BUG_ON(ret);
 
 	bit = block_group_state_bits(type);
-	set_extent_bits(block_group_cache, chunk_objectid,
-			chunk_objectid + size - 1,
+	set_extent_bits(block_group_cache, chunk_offset,
+			chunk_offset + size - 1,
 			bit | EXTENT_LOCKED, GFP_NOFS);
-	set_state_private(block_group_cache, chunk_objectid,
-			  (unsigned long)cache);
 
+	set_state_private(block_group_cache, chunk_offset,
+			  (unsigned long)cache);
 	ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item,
 				sizeof(cache->item));
 	BUG_ON(ret);
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index ee0de112cf5a..e99f3249d05a 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -24,7 +24,8 @@ static void print_chunk(struct extent_buffer *eb, struct btrfs_chunk *chunk)
 {
 	int num_stripes = btrfs_chunk_num_stripes(eb, chunk);
 	int i;
-	printk("\t\tchunk owner %llu type %llu num_stripes %d\n",
+	printk("\t\tchunk length %llu owner %llu type %llu num_stripes %d\n",
+	       (unsigned long long)btrfs_chunk_length(eb, chunk),
 	       (unsigned long long)btrfs_chunk_owner(eb, chunk),
 	       (unsigned long long)btrfs_chunk_type(eb, chunk),
 	       num_stripes);
@@ -140,17 +141,24 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
 		case BTRFS_DEV_EXTENT_KEY:
 			dev_extent = btrfs_item_ptr(l, i,
 						    struct btrfs_dev_extent);
-			printk("\t\tdev extent owner %llu length %llu\n",
-			       (unsigned long long)btrfs_dev_extent_owner(l, dev_extent),
-			       (unsigned long long)btrfs_dev_extent_length(l, dev_extent));
+			printk("\t\tdev extent chunk_tree %llu\n"
+			       "\t\tchunk objectid %llu chunk offset %llu "
+			       "length %llu\n",
+			       (unsigned long long)
+			       btrfs_dev_extent_chunk_tree(l, dev_extent),
+			       (unsigned long long)
+			       btrfs_dev_extent_chunk_objectid(l, dev_extent),
+			       (unsigned long long)
+			       btrfs_dev_extent_chunk_offset(l, dev_extent),
+			       (unsigned long long)
+			       btrfs_dev_extent_length(l, dev_extent));
 		};
 	}
 }
 
 void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c)
 {
-	int i;
-	u32 nr;
+	int i; u32 nr;
 	struct btrfs_key key;
 	int level;
 
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f81519f0e4a7..23ebd95b25e0 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -180,7 +180,7 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 	list_for_each(cur, head) {
 		device = list_entry(cur, struct btrfs_device, dev_list);
 		bdev = open_bdev_excl(device->name, flags, holder);
-printk("opening %s devid %Lu\n", device->name, device->devid);
+
 		if (IS_ERR(bdev)) {
 			printk("open %s failed\n", device->name);
 			ret = PTR_ERR(bdev);
@@ -190,7 +190,6 @@ printk("opening %s devid %Lu\n", device->name, device->devid);
 			fs_devices->latest_bdev = bdev;
 		if (device->devid == fs_devices->lowest_devid) {
 			fs_devices->lowest_bdev = bdev;
-printk("lowest bdev %s\n", device->name);
 		}
 		device->bdev = bdev;
 	}
@@ -372,7 +371,9 @@ error:
 
 int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
 			   struct btrfs_device *device,
-			   u64 owner, u64 num_bytes, u64 *start)
+			   u64 chunk_tree, u64 chunk_objectid,
+			   u64 chunk_offset,
+			   u64 num_bytes, u64 *start)
 {
 	int ret;
 	struct btrfs_path *path;
@@ -400,7 +401,14 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
 	leaf = path->nodes[0];
 	extent = btrfs_item_ptr(leaf, path->slots[0],
 				struct btrfs_dev_extent);
-	btrfs_set_dev_extent_owner(leaf, extent, owner);
+	btrfs_set_dev_extent_chunk_tree(leaf, extent, chunk_tree);
+	btrfs_set_dev_extent_chunk_objectid(leaf, extent, chunk_objectid);
+	btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
+
+	write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid,
+		    (unsigned long)btrfs_dev_extent_chunk_tree_uuid(extent),
+		    BTRFS_UUID_SIZE);
+
 	btrfs_set_dev_extent_length(leaf, extent, num_bytes);
 	btrfs_mark_buffer_dirty(leaf);
 err:
@@ -408,17 +416,18 @@ err:
 	return ret;
 }
 
-static int find_next_chunk(struct btrfs_root *root, u64 *objectid)
+static int find_next_chunk(struct btrfs_root *root, u64 objectid, u64 *offset)
 {
 	struct btrfs_path *path;
 	int ret;
 	struct btrfs_key key;
+	struct btrfs_chunk *chunk;
 	struct btrfs_key found_key;
 
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
 
-	key.objectid = (u64)-1;
+	key.objectid = objectid;
 	key.offset = (u64)-1;
 	key.type = BTRFS_CHUNK_ITEM_KEY;
 
@@ -430,11 +439,18 @@ static int find_next_chunk(struct btrfs_root *root, u64 *objectid)
 
 	ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY);
 	if (ret) {
-		*objectid = 0;
+		*offset = 0;
 	} else {
 		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
 				      path->slots[0]);
-		*objectid = found_key.objectid + found_key.offset;
+		if (found_key.objectid != objectid)
+			*offset = 0;
+		else {
+			chunk = btrfs_item_ptr(path->nodes[0], path->slots[0],
+					       struct btrfs_chunk);
+			*offset = found_key.offset +
+				btrfs_chunk_length(path->nodes[0], chunk);
+		}
 	}
 	ret = 0;
 error:
@@ -520,9 +536,12 @@ int btrfs_add_device(struct btrfs_trans_handle *trans,
 	btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
 	btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes);
 	btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
+	btrfs_set_device_group(leaf, dev_item, 0);
+	btrfs_set_device_seek_speed(leaf, dev_item, 0);
+	btrfs_set_device_bandwidth(leaf, dev_item, 0);
 
 	ptr = (unsigned long)btrfs_device_uuid(dev_item);
-	write_extent_buffer(leaf, device->uuid, ptr, BTRFS_DEV_UUID_SIZE);
+	write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
 	btrfs_mark_buffer_dirty(leaf);
 	ret = 0;
 
@@ -674,7 +693,10 @@ again:
 		return -ENOSPC;
 	}
 
-	ret = find_next_chunk(chunk_root, &key.objectid);
+	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
+	key.type = BTRFS_CHUNK_ITEM_KEY;
+	ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID,
+			      &key.offset);
 	if (ret)
 		return ret;
 
@@ -696,8 +718,9 @@ again:
 		*num_bytes = calc_size * num_stripes;
 
 	index = 0;
-printk("new chunk type %Lu start %Lu size %Lu\n", type, key.objectid, *num_bytes);
+printk("new chunk type %Lu start %Lu size %Lu\n", type, key.offset, *num_bytes);
 	while(index < num_stripes) {
+		struct btrfs_stripe *stripe;
 		BUG_ON(list_empty(&private_devs));
 		cur = private_devs.next;
 		device = list_entry(cur, struct btrfs_device, dev_list);
@@ -708,26 +731,28 @@ printk("new chunk type %Lu start %Lu size %Lu\n", type, key.objectid, *num_bytes
 			list_move_tail(&device->dev_list, dev_list);
 
 		ret = btrfs_alloc_dev_extent(trans, device,
-					     key.objectid,
-					     calc_size, &dev_offset);
+			     info->chunk_root->root_key.objectid,
+			     BTRFS_FIRST_CHUNK_TREE_OBJECTID, key.offset,
+			     calc_size, &dev_offset);
 		BUG_ON(ret);
-printk("alloc chunk start %Lu size %Lu from dev %Lu type %Lu\n", key.objectid, calc_size, device->devid, type);
+printk("alloc chunk start %Lu size %Lu from dev %Lu type %Lu\n", key.offset, calc_size, device->devid, type);
 		device->bytes_used += calc_size;
 		ret = btrfs_update_device(trans, device);
 		BUG_ON(ret);
 
 		map->stripes[index].dev = device;
 		map->stripes[index].physical = dev_offset;
-		btrfs_set_stack_stripe_devid(stripes + index, device->devid);
-		btrfs_set_stack_stripe_offset(stripes + index, dev_offset);
+		stripe = stripes + index;
+		btrfs_set_stack_stripe_devid(stripe, device->devid);
+		btrfs_set_stack_stripe_offset(stripe, dev_offset);
+		memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
 		physical = dev_offset;
 		index++;
 	}
 	BUG_ON(!list_empty(&private_devs));
 
-	/* key.objectid was set above */
-	key.offset = *num_bytes;
-	key.type = BTRFS_CHUNK_ITEM_KEY;
+	/* key was set above */
+	btrfs_set_stack_chunk_length(chunk, *num_bytes);
 	btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
 	btrfs_set_stack_chunk_stripe_len(chunk, stripe_len);
 	btrfs_set_stack_chunk_type(chunk, type);
@@ -745,14 +770,14 @@ printk("alloc chunk start %Lu size %Lu from dev %Lu type %Lu\n", key.objectid, c
 	ret = btrfs_insert_item(trans, chunk_root, &key, chunk,
 				btrfs_chunk_item_size(num_stripes));
 	BUG_ON(ret);
-	*start = key.objectid;
+	*start = key.offset;;
 
 	em = alloc_extent_map(GFP_NOFS);
 	if (!em)
 		return -ENOMEM;
 	em->bdev = (struct block_device *)map;
-	em->start = key.objectid;
-	em->len = key.offset;
+	em->start = key.offset;
+	em->len = *num_bytes;
 	em->block_start = 0;
 
 	kfree(chunk);
@@ -1056,8 +1081,8 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
 	int ret;
 	int i;
 
-	logical = key->objectid;
-	length = key->offset;
+	logical = key->offset;
+	length = btrfs_chunk_length(leaf, chunk);
 	spin_lock(&map_tree->map_tree.lock);
 	em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
 	spin_unlock(&map_tree->map_tree.lock);
@@ -1131,7 +1156,7 @@ static int fill_device_from_item(struct extent_buffer *leaf,
 	device->sector_size = btrfs_device_sector_size(leaf, dev_item);
 
 	ptr = (unsigned long)btrfs_device_uuid(dev_item);
-	read_extent_buffer(leaf, device->uuid, ptr, BTRFS_DEV_UUID_SIZE);
+	read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
 
 	return 0;
 }
@@ -1143,7 +1168,6 @@ static int read_one_dev(struct btrfs_root *root,
 	struct btrfs_device *device;
 	u64 devid;
 	int ret;
-
 	devid = btrfs_device_id(leaf, dev_item);
 	device = btrfs_find_device(root, devid);
 	if (!device) {
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 89548837a1cc..f9cae3072171 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -57,7 +57,7 @@ struct btrfs_device {
 	u64 type;
 
 	/* physical drive uuid (or lvm uuid) */
-	u8 uuid[BTRFS_DEV_UUID_SIZE];
+	u8 uuid[BTRFS_UUID_SIZE];
 };
 
 struct btrfs_fs_devices {
@@ -93,7 +93,9 @@ struct btrfs_multi_bio {
 
 int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
 			   struct btrfs_device *device,
-			   u64 owner, u64 num_bytes, u64 *start);
+			   u64 chunk_tree, u64 chunk_objectid,
+			   u64 chunk_offset,
+			   u64 num_bytes, u64 *start);
 int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 		    u64 logical, u64 *length,
 		    struct btrfs_multi_bio **multi_ret, int mirror_num);
-- 
cgit v1.2.3


From 321aecc65671ae8136bd2ca6879b56f0221f8ac8 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 16 Apr 2008 10:49:51 -0400
Subject: Btrfs: Add RAID10 support

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       |  7 +++++++
 fs/btrfs/extent-tree.c |  1 +
 fs/btrfs/volumes.c     | 46 +++++++++++++++++++++++++++++++++++++++++-----
 3 files changed, 49 insertions(+), 5 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 82d67c3db8bc..a22edcf49174 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -197,6 +197,9 @@ struct btrfs_chunk {
 	 * item in the btree
 	 */
 	__le16 num_stripes;
+
+	/* sub stripes only matter for raid10 */
+	__le16 sub_stripes;
 	struct btrfs_stripe stripe;
 	/* additional stripes go here */
 } __attribute__ ((__packed__));
@@ -444,6 +447,7 @@ struct btrfs_csum_item {
 #define BTRFS_BLOCK_GROUP_RAID0    (1 << 3)
 #define BTRFS_BLOCK_GROUP_RAID1    (1 << 4)
 #define BTRFS_BLOCK_GROUP_DUP	   (1 << 5)
+#define BTRFS_BLOCK_GROUP_RAID10   (1 << 6)
 
 
 struct btrfs_block_group_item {
@@ -757,6 +761,7 @@ BTRFS_SETGET_FUNCS(chunk_io_width, struct btrfs_chunk, io_width, 32);
 BTRFS_SETGET_FUNCS(chunk_sector_size, struct btrfs_chunk, sector_size, 32);
 BTRFS_SETGET_FUNCS(chunk_type, struct btrfs_chunk, type, 64);
 BTRFS_SETGET_FUNCS(chunk_num_stripes, struct btrfs_chunk, num_stripes, 16);
+BTRFS_SETGET_FUNCS(chunk_sub_stripes, struct btrfs_chunk, sub_stripes, 16);
 BTRFS_SETGET_FUNCS(stripe_devid, struct btrfs_stripe, devid, 64);
 BTRFS_SETGET_FUNCS(stripe_offset, struct btrfs_stripe, offset, 64);
 
@@ -778,6 +783,8 @@ BTRFS_SETGET_STACK_FUNCS(stack_chunk_sector_size, struct btrfs_chunk,
 BTRFS_SETGET_STACK_FUNCS(stack_chunk_type, struct btrfs_chunk, type, 64);
 BTRFS_SETGET_STACK_FUNCS(stack_chunk_num_stripes, struct btrfs_chunk,
 			 num_stripes, 16);
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_sub_stripes, struct btrfs_chunk,
+			 sub_stripes, 16);
 BTRFS_SETGET_STACK_FUNCS(stack_stripe_devid, struct btrfs_stripe, devid, 64);
 BTRFS_SETGET_STACK_FUNCS(stack_stripe_offset, struct btrfs_stripe, offset, 64);
 
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 71f045c63493..4e5bd62e6e1a 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1042,6 +1042,7 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
 {
 	u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 |
 				   BTRFS_BLOCK_GROUP_RAID1 |
+				   BTRFS_BLOCK_GROUP_RAID10 |
 				   BTRFS_BLOCK_GROUP_DUP);
 	if (extra_flags) {
 		if (flags & BTRFS_BLOCK_GROUP_DATA)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 23ebd95b25e0..e6417a573d44 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -33,6 +33,7 @@ struct map_lookup {
 	int stripe_len;
 	int sector_size;
 	int num_stripes;
+	int sub_stripes;
 	struct btrfs_bio_stripe stripes[];
 };
 
@@ -641,6 +642,7 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	u64 avail;
 	u64 max_avail = 0;
 	int num_stripes = 1;
+	int sub_stripes = 0;
 	int looped = 0;
 	int ret;
 	int index;
@@ -658,6 +660,13 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 		num_stripes = min_t(u64, 2,
 				  btrfs_super_num_devices(&info->super_copy));
 	}
+	if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
+		num_stripes = btrfs_super_num_devices(&info->super_copy);
+		if (num_stripes < 4)
+			return -ENOSPC;
+		num_stripes &= ~(u32)1;
+		sub_stripes = 2;
+	}
 again:
 	INIT_LIST_HEAD(&private_devs);
 	cur = dev_list->next;
@@ -714,6 +723,8 @@ again:
 
 	if (type & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_DUP))
 		*num_bytes = calc_size;
+	else if (type & BTRFS_BLOCK_GROUP_RAID10)
+		*num_bytes = calc_size * num_stripes / sub_stripes;
 	else
 		*num_bytes = calc_size * num_stripes;
 
@@ -760,12 +771,14 @@ printk("alloc chunk start %Lu size %Lu from dev %Lu type %Lu\n", key.offset, cal
 	btrfs_set_stack_chunk_io_align(chunk, stripe_len);
 	btrfs_set_stack_chunk_io_width(chunk, stripe_len);
 	btrfs_set_stack_chunk_sector_size(chunk, extent_root->sectorsize);
+	btrfs_set_stack_chunk_sub_stripes(chunk, sub_stripes);
 	map->sector_size = extent_root->sectorsize;
 	map->stripe_len = stripe_len;
 	map->io_align = stripe_len;
 	map->io_width = stripe_len;
 	map->type = type;
 	map->num_stripes = num_stripes;
+	map->sub_stripes = sub_stripes;
 
 	ret = btrfs_insert_item(trans, chunk_root, &key, chunk,
 				btrfs_chunk_item_size(num_stripes));
@@ -832,6 +845,8 @@ int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
 	map = (struct map_lookup *)em->bdev;
 	if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1))
 		ret = map->num_stripes;
+	else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
+		ret = map->sub_stripes;
 	else
 		ret = 1;
 	free_extent_map(em);
@@ -849,6 +864,7 @@ int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 	u64 stripe_offset;
 	u64 stripe_nr;
 	int stripes_allocated = 8;
+	int stripes_required = 1;
 	int stripe_index;
 	int i;
 	struct btrfs_multi_bio *multi = NULL;
@@ -877,10 +893,16 @@ again:
 		mirror_num = 0;
 
 	/* if our multi bio struct is too small, back off and try again */
-	if (multi_ret && (rw & (1 << BIO_RW)) &&
-	    stripes_allocated < map->num_stripes &&
-	    ((map->type & BTRFS_BLOCK_GROUP_RAID1) ||
-	     (map->type & BTRFS_BLOCK_GROUP_DUP))) {
+	if (rw & (1 << BIO_RW)) {
+		if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
+				 BTRFS_BLOCK_GROUP_DUP)) {
+			stripes_required = map->num_stripes;
+		} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
+			stripes_required = map->sub_stripes;
+		}
+	}
+	if (multi_ret && rw == WRITE &&
+	    stripes_allocated < stripes_required) {
 		stripes_allocated = map->num_stripes;
 		free_extent_map(em);
 		kfree(multi);
@@ -900,6 +922,7 @@ again:
 	stripe_offset = offset - stripe_offset;
 
 	if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
+			 BTRFS_BLOCK_GROUP_RAID10 |
 			 BTRFS_BLOCK_GROUP_DUP)) {
 		/* we limit the length of each bio to what fits in a stripe */
 		*length = min_t(u64, em->len - offset,
@@ -937,6 +960,19 @@ again:
 			multi->num_stripes = map->num_stripes;
 		else if (mirror_num)
 			stripe_index = mirror_num - 1;
+	} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
+		int factor = map->num_stripes / map->sub_stripes;
+		int orig_stripe_nr = stripe_nr;
+
+		stripe_index = do_div(stripe_nr, factor);
+		stripe_index *= map->sub_stripes;
+
+		if (rw & (1 << BIO_RW))
+			multi->num_stripes = map->sub_stripes;
+		else if (mirror_num)
+			stripe_index += mirror_num - 1;
+		else
+			stripe_index += orig_stripe_nr % map->sub_stripes;
 	} else {
 		/*
 		 * after this do_div call, stripe_nr is the number of stripes
@@ -946,7 +982,6 @@ again:
 		stripe_index = do_div(stripe_nr, map->num_stripes);
 	}
 	BUG_ON(stripe_index >= map->num_stripes);
-	BUG_ON(stripe_index != 0 && multi->num_stripes > 1);
 
 	for (i = 0; i < multi->num_stripes; i++) {
 		multi->stripes[i].physical =
@@ -1120,6 +1155,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
 	map->sector_size = btrfs_chunk_sector_size(leaf, chunk);
 	map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
 	map->type = btrfs_chunk_type(leaf, chunk);
+	map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
 	for (i = 0; i < num_stripes; i++) {
 		map->stripes[i].physical =
 			btrfs_stripe_offset_nr(leaf, chunk, i);
-- 
cgit v1.2.3


From 3b951516ed703af0f6d82053937655ad69b60864 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 17 Apr 2008 11:29:12 -0400
Subject: Btrfs: Use the extent map cache to find the logical disk block during
 data retries

The data read retry code needs to find the logical disk block before it
can resubmit new bios.  But, finding this block isn't allowed to take
the fs_mutex because that will deadlock with a number of different callers.

This changes the retry code to use the extent map cache instead, but
that requires the extent map cache to have the extent we're looking for.
This is a problem because btrfs_drop_extent_cache just drops the entire
extent instead of the little tiny part it is invalidating.

The bulk of the code in this patch changes btrfs_drop_extent_cache to
invalidate only a portion of the extent cache, and changes btrfs_get_extent
to deal with the results.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c |   1 +
 fs/btrfs/extent_io.c   |  12 +++--
 fs/btrfs/file.c        |  47 ++++++++++++++++++-
 fs/btrfs/inode.c       | 120 ++++++++++++++++++++++++++++++++++++++++++++-----
 fs/btrfs/volumes.c     |   3 ++
 5 files changed, 168 insertions(+), 15 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 76fd5d7146e1..593011e5d455 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1747,6 +1747,7 @@ again:
 			       search_start, search_end, hint_byte, ins,
 			       trans->alloc_exclude_start,
 			       trans->alloc_exclude_nr, data);
+
 	if (ret == -ENOSPC && num_bytes > min_alloc_size) {
 		num_bytes = num_bytes >> 1;
 		num_bytes = max(num_bytes, min_alloc_size);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 88322684be6a..21597bea21fd 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1025,7 +1025,8 @@ u64 find_lock_delalloc_range(struct extent_io_tree *tree,
 search_again:
 	node = tree_search(tree, cur_start);
 	if (!node) {
-		*end = (u64)-1;
+		if (!found)
+			*end = (u64)-1;
 		goto out;
 	}
 
@@ -1540,6 +1541,8 @@ static int end_bio_extent_readpage(struct bio *bio,
 							 start, end, state);
 			if (ret == 0) {
 				state = NULL;
+				uptodate =
+					test_bit(BIO_UPTODATE, &bio->bi_flags);
 				continue;
 			}
 		}
@@ -1555,10 +1558,11 @@ static int end_bio_extent_readpage(struct bio *bio,
 				    !(state->state & EXTENT_LOCKED))
 					state = NULL;
 			}
-			if (!state && uptodate) {
+			if (!state) {
 				spin_unlock_irqrestore(&tree->lock, flags);
-				set_extent_uptodate(tree, start, end,
-						    GFP_ATOMIC);
+				if (uptodate)
+					set_extent_uptodate(tree, start, end,
+							    GFP_ATOMIC);
 				unlock_extent(tree, start, end, GFP_ATOMIC);
 				goto next_io;
 			}
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 9fbda6552069..3f5525f0834c 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -356,12 +356,23 @@ out_unlock:
 int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end)
 {
 	struct extent_map *em;
+	struct extent_map *split = NULL;
+	struct extent_map *split2 = NULL;
 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
 	u64 len = end - start + 1;
+	int ret;
+	int testend = 1;
 
-	if (end == (u64)-1)
+	if (end == (u64)-1) {
 		len = (u64)-1;
+		testend = 0;
+	}
 	while(1) {
+		if (!split)
+			split = alloc_extent_map(GFP_NOFS);
+		if (!split2)
+			split2 = alloc_extent_map(GFP_NOFS);
+
 		spin_lock(&em_tree->lock);
 		em = lookup_extent_mapping(em_tree, start, len);
 		if (!em) {
@@ -369,6 +380,36 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end)
 			break;
 		}
 		remove_extent_mapping(em_tree, em);
+
+		if (em->block_start < EXTENT_MAP_LAST_BYTE &&
+		    em->start < start) {
+			split->start = em->start;
+			split->len = start - em->start;
+			split->block_start = em->block_start;
+			split->bdev = em->bdev;
+			split->flags = em->flags;
+			ret = add_extent_mapping(em_tree, split);
+			BUG_ON(ret);
+			free_extent_map(split);
+			split = split2;
+			split2 = NULL;
+		}
+		if (em->block_start < EXTENT_MAP_LAST_BYTE &&
+		    testend && em->start + em->len > start + len) {
+			u64 diff = start + len - em->start;
+
+			split->start = start + len;
+			split->len = em->start + em->len - (start + len);
+			split->bdev = em->bdev;
+			split->flags = em->flags;
+
+			split->block_start = em->block_start + diff;
+
+			ret = add_extent_mapping(em_tree, split);
+			BUG_ON(ret);
+			free_extent_map(split);
+			split = NULL;
+		}
 		spin_unlock(&em_tree->lock);
 
 		/* once for us */
@@ -376,6 +417,10 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end)
 		/* once for the tree*/
 		free_extent_map(em);
 	}
+	if (split)
+		free_extent_map(split);
+	if (split2)
+		free_extent_map(split2);
 	return 0;
 }
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 5632ea760077..40f8da884090 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -122,6 +122,8 @@ static int cow_file_range(struct inode *inode, u64 start, u64 end)
 	if (alloc_hint == EXTENT_MAP_INLINE)
 		goto out;
 
+	BUG_ON(num_bytes > btrfs_super_total_bytes(&root->fs_info->super_copy));
+
 	while(num_bytes > 0) {
 		cur_alloc_size = min(num_bytes, root->fs_info->max_extent);
 		ret = btrfs_alloc_extent(trans, root, cur_alloc_size,
@@ -140,6 +142,11 @@ static int cow_file_range(struct inode *inode, u64 start, u64 end)
 					       ins.offset);
 		inode->i_blocks += ins.offset >> 9;
 		btrfs_check_file(root, inode);
+		if (num_bytes < cur_alloc_size) {
+			printk("num_bytes %Lu cur_alloc %Lu\n", num_bytes,
+			       cur_alloc_size);
+			break;
+		}
 		num_bytes -= cur_alloc_size;
 		alloc_hint = ins.objectid + ins.offset;
 		start += cur_alloc_size;
@@ -427,6 +434,7 @@ int btrfs_readpage_io_failed_hook(struct bio *failed_bio,
 	struct extent_map *em;
 	struct inode *inode = page->mapping->host;
 	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
+	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
 	struct bio *bio;
 	int num_copies;
 	int ret;
@@ -434,7 +442,6 @@ int btrfs_readpage_io_failed_hook(struct bio *failed_bio,
 
 	ret = get_state_private(failure_tree, start, &private);
 	if (ret) {
-		size_t pg_offset = start - page_offset(page);
 		failrec = kmalloc(sizeof(*failrec), GFP_NOFS);
 		if (!failrec)
 			return -ENOMEM;
@@ -442,8 +449,13 @@ int btrfs_readpage_io_failed_hook(struct bio *failed_bio,
 		failrec->len = end - start + 1;
 		failrec->last_mirror = 0;
 
-		em = btrfs_get_extent(inode, NULL, pg_offset, start,
-				      failrec->len, 0);
+		spin_lock(&em_tree->lock);
+		em = lookup_extent_mapping(em_tree, start, failrec->len);
+		if (em->start > start || em->start + em->len < start) {
+			free_extent_map(em);
+			em = NULL;
+		}
+		spin_unlock(&em_tree->lock);
 
 		if (!em || IS_ERR(em)) {
 			kfree(failrec);
@@ -559,6 +571,8 @@ zeroit:
 	flush_dcache_page(page);
 	kunmap_atomic(kaddr, KM_IRQ0);
 	local_irq_restore(flags);
+	if (private == 0)
+		return 0;
 	return -EIO;
 }
 
@@ -908,8 +922,9 @@ static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans,
 	int pending_del_nr = 0;
 	int pending_del_slot = 0;
 	int extent_type = -1;
+	u64 mask = root->sectorsize - 1;
 
-	btrfs_drop_extent_cache(inode, inode->i_size, (u64)-1);
+	btrfs_drop_extent_cache(inode, inode->i_size & (~mask), (u64)-1);
 	path = btrfs_alloc_path();
 	path->reada = -1;
 	BUG_ON(!path);
@@ -1212,7 +1227,7 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
 						       hole_start, 0, 0,
 						       hole_size);
 			btrfs_drop_extent_cache(inode, hole_start,
-						hole_size - 1);
+						(u64)-1);
 			btrfs_check_file(root, inode);
 		}
 		btrfs_end_transaction(trans, root);
@@ -2083,6 +2098,68 @@ out_unlock:
 	return err;
 }
 
+/*
+ * Fold 'em' and an overlapping or adjacent 'existing' map into one mapping.
+ * 'existing' is removed from em_tree (dropping the tree's reference) and
+ * 'em' is grown to span both ranges, then inserted in its place.  Returns
+ * add_extent_mapping()'s result, or -EIO when the maps are disjoint or
+ * their block_start values do not line up.  Caller holds em_tree->lock.
+ */
+static int merge_extent_mapping(struct extent_map_tree *em_tree,
+				struct extent_map *existing,
+				struct extent_map *em)
+{
+	u64 start_diff;
+	u64 new_end;
+	int real_blocks = existing->block_start < EXTENT_MAP_LAST_BYTE;
+
+	/* existing is below EXTENT_MAP_LAST_BYTE, so em has to be as well */
+	if (real_blocks && em->block_start >= EXTENT_MAP_LAST_BYTE)
+		goto invalid;
+
+	/* at or above EXTENT_MAP_LAST_BYTE both must carry the same value */
+	if (!real_blocks && em->block_start != existing->block_start)
+		goto invalid;
+
+	new_end = max(existing->start + existing->len, em->start + em->len);
+
+	if (existing->start >= em->start) {
+		/* em starts first (or at the same offset): only grow its len */
+		if (em->start + em->len < existing->start)
+			goto invalid;
+
+		start_diff = existing->start - em->start;
+		if (real_blocks && em->block_start + start_diff !=
+		    existing->block_start)
+			goto invalid;
+
+		em->len = new_end - em->start;
+	} else {
+		/* existing starts first: em takes over its start and blocks */
+		if (existing->start + existing->len < em->start)
+			goto invalid;
+
+		start_diff = em->start - existing->start;
+		if (real_blocks && existing->block_start + start_diff !=
+		    em->block_start)
+			goto invalid;
+
+		em->block_start = existing->block_start;
+		em->start = existing->start;
+		em->len = new_end - existing->start;
+	}
+	remove_extent_mapping(em_tree, existing);
+	/* free for the tree */
+	free_extent_map(existing);
+	return add_extent_mapping(em_tree, em);
+
+invalid:
+	printk("invalid extent map merge [%Lu %Lu %Lu] [%Lu %Lu %Lu]\n",
+	       existing->start, existing->len, existing->block_start,
+	       em->start, em->len, em->block_start);
+	return -EIO;
+}
+
 struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
 				    size_t pg_offset, u64 start, u64 len,
 				    int create)
@@ -2267,12 +2344,35 @@ insert:
 	err = 0;
 	spin_lock(&em_tree->lock);
 	ret = add_extent_mapping(em_tree, em);
+
+	/* it is possible that someone inserted the extent into the tree
+	 * while we had the lock dropped.  It is also possible that
+	 * an overlapping map exists in the tree
+	 */
 	if (ret == -EEXIST) {
-		free_extent_map(em);
-		em = lookup_extent_mapping(em_tree, start, len);
-		if (!em) {
-			err = -EIO;
-			printk("failing to insert %Lu %Lu\n", start, len);
+		struct extent_map *existing;
+		existing = lookup_extent_mapping(em_tree, start, len);
+		if (!existing) {
+			existing = lookup_extent_mapping(em_tree, em->start,
+							 em->len);
+			if (existing) {
+				err = merge_extent_mapping(em_tree, existing,
+							   em);
+				free_extent_map(existing);
+				if (err) {
+					free_extent_map(em);
+					em = NULL;
+				}
+			} else {
+				err = -EIO;
+				printk("failing to insert %Lu %Lu\n",
+				       start, len);
+				free_extent_map(em);
+				em = NULL;
+			}
+		} else {
+			free_extent_map(em);
+			em = existing;
 		}
 	}
 	spin_unlock(&em_tree->lock);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index e6417a573d44..0e658c1d8211 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -883,6 +883,9 @@ again:
 	spin_lock(&em_tree->lock);
 	em = lookup_extent_mapping(em_tree, logical, *length);
 	spin_unlock(&em_tree->lock);
+	if (!em) {
+		printk("unable to find logical %Lu\n", logical);
+	}
 	BUG_ON(!em);
 
 	BUG_ON(em->start > logical || em->start + em->len < logical);
-- 
cgit v1.2.3


From 7bf3b490dfa57bdef5892dffa5021469a0cd2b7f Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 17 Apr 2008 11:58:30 -0400
Subject: Btrfs: Avoid 64 bit div for RAID10

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 0e658c1d8211..c11b6fd408a6 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -724,7 +724,7 @@ again:
 	if (type & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_DUP))
 		*num_bytes = calc_size;
 	else if (type & BTRFS_BLOCK_GROUP_RAID10)
-		*num_bytes = calc_size * num_stripes / sub_stripes;
+		*num_bytes = calc_size * (num_stripes / sub_stripes);
 	else
 		*num_bytes = calc_size * num_stripes;
 
-- 
cgit v1.2.3


From a443755f1ca3e190e12e3a845ddecb3ee1782512 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 18 Apr 2008 10:29:38 -0400
Subject: Btrfs: Check device uuids along with devids

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h   |  5 +++++
 fs/btrfs/volumes.c | 30 +++++++++++++++++++++++-------
 2 files changed, 28 insertions(+), 7 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index b56ae1950658..d119d95d139a 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -800,6 +800,11 @@ static inline struct btrfs_stripe *btrfs_stripe_nr(struct btrfs_chunk *c,
 	return (struct btrfs_stripe *)offset;
 }
 
+static inline char *btrfs_stripe_dev_uuid_nr(struct btrfs_chunk *c, int nr)
+{
+	return btrfs_stripe_dev_uuid(btrfs_stripe_nr(c, nr));
+}
+
 static inline u64 btrfs_stripe_offset_nr(struct extent_buffer *eb,
 					 struct btrfs_chunk *c, int nr)
 {
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index c11b6fd408a6..cdf0019cca2e 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -69,15 +69,18 @@ int btrfs_cleanup_fs_uuids(void)
 	return 0;
 }
 
-static struct btrfs_device *__find_device(struct list_head *head, u64 devid)
+static struct btrfs_device *__find_device(struct list_head *head, u64 devid,
+					  u8 *uuid)
 {
 	struct btrfs_device *dev;
 	struct list_head *cur;
 
 	list_for_each(cur, head) {
 		dev = list_entry(cur, struct btrfs_device, dev_list);
-		if (dev->devid == devid)
+		if (dev->devid == devid &&
+		    !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE)) {
 			return dev;
+		}
 	}
 	return NULL;
 }
@@ -117,7 +120,8 @@ static int device_list_add(const char *path,
 		fs_devices->num_devices = 0;
 		device = NULL;
 	} else {
-		device = __find_device(&fs_devices->devices, devid);
+		device = __find_device(&fs_devices->devices, devid,
+				       disk_super->dev_item.uuid);
 	}
 	if (!device) {
 		device = kzalloc(sizeof(*device), GFP_NOFS);
@@ -126,6 +130,8 @@ static int device_list_add(const char *path,
 			return -ENOMEM;
 		}
 		device->devid = devid;
+		memcpy(device->uuid, disk_super->dev_item.uuid,
+		       BTRFS_UUID_SIZE);
 		device->barriers = 1;
 		spin_lock_init(&device->io_lock);
 		device->name = kstrdup(path, GFP_NOFS);
@@ -1098,11 +1104,12 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 	return 0;
 }
 
-struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid)
+struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
+				       u8 *uuid)
 {
 	struct list_head *head = &root->fs_info->fs_devices->devices;
 
-	return __find_device(head, devid);
+	return __find_device(head, devid, uuid);
 }
 
 static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
@@ -1115,6 +1122,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
 	u64 logical;
 	u64 length;
 	u64 devid;
+	u8 uuid[BTRFS_UUID_SIZE];
 	int num_stripes;
 	int ret;
 	int i;
@@ -1163,7 +1171,10 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
 		map->stripes[i].physical =
 			btrfs_stripe_offset_nr(leaf, chunk, i);
 		devid = btrfs_stripe_devid_nr(leaf, chunk, i);
-		map->stripes[i].dev = btrfs_find_device(root, devid);
+		read_extent_buffer(leaf, uuid, (unsigned long)
+				   btrfs_stripe_dev_uuid_nr(chunk, i),
+				   BTRFS_UUID_SIZE);
+		map->stripes[i].dev = btrfs_find_device(root, devid, uuid);
 		if (!map->stripes[i].dev) {
 			kfree(map);
 			free_extent_map(em);
@@ -1207,8 +1218,13 @@ static int read_one_dev(struct btrfs_root *root,
 	struct btrfs_device *device;
 	u64 devid;
 	int ret;
+	u8 dev_uuid[BTRFS_UUID_SIZE];
+
 	devid = btrfs_device_id(leaf, dev_item);
-	device = btrfs_find_device(root, devid);
+	read_extent_buffer(leaf, dev_uuid,
+			   (unsigned long)btrfs_device_uuid(dev_item),
+			   BTRFS_UUID_SIZE);
+	device = btrfs_find_device(root, devid, dev_uuid);
 	if (!device) {
 		printk("warning devid %Lu not found already\n", devid);
 		device = kzalloc(sizeof(*device), GFP_NOFS);
-- 
cgit v1.2.3


From 7ae9c09d8f001eb19ee2ba219dc5c3d4f6d60614 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 18 Apr 2008 10:29:49 -0400
Subject: Btrfs: Add support for labels in the super block

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h   |  2 ++
 fs/btrfs/volumes.c | 17 +++++++++--------
 2 files changed, 11 insertions(+), 8 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index d119d95d139a..f00c4be59ad6 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -247,6 +247,7 @@ struct btrfs_header {
  * room to translate 14 chunks with 3 stripes each.
  */
 #define BTRFS_SYSTEM_CHUNK_ARRAY_SIZE 2048
+#define BTRFS_LABEL_SIZE 256
 
 /*
  * the super block basically lists the main trees of the FS
@@ -276,6 +277,7 @@ struct btrfs_super_block {
 	u8 root_level;
 	u8 chunk_root_level;
 	struct btrfs_dev_item dev_item;
+	char label[BTRFS_LABEL_SIZE];
 	u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
 } __attribute__ ((__packed__));
 
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index cdf0019cca2e..93aa36e2436e 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -57,9 +57,7 @@ int btrfs_cleanup_fs_uuids(void)
 			devices_cur = fs_devices->devices.next;
 			dev = list_entry(devices_cur, struct btrfs_device,
 					 dev_list);
-			printk("uuid cleanup finds %s\n", dev->name);
 			if (dev->bdev) {
-				printk("closing\n");
 				close_bdev_excl(dev->bdev);
 			}
 			list_del(&dev->dev_list);
@@ -149,7 +147,6 @@ static int device_list_add(const char *path,
 	}
 	if (fs_devices->lowest_devid > devid) {
 		fs_devices->lowest_devid = devid;
-		printk("lowest devid now %Lu\n", devid);
 	}
 	*fs_devices_ret = fs_devices;
 	return 0;
@@ -166,7 +163,6 @@ int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
 		device = list_entry(cur, struct btrfs_device, dev_list);
 		if (device->bdev) {
 			close_bdev_excl(device->bdev);
-			printk("close devices closes %s\n", device->name);
 		}
 		device->bdev = NULL;
 	}
@@ -220,11 +216,9 @@ int btrfs_scan_one_device(const char *path, int flags, void *holder,
 
 	mutex_lock(&uuid_mutex);
 
-	printk("scan one opens %s\n", path);
 	bdev = open_bdev_excl(path, flags, holder);
 
 	if (IS_ERR(bdev)) {
-		printk("open failed\n");
 		ret = PTR_ERR(bdev);
 		goto error;
 	}
@@ -240,13 +234,20 @@ int btrfs_scan_one_device(const char *path, int flags, void *holder,
 	disk_super = (struct btrfs_super_block *)bh->b_data;
 	if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC,
 	    sizeof(disk_super->magic))) {
-		printk("no btrfs found on %s\n", path);
 		ret = -EINVAL;
 		goto error_brelse;
 	}
 	devid = le64_to_cpu(disk_super->dev_item.devid);
 	transid = btrfs_super_generation(disk_super);
-	printk("found device %Lu transid %Lu on %s\n", devid, transid, path);
+	if (disk_super->label[0])
+		printk("device label %s ", disk_super->label);
+	else {
+		/* FIXME, make a real uuid parser */
+		printk("device fsid %llx-%llx ",
+		       *(unsigned long long *)disk_super->fsid,
+		       *(unsigned long long *)(disk_super->fsid + 8));
+	}
+	printk("devid %Lu transid %Lu %s\n", devid, transid, path);
 	ret = device_list_add(path, disk_super, devid, fs_devices_ret);
 
 error_brelse:
-- 
cgit v1.2.3


From 9b3f68b90674419add8be1c0aa740dcdf04f44cc Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 18 Apr 2008 10:29:51 -0400
Subject: Btrfs: Calculate appropriate chunk sizes for both small and large
 filesystems

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 61 insertions(+), 7 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 93aa36e2436e..e3ddd7fb8edd 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -627,6 +627,27 @@ int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
+static u64 div_factor(u64 num, int factor)
+{
+	if (factor != 10) {	/* 10 tenths is all of num: skip the math */
+		num *= factor;
+		do_div(num, 10);
+	}
+	return num;
+}
+
+/* chunk length given num_stripes device extents of calc_size bytes each */
+static u64 chunk_bytes_by_type(u64 type, u64 calc_size, int num_stripes,
+			       int sub_stripes)
+{
+	if (type & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_DUP))
+		return calc_size;		/* every stripe is a copy */
+	if (type & BTRFS_BLOCK_GROUP_RAID10)
+		num_stripes /= sub_stripes;	/* sub_stripes copies each */
+	return calc_size * num_stripes;
+}
+
+
 int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 		      struct btrfs_root *extent_root, u64 *start,
 		      u64 *num_bytes, u64 type)
@@ -643,11 +664,14 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	struct extent_map_tree *em_tree;
 	struct map_lookup *map;
 	struct extent_map *em;
+	int min_chunk_size = 8 * 1024 * 1024;
 	u64 physical;
 	u64 calc_size = 1024 * 1024 * 1024;
-	u64 min_free = calc_size;
+	u64 max_chunk_size = calc_size;
+	u64 min_free;
 	u64 avail;
 	u64 max_avail = 0;
+	u64 percent_max;
 	int num_stripes = 1;
 	int sub_stripes = 0;
 	int looped = 0;
@@ -666,6 +690,8 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
 		num_stripes = min_t(u64, 2,
 				  btrfs_super_num_devices(&info->super_copy));
+		if (num_stripes < 2)
+			return -ENOSPC;
 	}
 	if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
 		num_stripes = btrfs_super_num_devices(&info->super_copy);
@@ -674,13 +700,45 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 		num_stripes &= ~(u32)1;
 		sub_stripes = 2;
 	}
+
+	if (type & BTRFS_BLOCK_GROUP_DATA) {
+		max_chunk_size = 10 * calc_size;
+		min_chunk_size = 256 * 1024 * 1024;
+	} else if (type & BTRFS_BLOCK_GROUP_METADATA) {
+		max_chunk_size = 4 * calc_size;
+		min_chunk_size = 64 * 1024 * 1024;
+	} else {
+		min_chunk_size = 32 * 1024 * 1024;
+	}
+
+	/* we don't want a chunk larger than 10% of the FS */
+	percent_max = div_factor(btrfs_super_total_bytes(&info->super_copy), 1);
+	max_chunk_size = min(percent_max, max_chunk_size);
+
+	if (calc_size * num_stripes > max_chunk_size) {
+		calc_size = max_chunk_size;
+		do_div(calc_size, num_stripes);
+		do_div(calc_size, stripe_len);
+		calc_size *= stripe_len;
+	}
+	/* we don't want tiny stripes */
+	*num_bytes = chunk_bytes_by_type(type, calc_size,
+					 num_stripes, sub_stripes);
+	calc_size = max_t(u64, chunk_bytes_by_type(type, min_chunk_size,
+		          num_stripes, sub_stripes), calc_size);
+
 again:
+	do_div(calc_size, stripe_len);
+	calc_size *= stripe_len;
+
 	INIT_LIST_HEAD(&private_devs);
 	cur = dev_list->next;
 	index = 0;
 
 	if (type & BTRFS_BLOCK_GROUP_DUP)
 		min_free = calc_size * 2;
+	else
+		min_free = calc_size;
 
 	/* build a private list of devices we will allocate from */
 	while(index < num_stripes) {
@@ -727,13 +785,9 @@ again:
 	}
 
 	stripes = &chunk->stripe;
+	*num_bytes = chunk_bytes_by_type(type, calc_size,
+					 num_stripes, sub_stripes);
 
-	if (type & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_DUP))
-		*num_bytes = calc_size;
-	else if (type & BTRFS_BLOCK_GROUP_RAID10)
-		*num_bytes = calc_size * (num_stripes / sub_stripes);
-	else
-		*num_bytes = calc_size * num_stripes;
 
 	index = 0;
 printk("new chunk type %Lu start %Lu size %Lu\n", type, key.offset, *num_bytes);
-- 
cgit v1.2.3


From a40a90a0420abd5ff86a0917facd3293ebb6a9b6 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 18 Apr 2008 11:55:51 -0400
Subject: Btrfs: Fix chunk allocation when some devices don't have enough room
 for stripes

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 45 +++++++++++++++++++++++++++++----------------
 1 file changed, 29 insertions(+), 16 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index e3ddd7fb8edd..fe5b00986d22 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -664,7 +664,7 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	struct extent_map_tree *em_tree;
 	struct map_lookup *map;
 	struct extent_map *em;
-	int min_chunk_size = 8 * 1024 * 1024;
+	int min_stripe_size = 1 * 1024 * 1024;
 	u64 physical;
 	u64 calc_size = 1024 * 1024 * 1024;
 	u64 max_chunk_size = calc_size;
@@ -673,6 +673,7 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	u64 max_avail = 0;
 	u64 percent_max;
 	int num_stripes = 1;
+	int min_stripes = 1;
 	int sub_stripes = 0;
 	int looped = 0;
 	int ret;
@@ -683,15 +684,20 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	if (list_empty(dev_list))
 		return -ENOSPC;
 
-	if (type & (BTRFS_BLOCK_GROUP_RAID0))
+	if (type & (BTRFS_BLOCK_GROUP_RAID0)) {
 		num_stripes = btrfs_super_num_devices(&info->super_copy);
-	if (type & (BTRFS_BLOCK_GROUP_DUP))
+		min_stripes = 2;
+	}
+	if (type & (BTRFS_BLOCK_GROUP_DUP)) {
 		num_stripes = 2;
+		min_stripes = 2;
+	}
 	if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
 		num_stripes = min_t(u64, 2,
 				  btrfs_super_num_devices(&info->super_copy));
 		if (num_stripes < 2)
 			return -ENOSPC;
+		min_stripes = 2;
 	}
 	if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
 		num_stripes = btrfs_super_num_devices(&info->super_copy);
@@ -699,22 +705,26 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 			return -ENOSPC;
 		num_stripes &= ~(u32)1;
 		sub_stripes = 2;
+		min_stripes = 4;
 	}
 
 	if (type & BTRFS_BLOCK_GROUP_DATA) {
 		max_chunk_size = 10 * calc_size;
-		min_chunk_size = 256 * 1024 * 1024;
+		min_stripe_size = 64 * 1024 * 1024;
 	} else if (type & BTRFS_BLOCK_GROUP_METADATA) {
 		max_chunk_size = 4 * calc_size;
-		min_chunk_size = 64 * 1024 * 1024;
-	} else {
-		min_chunk_size = 32 * 1024 * 1024;
+		min_stripe_size = 32 * 1024 * 1024;
+	} else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
+		calc_size = 8 * 1024 * 1024;
+		max_chunk_size = calc_size * 2;
+		min_stripe_size = 1 * 1024 * 1024;
 	}
 
 	/* we don't want a chunk larger than 10% of the FS */
 	percent_max = div_factor(btrfs_super_total_bytes(&info->super_copy), 1);
 	max_chunk_size = min(percent_max, max_chunk_size);
 
+again:
 	if (calc_size * num_stripes > max_chunk_size) {
 		calc_size = max_chunk_size;
 		do_div(calc_size, num_stripes);
@@ -722,12 +732,8 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 		calc_size *= stripe_len;
 	}
 	/* we don't want tiny stripes */
-	*num_bytes = chunk_bytes_by_type(type, calc_size,
-					 num_stripes, sub_stripes);
-	calc_size = max_t(u64, chunk_bytes_by_type(type, min_chunk_size,
-		          num_stripes, sub_stripes), calc_size);
+	calc_size = max_t(u64, min_stripe_size, calc_size);
 
-again:
 	do_div(calc_size, stripe_len);
 	calc_size *= stripe_len;
 
@@ -746,19 +752,27 @@ again:
 
 		avail = device->total_bytes - device->bytes_used;
 		cur = cur->next;
-		if (avail > max_avail)
-			max_avail = avail;
 		if (avail >= min_free) {
 			list_move_tail(&device->dev_list, &private_devs);
 			index++;
 			if (type & BTRFS_BLOCK_GROUP_DUP)
 				index++;
-		}
+		} else if (avail > max_avail)
+			max_avail = avail;
 		if (cur == dev_list)
 			break;
 	}
 	if (index < num_stripes) {
 		list_splice(&private_devs, dev_list);
+		if (index >= min_stripes) {
+			num_stripes = index;
+			if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
+				num_stripes /= sub_stripes;
+				num_stripes *= sub_stripes;
+			}
+			looped = 1;
+			goto again;
+		}
 		if (!looped && max_avail > 0) {
 			looped = 1;
 			calc_size = max_avail;
@@ -766,7 +780,6 @@ again:
 		}
 		return -ENOSPC;
 	}
-
 	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
 	key.type = BTRFS_CHUNK_ITEM_KEY;
 	ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID,
-- 
cgit v1.2.3


From ad5bd91ecee2b687071da00094564fd075f934ee Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 21 Apr 2008 08:28:10 -0400
Subject: Btrfs: Add 1MB to the min_free in alloc_chunk

This properly reflects the first 1MB we skip at the start of the device

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index fe5b00986d22..4fc92cf813d8 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -746,6 +746,9 @@ again:
 	else
 		min_free = calc_size;
 
+	/* we add 1MB because we never use the first 1MB of the device */
+	min_free += 1024 * 1024;
+
 	/* build a private list of devices we will allocate from */
 	while(index < num_stripes) {
 		device = list_entry(cur, struct btrfs_device, dev_list);
-- 
cgit v1.2.3


From f2d8d74d7874f8f81222363cd6459a365796e35a Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 21 Apr 2008 10:03:05 -0400
Subject: Btrfs: Make an unplug function that doesn't unplug every spindle

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c | 26 ++++++++++--------
 fs/btrfs/inode.c   |  6 +----
 fs/btrfs/volumes.c | 79 +++++++++++++++++++++++++++++++++++++++---------------
 fs/btrfs/volumes.h |  2 ++
 4 files changed, 75 insertions(+), 38 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index c829612c797e..7f5aca35494d 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -913,18 +913,22 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits)
 
 void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
 {
-	struct list_head *cur;
-	struct btrfs_device *device;
-	struct btrfs_fs_info *info;
+	struct inode *inode = page->mapping->host;
+	struct extent_map_tree *em_tree;
+	struct extent_map *em;
+	u64 offset = page_offset(page);
 
-	info = (struct btrfs_fs_info *)bdi->unplug_io_data;
-	list_for_each(cur, &info->fs_devices->devices) {
-		device = list_entry(cur, struct btrfs_device, dev_list);
-		bdi = blk_get_backing_dev_info(device->bdev);
-		if (bdi->unplug_io_fn) {
-			bdi->unplug_io_fn(bdi, page);
-		}
-	}
+	em_tree = &BTRFS_I(inode)->extent_tree;
+	spin_lock(&em_tree->lock);
+	em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE);
+	spin_unlock(&em_tree->lock);
+	if (!em)
+		return;
+	/* unplug only the devices backing this page's logical address */
+	offset = offset - em->start;
+	btrfs_unplug_page(&BTRFS_I(inode)->root->fs_info->mapping_tree,
+			  em->block_start + offset, page);
+	free_extent_map(em);
 }
 
 static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 5b1514e58de4..e875c7c8a647 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -313,13 +313,9 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
 	u64 logical = bio->bi_sector << 9;
 	u64 length = 0;
 	u64 map_length;
-	struct bio_vec *bvec;
-	int i;
 	int ret;
 
-	bio_for_each_segment(bvec, bio, i) {
-		length += bvec->bv_len;
-	}
+	length = bio->bi_size;
 	map_tree = &root->fs_info->mapping_tree;
 	map_length = length;
 	ret = btrfs_map_block(map_tree, READ, logical,
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 4fc92cf813d8..460240706505 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -18,6 +18,7 @@
 #include <linux/sched.h>
 #include <linux/bio.h>
 #include <linux/buffer_head.h>
+#include <linux/blkdev.h>
 #include <asm/div64.h>
 #include "ctree.h"
 #include "extent_map.h"
@@ -930,9 +931,10 @@ int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
 	return ret;
 }
 
-int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
-		    u64 logical, u64 *length,
-		    struct btrfs_multi_bio **multi_ret, int mirror_num)
+static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
+			     u64 logical, u64 *length,
+			     struct btrfs_multi_bio **multi_ret,
+			     int mirror_num, struct page *unplug_page)
 {
 	struct extent_map *em;
 	struct map_lookup *map;
@@ -944,6 +946,7 @@ int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 	int stripes_required = 1;
 	int stripe_index;
 	int i;
+	int num_stripes;
 	struct btrfs_multi_bio *multi = NULL;
 
 	if (multi_ret && !(rw & (1 << BIO_RW))) {
@@ -960,10 +963,14 @@ again:
 	spin_lock(&em_tree->lock);
 	em = lookup_extent_mapping(em_tree, logical, *length);
 	spin_unlock(&em_tree->lock);
+
+	if (!em && unplug_page)
+		return 0;
+
 	if (!em) {
 		printk("unable to find logical %Lu\n", logical);
+		BUG();
 	}
-	BUG_ON(!em);
 
 	BUG_ON(em->start > logical || em->start + em->len < logical);
 	map = (struct map_lookup *)em->bdev;
@@ -1010,14 +1017,15 @@ again:
 	} else {
 		*length = em->len - offset;
 	}
-	if (!multi_ret)
+
+	if (!multi_ret && !unplug_page)
 		goto out;
 
-	multi->num_stripes = 1;
+	num_stripes = 1;
 	stripe_index = 0;
 	if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
-		if (rw & (1 << BIO_RW))
-			multi->num_stripes = map->num_stripes;
+		if (unplug_page || (rw & (1 << BIO_RW)))
+			num_stripes = map->num_stripes;
 		else if (mirror_num) {
 			stripe_index = mirror_num - 1;
 		} else {
@@ -1037,7 +1045,7 @@ again:
 		}
 	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
 		if (rw & (1 << BIO_RW))
-			multi->num_stripes = map->num_stripes;
+			num_stripes = map->num_stripes;
 		else if (mirror_num)
 			stripe_index = mirror_num - 1;
 	} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
@@ -1047,8 +1055,8 @@ again:
 		stripe_index = do_div(stripe_nr, factor);
 		stripe_index *= map->sub_stripes;
 
-		if (rw & (1 << BIO_RW))
-			multi->num_stripes = map->sub_stripes;
+		if (unplug_page || (rw & (1 << BIO_RW)))
+			num_stripes = map->sub_stripes;
 		else if (mirror_num)
 			stripe_index += mirror_num - 1;
 		else
@@ -1063,19 +1071,50 @@ again:
 	}
 	BUG_ON(stripe_index >= map->num_stripes);
 
-	for (i = 0; i < multi->num_stripes; i++) {
-		multi->stripes[i].physical =
-			map->stripes[stripe_index].physical + stripe_offset +
-			stripe_nr * map->stripe_len;
-		multi->stripes[i].dev = map->stripes[stripe_index].dev;
+	for (i = 0; i < num_stripes; i++) {
+		if (unplug_page) {
+			struct btrfs_device *device;
+			struct backing_dev_info *bdi;
+
+			device = map->stripes[stripe_index].dev;
+			bdi = blk_get_backing_dev_info(device->bdev);
+			if (bdi->unplug_io_fn) {
+				bdi->unplug_io_fn(bdi, unplug_page);
+			}
+		} else {
+			multi->stripes[i].physical =
+				map->stripes[stripe_index].physical +
+				stripe_offset + stripe_nr * map->stripe_len;
+			multi->stripes[i].dev = map->stripes[stripe_index].dev;
+		}
 		stripe_index++;
 	}
-	*multi_ret = multi;
+	if (multi_ret) {
+		*multi_ret = multi;
+		multi->num_stripes = num_stripes;
+	}
 out:
 	free_extent_map(em);
 	return 0;
 }
 
+/* plain mapping entry point: same lookup with no page to unplug */
+int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
+		      u64 logical, u64 *length,
+		      struct btrfs_multi_bio **multi_ret, int mirror_num)
+{
+	return __btrfs_map_block(map_tree, rw, logical, length,
+				 multi_ret, mirror_num, NULL);
+}
+
+/* call unplug_io_fn on the devices backing one page at this logical address */
+int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree,
+		      u64 logical, struct page *page)
+{
+	u64 len = PAGE_CACHE_SIZE;
+	return __btrfs_map_block(map_tree, READ, logical, &len, NULL, 0, page);
+}
+
 #if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
 static void end_bio_multi_stripe(struct bio *bio, int err)
 #else
@@ -1122,16 +1161,12 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 	u64 logical = bio->bi_sector << 9;
 	u64 length = 0;
 	u64 map_length;
-	struct bio_vec *bvec;
 	struct btrfs_multi_bio *multi = NULL;
-	int i;
 	int ret;
 	int dev_nr = 0;
 	int total_devs = 1;
 
-	bio_for_each_segment(bvec, bio, i) {
-		length += bvec->bv_len;
-	}
+	length = bio->bi_size;
 
 	map_tree = &root->fs_info->mapping_tree;
 	map_length = length;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index f9cae3072171..dfe7e8a9507c 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -119,4 +119,6 @@ int btrfs_add_device(struct btrfs_trans_handle *trans,
 		     struct btrfs_device *device);
 int btrfs_cleanup_fs_uuids(void);
 int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len);
+int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree,
+		      u64 logical, struct page *page);
 #endif
-- 
cgit v1.2.3


From 3c12ac7205d4bd679fefa722aa9b61385e4b4749 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 21 Apr 2008 12:01:38 -0400
Subject: Btrfs: Simplify device selection for mirrored reads

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c |  8 ++++----
 fs/btrfs/volumes.c     | 23 +++++++----------------
 2 files changed, 11 insertions(+), 20 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 593011e5d455..cc0d7f30c36b 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -640,7 +640,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 	if (!path)
 		return -ENOMEM;
 
-	path->reada = 0;
+	path->reada = 1;
 	key.objectid = bytenr;
 	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
 	key.offset = num_bytes;
@@ -660,7 +660,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 
 	btrfs_release_path(root->fs_info->extent_root, path);
 
-	path->reada = 0;
+	path->reada = 1;
 	ret = btrfs_insert_extent_backref(trans, root->fs_info->extent_root,
 					  path, bytenr, root_objectid,
 					  ref_generation, owner, owner_offset);
@@ -692,7 +692,7 @@ static int lookup_extent_ref(struct btrfs_trans_handle *trans,
 
 	WARN_ON(num_bytes < root->sectorsize);
 	path = btrfs_alloc_path();
-	path->reada = 0;
+	path->reada = 1;
 	key.objectid = bytenr;
 	key.offset = num_bytes;
 	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
@@ -1328,7 +1328,7 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 	if (!path)
 		return -ENOMEM;
 
-	path->reada = 0;
+	path->reada = 1;
 	ret = lookup_extent_backref(trans, extent_root, path,
 				    bytenr, root_objectid,
 				    ref_generation,
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 460240706505..107fc74c3ab8 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1029,19 +1029,8 @@ again:
 		else if (mirror_num) {
 			stripe_index = mirror_num - 1;
 		} else {
-			int i;
-			u64 least = (u64)-1;
-			struct btrfs_device *cur;
-
-			for (i = 0; i < map->num_stripes; i++) {
-				cur = map->stripes[i].dev;
-				spin_lock(&cur->io_lock);
-				if (cur->total_ios < least) {
-					least = cur->total_ios;
-					stripe_index = i;
-				}
-				spin_unlock(&cur->io_lock);
-			}
+			u64 nr = stripe_nr; /* local num_stripes == 1 here */
+			stripe_index = do_div(nr, map->num_stripes);
 		}
 	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
 		if (rw & (1 << BIO_RW))
@@ -1050,7 +1039,6 @@ again:
 			stripe_index = mirror_num - 1;
 	} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
 		int factor = map->num_stripes / map->sub_stripes;
-		int orig_stripe_nr = stripe_nr;
 
 		stripe_index = do_div(stripe_nr, factor);
 		stripe_index *= map->sub_stripes;
@@ -1059,8 +1047,11 @@ again:
 			num_stripes = map->sub_stripes;
 		else if (mirror_num)
 			stripe_index += mirror_num - 1;
-		else
-			stripe_index += orig_stripe_nr % map->sub_stripes;
+		else {
+			u64 orig_stripe_nr = stripe_nr;
+			stripe_index += do_div(orig_stripe_nr,
+					       map->sub_stripes);
+		}
 	} else {
 		/*
 		 * after this do_div call, stripe_nr is the number of stripes
-- 
cgit v1.2.3


From b30757178dad19a0388d958ff9eea66e674d39ed Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 22 Apr 2008 09:22:07 -0400
Subject: Btrfs: Add a special device list for chunk allocations

This allows other code that needs to walk every device in the FS to do so
without locking against allocations.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 15 ++++++++++-----
 fs/btrfs/volumes.h |  7 +++++++
 2 files changed, 17 insertions(+), 5 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 107fc74c3ab8..5619e50583e3 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -111,6 +111,7 @@ static int device_list_add(const char *path,
 		if (!fs_devices)
 			return -ENOMEM;
 		INIT_LIST_HEAD(&fs_devices->devices);
+		INIT_LIST_HEAD(&fs_devices->alloc_list);
 		list_add(&fs_devices->list, &fs_uuids);
 		memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
 		fs_devices->latest_devid = devid;
@@ -139,6 +140,7 @@ static int device_list_add(const char *path,
 			return -ENOMEM;
 		}
 		list_add(&device->dev_list, &fs_devices->devices);
+		list_add(&device->dev_alloc_list, &fs_devices->alloc_list);
 		fs_devices->num_devices++;
 	}
 
@@ -660,7 +662,7 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	struct btrfs_device *device = NULL;
 	struct btrfs_chunk *chunk;
 	struct list_head private_devs;
-	struct list_head *dev_list = &extent_root->fs_info->fs_devices->devices;
+	struct list_head *dev_list;
 	struct list_head *cur;
 	struct extent_map_tree *em_tree;
 	struct map_lookup *map;
@@ -682,6 +684,7 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	int stripe_len = 64 * 1024;
 	struct btrfs_key key;
 
+	dev_list = &extent_root->fs_info->fs_devices->alloc_list;
 	if (list_empty(dev_list))
 		return -ENOSPC;
 
@@ -752,12 +755,12 @@ again:
 
 	/* build a private list of devices we will allocate from */
 	while(index < num_stripes) {
-		device = list_entry(cur, struct btrfs_device, dev_list);
+		device = list_entry(cur, struct btrfs_device, dev_alloc_list);
 
 		avail = device->total_bytes - device->bytes_used;
 		cur = cur->next;
 		if (avail >= min_free) {
-			list_move_tail(&device->dev_list, &private_devs);
+			list_move_tail(&device->dev_alloc_list, &private_devs);
 			index++;
 			if (type & BTRFS_BLOCK_GROUP_DUP)
 				index++;
@@ -812,12 +815,12 @@ printk("new chunk type %Lu start %Lu size %Lu\n", type, key.offset, *num_bytes);
 		struct btrfs_stripe *stripe;
 		BUG_ON(list_empty(&private_devs));
 		cur = private_devs.next;
-		device = list_entry(cur, struct btrfs_device, dev_list);
+		device = list_entry(cur, struct btrfs_device, dev_alloc_list);
 
 		/* loop over this device again if we're doing a dup group */
 		if (!(type & BTRFS_BLOCK_GROUP_DUP) ||
 		    (index == num_stripes - 1))
-			list_move_tail(&device->dev_list, dev_list);
+			list_move_tail(&device->dev_alloc_list, dev_list);
 
 		ret = btrfs_alloc_dev_extent(trans, device,
 			     info->chunk_root->root_key.objectid,
@@ -1329,6 +1332,8 @@ static int read_one_dev(struct btrfs_root *root,
 			return -ENOMEM;
 		list_add(&device->dev_list,
 			 &root->fs_info->fs_devices->devices);
+		list_add(&device->dev_alloc_list,
+			 &root->fs_info->fs_devices->alloc_list);
 		device->barriers = 1;
 		spin_lock_init(&device->io_lock);
 	}
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index dfe7e8a9507c..a2660d2b6b34 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -24,9 +24,12 @@
 struct buffer_head;
 struct btrfs_device {
 	struct list_head dev_list;
+	struct list_head dev_alloc_list;
 	struct btrfs_root *dev_root;
 	struct buffer_head *pending_io;
+
 	int barriers;
+
 	spinlock_t io_lock;
 
 	struct block_device *bdev;
@@ -70,7 +73,11 @@ struct btrfs_fs_devices {
 	u64 num_devices;
 	struct block_device *latest_bdev;
 	struct block_device *lowest_bdev;
+	/* all of the devices in the FS */
 	struct list_head devices;
+
+	/* devices not currently being allocated */
+	struct list_head alloc_list;
 	struct list_head list;
 };
 
-- 
cgit v1.2.3


From e1c4b7451e22f5b0a9fbccfa560ee7b80c35b8cd Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 22 Apr 2008 13:26:46 -0400
Subject: Fix btrfs_get_extent and get_block corner cases, and disable O_DIRECT
 reads

The generic O_DIRECT code assumes all the bios have the same bdev,
which isn't true for multi-device btrfs.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent_io.c |  1 +
 fs/btrfs/inode.c     | 32 +++++++++++++++++++++-----------
 fs/btrfs/volumes.c   |  2 +-
 3 files changed, 23 insertions(+), 12 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 81f8b4fd069b..6e4bf029c6d4 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1694,6 +1694,7 @@ extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
 	}
 
 	if (bio) {
+		bio->bi_size = 0;
 		bio->bi_bdev = bdev;
 		bio->bi_sector = first_sector;
 	}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index e875c7c8a647..abfe86df02d2 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -494,6 +494,7 @@ int btrfs_readpage_io_failed_hook(struct bio *failed_bio,
 	bio->bi_end_io = failed_bio->bi_end_io;
 	bio->bi_sector = failrec->logical >> 9;
 	bio->bi_bdev = failed_bio->bi_bdev;
+	bio->bi_size = 0;
 	bio_add_page(bio, page, failrec->len, start - page_offset(page));
 	btrfs_submit_bio_hook(inode, READ, bio, failrec->last_mirror);
 	return 0;
@@ -2187,12 +2188,9 @@ again:
 	spin_unlock(&em_tree->lock);
 
 	if (em) {
-		if (em->start > start) {
-			printk("get_extent lookup [%Lu %Lu] em [%Lu %Lu]\n",
-			       start, len, em->start, em->len);
-			WARN_ON(1);
-		}
-		if (em->block_start == EXTENT_MAP_INLINE && page)
+		if (em->start > start || em->start + em->len <= start)
+			free_extent_map(em);
+		else if (em->block_start == EXTENT_MAP_INLINE && page)
 			free_extent_map(em);
 		else
 			goto out;
@@ -2340,7 +2338,6 @@ insert:
 	err = 0;
 	spin_lock(&em_tree->lock);
 	ret = add_extent_mapping(em_tree, em);
-
 	/* it is possible that someone inserted the extent into the tree
 	 * while we had the lock dropped.  It is also possible that
 	 * an overlapping map exists in the tree
@@ -2348,6 +2345,11 @@ insert:
 	if (ret == -EEXIST) {
 		struct extent_map *existing;
 		existing = lookup_extent_mapping(em_tree, start, len);
+		if (existing && (existing->start > start ||
+		    existing->start + existing->len <= start)) {
+			free_extent_map(existing);
+			existing = NULL;
+		}
 		if (!existing) {
 			existing = lookup_extent_mapping(em_tree, em->start,
 							 em->len);
@@ -2388,6 +2390,7 @@ out:
 	return em;
 }
 
+#if 0 /* waiting for O_DIRECT reads */
 static int btrfs_get_block(struct inode *inode, sector_t iblock,
 			struct buffer_head *bh_result, int create)
 {
@@ -2405,22 +2408,24 @@ static int btrfs_get_block(struct inode *inode, sector_t iblock,
 	if (!em || IS_ERR(em))
 		goto out;
 
-	if (em->start > start || em->start + em->len <= start)
+	if (em->start > start || em->start + em->len <= start) {
 	    goto out;
+	}
 
 	if (em->block_start == EXTENT_MAP_INLINE) {
 		ret = -EINVAL;
 		goto out;
 	}
 
+	len = em->start + em->len - start;
+	len = min_t(u64, len, INT_LIMIT(typeof(bh_result->b_size)));
+
 	if (em->block_start == EXTENT_MAP_HOLE ||
 	    em->block_start == EXTENT_MAP_DELALLOC) {
+		bh_result->b_size = len;
 		goto out;
 	}
 
-	len = em->start + em->len - start;
-	len = min_t(u64, len, INT_LIMIT(typeof(bh_result->b_size)));
-
 	logical = start - em->start;
 	logical = em->block_start + logical;
 
@@ -2430,6 +2435,7 @@ static int btrfs_get_block(struct inode *inode, sector_t iblock,
 	BUG_ON(ret);
 	bh_result->b_blocknr = multi->stripes[0].physical >> inode->i_blkbits;
 	bh_result->b_size = min(map_length, len);
+
 	bh_result->b_bdev = multi->stripes[0].dev->bdev;
 	set_buffer_mapped(bh_result);
 	kfree(multi);
@@ -2437,11 +2443,14 @@ out:
 	free_extent_map(em);
 	return ret;
 }
+#endif
 
 static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
 			const struct iovec *iov, loff_t offset,
 			unsigned long nr_segs)
 {
+	return -EINVAL;
+#if 0
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_mapping->host;
 
@@ -2450,6 +2459,7 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
 
 	return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
 				  offset, nr_segs, btrfs_get_block, NULL);
+#endif
 }
 
 static sector_t btrfs_bmap(struct address_space *mapping, sector_t iblock)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 5619e50583e3..bccb5566fd84 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1161,7 +1161,6 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 	int total_devs = 1;
 
 	length = bio->bi_size;
-
 	map_tree = &root->fs_info->mapping_tree;
 	map_length = length;
 
@@ -1192,6 +1191,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 		}
 		bio->bi_sector = multi->stripes[dev_nr].physical >> 9;
 		dev = multi->stripes[dev_nr].dev;
+
 		bio->bi_bdev = dev->bdev;
 		spin_lock(&dev->io_lock);
 		dev->total_ios++;
-- 
cgit v1.2.3


From 84eed90fac1b927a2657ff3bb7a0f18b9cb688f7 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 25 Apr 2008 09:04:37 -0400
Subject: Btrfs: Add failure handling for read_sys_array

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c | 11 +++++++++--
 fs/btrfs/volumes.c | 16 +++++++++-------
 2 files changed, 18 insertions(+), 9 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 1281c393c7e6..71838264ca6b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1278,7 +1278,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	mutex_lock(&fs_info->fs_mutex);
 
 	ret = btrfs_read_sys_array(tree_root);
-	BUG_ON(ret);
+	if (ret) {
+		printk("btrfs: failed to read the system array on %s\n",
+		       sb->s_id);
+		goto fail_sys_array;
+	}
 
 	blocksize = btrfs_level_size(tree_root,
 				     btrfs_super_chunk_root_level(disk_super));
@@ -1335,8 +1339,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 fail_extent_root:
 	free_extent_buffer(extent_root->node);
 fail_tree_root:
-	mutex_unlock(&fs_info->fs_mutex);
 	free_extent_buffer(tree_root->node);
+fail_sys_array:
+	mutex_unlock(&fs_info->fs_mutex);
 fail_sb_buffer:
 	free_extent_buffer(fs_info->sb_buffer);
 	extent_io_tree_empty_lru(&BTRFS_I(fs_info->btree_inode)->io_tree);
@@ -1344,6 +1349,8 @@ fail_iput:
 	iput(fs_info->btree_inode);
 fail:
 	close_all_devices(fs_info);
+	btrfs_mapping_tree_free(&fs_info->mapping_tree);
+
 	kfree(extent_root);
 	kfree(tree_root);
 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,23)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index bccb5566fd84..c63a982e31d0 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1365,14 +1365,14 @@ int btrfs_read_sys_array(struct btrfs_root *root)
 	struct extent_buffer *sb = root->fs_info->sb_buffer;
 	struct btrfs_disk_key *disk_key;
 	struct btrfs_chunk *chunk;
-	struct btrfs_key key;
+	u8 *ptr;
+	unsigned long sb_ptr;
+	int ret = 0;
 	u32 num_stripes;
 	u32 array_size;
 	u32 len = 0;
-	u8 *ptr;
-	unsigned long sb_ptr;
 	u32 cur;
-	int ret;
+	struct btrfs_key key;
 
 	array_size = btrfs_super_sys_array_size(super_copy);
 
@@ -1397,17 +1397,19 @@ int btrfs_read_sys_array(struct btrfs_root *root)
 		if (key.type == BTRFS_CHUNK_ITEM_KEY) {
 			chunk = (struct btrfs_chunk *)sb_ptr;
 			ret = read_one_chunk(root, &key, sb, chunk);
-			BUG_ON(ret);
+			if (ret)
+				break;
 			num_stripes = btrfs_chunk_num_stripes(sb, chunk);
 			len = btrfs_chunk_item_size(num_stripes);
 		} else {
-			BUG();
+			ret = -EIO;
+			break;
 		}
 		ptr += len;
 		sb_ptr += len;
 		cur += len;
 	}
-	return 0;
+	return ret;
 }
 
 int btrfs_read_chunk_tree(struct btrfs_root *root)
-- 
cgit v1.2.3


From 8f18cf13396caae5a3d7ae91201cfb15181a9642 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 25 Apr 2008 16:53:30 -0400
Subject: Btrfs: Make the resizer work based on shrinking and growing devices

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       |   1 +
 fs/btrfs/disk-io.c     |   4 +
 fs/btrfs/extent-tree.c | 145 +++++++++-------------
 fs/btrfs/inode.c       |  36 ++++--
 fs/btrfs/volumes.c     | 324 +++++++++++++++++++++++++++++++++++++++++++++++--
 fs/btrfs/volumes.h     |   5 +
 6 files changed, 407 insertions(+), 108 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index f00c4be59ad6..ac7106ec5357 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -474,6 +474,7 @@ struct btrfs_block_group_cache {
 	u64 pinned;
 	u64 flags;
 	int cached;
+	int ro;
 };
 
 struct btrfs_device;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 71838264ca6b..a9ce491d279b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -818,6 +818,10 @@ struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
 		return fs_info->tree_root;
 	if (location->objectid == BTRFS_EXTENT_TREE_OBJECTID)
 		return fs_info->extent_root;
+	if (location->objectid == BTRFS_CHUNK_TREE_OBJECTID)
+		return fs_info->chunk_root;
+	if (location->objectid == BTRFS_DEV_TREE_OBJECTID)
+		return fs_info->dev_root;
 
 	root = radix_tree_lookup(&fs_info->fs_roots_radix,
 				 (unsigned long)location->objectid);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index c49592c5127a..6540095544e8 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -187,6 +187,7 @@ static int noinline find_search_start(struct btrfs_root *root,
 
 	if (!cache)
 		goto out;
+
 	total_fs_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
 	free_space_cache = &root->fs_info->free_space_cache;
 
@@ -196,7 +197,7 @@ again:
 		goto out;
 
 	last = max(search_start, cache->key.objectid);
-	if (!block_group_bits(cache, data)) {
+	if (!block_group_bits(cache, data) || cache->ro) {
 		goto new_group;
 	}
 
@@ -221,6 +222,8 @@ again:
 			continue;
 		}
 		spin_unlock_irq(&free_space_cache->lock);
+		if (cache->ro)
+			goto new_group;
 		if (start + num > cache->key.objectid + cache->key.offset)
 			goto new_group;
 		if (start + num  > total_fs_bytes)
@@ -319,7 +322,7 @@ struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
 	if (search_start && search_start < total_fs_bytes) {
 		struct btrfs_block_group_cache *shint;
 		shint = btrfs_lookup_block_group(info, search_start);
-		if (shint && block_group_bits(shint, data)) {
+		if (shint && block_group_bits(shint, data) && !shint->ro) {
 			used = btrfs_block_group_used(&shint->item);
 			if (used + shint->pinned <
 			    div_factor(shint->key.offset, factor)) {
@@ -327,7 +330,7 @@ struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
 			}
 		}
 	}
-	if (hint && block_group_bits(hint, data) &&
+	if (hint && !hint->ro && block_group_bits(hint, data) &&
 	    hint->key.objectid < total_fs_bytes) {
 		used = btrfs_block_group_used(&hint->item);
 		if (used + hint->pinned <
@@ -364,7 +367,7 @@ again:
 		if (cache->key.objectid > total_fs_bytes)
 			break;
 
-		if (block_group_bits(cache, data)) {
+		if (!cache->ro && block_group_bits(cache, data)) {
 			if (full_search)
 				free_check = cache->key.offset;
 			else
@@ -1020,6 +1023,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
 	if (found) {
 		found->total_bytes += total_bytes;
 		found->bytes_used += bytes_used;
+		found->full = 0;
 		WARN_ON(found->total_bytes < found->bytes_used);
 		*space_info = found;
 		return 0;
@@ -1700,7 +1704,6 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 	u64 super_used;
 	u64 root_used;
 	u64 search_start = 0;
-	u64 new_hint;
 	u64 alloc_profile;
 	u32 sizes[2];
 	struct btrfs_fs_info *info = root->fs_info;
@@ -1724,7 +1727,7 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 		data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
 	}
 again:
-	if (root->ref_cows) {
+	if (root != root->fs_info->extent_root) {
 		if (!(data & BTRFS_BLOCK_GROUP_METADATA)) {
 			ret = do_chunk_alloc(trans, root->fs_info->extent_root,
 					     2 * 1024 * 1024,
@@ -1738,10 +1741,6 @@ again:
 		BUG_ON(ret);
 	}
 
-	new_hint = max(hint_byte, root->fs_info->alloc_start);
-	if (new_hint < btrfs_super_total_bytes(&info->super_copy))
-		hint_byte = new_hint;
-
 	WARN_ON(num_bytes < root->sectorsize);
 	ret = find_free_extent(trans, root, num_bytes, empty_size,
 			       search_start, search_end, hint_byte, ins,
@@ -2473,15 +2472,16 @@ out:
 	return ret;
 }
 
-int btrfs_shrink_extent_tree(struct btrfs_root *root, u64 new_size)
+int btrfs_shrink_extent_tree(struct btrfs_root *root, u64 shrink_start)
 {
 	struct btrfs_trans_handle *trans;
 	struct btrfs_root *tree_root = root->fs_info->tree_root;
 	struct btrfs_path *path;
 	u64 cur_byte;
 	u64 total_found;
+	u64 shrink_last_byte;
+	struct btrfs_block_group_cache *shrink_block_group;
 	struct btrfs_fs_info *info = root->fs_info;
-	struct extent_io_tree *block_group_cache;
 	struct btrfs_key key;
 	struct btrfs_key found_key;
 	struct extent_buffer *leaf;
@@ -2489,17 +2489,29 @@ int btrfs_shrink_extent_tree(struct btrfs_root *root, u64 new_size)
 	int ret;
 	int progress = 0;
 
-	btrfs_set_super_total_bytes(&info->super_copy, new_size);
-	clear_extent_dirty(&info->free_space_cache, new_size, (u64)-1,
-			   GFP_NOFS);
-	block_group_cache = &info->block_group_cache;
+	shrink_block_group = btrfs_lookup_block_group(root->fs_info,
+						      shrink_start);
+	BUG_ON(!shrink_block_group);
+
+	shrink_last_byte = shrink_start + shrink_block_group->key.offset;
+
+	shrink_block_group->space_info->total_bytes -=
+		shrink_block_group->key.offset;
+printk("shrink_extent_tree %Lu -> %Lu type %Lu\n", shrink_start, shrink_last_byte, shrink_block_group->flags);
 	path = btrfs_alloc_path();
 	root = root->fs_info->extent_root;
 	path->reada = 2;
 
 again:
+	trans = btrfs_start_transaction(root, 1);
+	do_chunk_alloc(trans, root->fs_info->extent_root,
+			btrfs_block_group_used(&shrink_block_group->item) +
+			2 * 1024 * 1024, shrink_block_group->flags);
+	btrfs_end_transaction(trans, root);
+	shrink_block_group->ro = 1;
+
 	total_found = 0;
-	key.objectid = new_size;
+	key.objectid = shrink_start;
 	key.offset = 0;
 	key.type = 0;
 	cur_byte = key.objectid;
@@ -2511,10 +2523,12 @@ again:
 	ret = btrfs_previous_item(root, path, 0, BTRFS_EXTENT_ITEM_KEY);
 	if (ret < 0)
 		goto out;
+
 	if (ret == 0) {
 		leaf = path->nodes[0];
 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
-		if (found_key.objectid + found_key.offset > new_size) {
+		if (found_key.objectid + found_key.offset > shrink_start &&
+		    found_key.objectid < shrink_last_byte) {
 			cur_byte = found_key.objectid;
 			key.objectid = cur_byte;
 		}
@@ -2543,6 +2557,9 @@ next:
 
 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
 
+		if (found_key.objectid >= shrink_last_byte)
+			break;
+
 		if (progress && need_resched()) {
 			memcpy(&key, &found_key, sizeof(key));
 			mutex_unlock(&root->fs_info->fs_mutex);
@@ -2583,68 +2600,31 @@ next:
 		goto again;
 	}
 
+	/*
+	 * we've freed all the extents, now remove the block
+	 * group item from the tree
+	 */
 	trans = btrfs_start_transaction(root, 1);
-	key.objectid = new_size;
-	key.offset = 0;
-	key.type = 0;
-	while(1) {
-		u64 ptr;
-
-		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
-		if (ret < 0)
-			goto out;
-
-		leaf = path->nodes[0];
-		nritems = btrfs_header_nritems(leaf);
-bg_next:
-		if (path->slots[0] >= nritems) {
-			ret = btrfs_next_leaf(root, path);
-			if (ret < 0)
-				break;
-			if (ret == 1) {
-				ret = 0;
-				break;
-			}
-			leaf = path->nodes[0];
-			btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+	memcpy(&key, &shrink_block_group->key, sizeof(key));
 
-			/*
-			 * btrfs_next_leaf doesn't cow buffers, we have to
-			 * do the search again
-			 */
-			memcpy(&key, &found_key, sizeof(key));
-			btrfs_release_path(root, path);
-			goto resched_check;
-		}
+	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+	if (ret > 0)
+		ret = -EIO;
+	if (ret < 0)
+		goto out;
 
-		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
-		if (btrfs_key_type(&found_key) != BTRFS_BLOCK_GROUP_ITEM_KEY) {
-			printk("shrinker found key %Lu %u %Lu\n",
-				found_key.objectid, found_key.type,
-				found_key.offset);
-			path->slots[0]++;
-			goto bg_next;
-		}
-		ret = get_state_private(&info->block_group_cache,
-					found_key.objectid, &ptr);
-		if (!ret)
-			kfree((void *)(unsigned long)ptr);
+	leaf = path->nodes[0];
+	nritems = btrfs_header_nritems(leaf);
+	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+	kfree(shrink_block_group);
 
-		clear_extent_bits(&info->block_group_cache, found_key.objectid,
-				  found_key.objectid + found_key.offset - 1,
-				  (unsigned int)-1, GFP_NOFS);
+	clear_extent_bits(&info->block_group_cache, found_key.objectid,
+			  found_key.objectid + found_key.offset - 1,
+			  (unsigned int)-1, GFP_NOFS);
 
-		key.objectid = found_key.objectid + 1;
-		btrfs_del_item(trans, root, path);
-		btrfs_release_path(root, path);
-resched_check:
-		if (need_resched()) {
-			mutex_unlock(&root->fs_info->fs_mutex);
-			cond_resched();
-			mutex_lock(&root->fs_info->fs_mutex);
-		}
-	}
-	clear_extent_dirty(&info->free_space_cache, new_size, (u64)-1,
+	btrfs_del_item(trans, root, path);
+	clear_extent_dirty(&info->free_space_cache,
+			   shrink_start, shrink_last_byte - 1,
 			   GFP_NOFS);
 	btrfs_commit_transaction(trans, root);
 out:
@@ -2652,13 +2632,6 @@ out:
 	return ret;
 }
 
-int btrfs_grow_extent_tree(struct btrfs_trans_handle *trans,
-			   struct btrfs_root *root, u64 new_size)
-{
-	btrfs_set_super_total_bytes(&root->fs_info->super_copy, new_size);
-	return 0;
-}
-
 int find_first_block_group(struct btrfs_root *root, struct btrfs_path *path,
 			   struct btrfs_key *key)
 {
@@ -2726,7 +2699,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 
 		leaf = path->nodes[0];
 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
-		cache = kmalloc(sizeof(*cache), GFP_NOFS);
+		cache = kzalloc(sizeof(*cache), GFP_NOFS);
 		if (!cache) {
 			ret = -ENOMEM;
 			break;
@@ -2736,8 +2709,6 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 				   btrfs_item_ptr_offset(leaf, path->slots[0]),
 				   sizeof(cache->item));
 		memcpy(&cache->key, &found_key, sizeof(found_key));
-		cache->cached = 0;
-		cache->pinned = 0;
 
 		key.objectid = found_key.objectid + found_key.offset;
 		btrfs_release_path(root, path);
@@ -2789,12 +2760,10 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 	extent_root = root->fs_info->extent_root;
 	block_group_cache = &root->fs_info->block_group_cache;
 
-	cache = kmalloc(sizeof(*cache), GFP_NOFS);
+	cache = kzalloc(sizeof(*cache), GFP_NOFS);
 	BUG_ON(!cache);
 	cache->key.objectid = chunk_offset;
 	cache->key.offset = size;
-	cache->cached = 0;
-	cache->pinned = 0;
 
 	btrfs_set_key_type(&cache->key, BTRFS_BLOCK_GROUP_ITEM_KEY);
 	memset(&cache->item, 0, sizeof(cache->item));
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index b31f52d4f2ca..4d12aa532c5b 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -16,6 +16,7 @@
  * Boston, MA 021110-1307, USA.
  */
 
+#include <linux/kernel.h>
 #include <linux/bio.h>
 #include <linux/buffer_head.h>
 #include <linux/fs.h>
@@ -2887,9 +2888,12 @@ static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg)
 {
 	u64 new_size;
 	u64 old_size;
+	u64 devid = 1;
 	struct btrfs_ioctl_vol_args *vol_args;
 	struct btrfs_trans_handle *trans;
+	struct btrfs_device *device = NULL;
 	char *sizestr;
+	char *devstr = NULL;
 	int ret = 0;
 	int namelen;
 	int mod = 0;
@@ -2909,9 +2913,25 @@ static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg)
 		goto out;
 	}
 
+	mutex_lock(&root->fs_info->fs_mutex);
 	sizestr = vol_args->name;
+	devstr = strchr(sizestr, ':');
+	if (devstr) {
+		char *end;
+		sizestr = devstr + 1;
+		*devstr = '\0';
+		devstr = vol_args->name;
+		devid = simple_strtoull(devstr, &end, 10);
+printk("resizing devid %Lu\n", devid);
+	}
+	device = btrfs_find_device(root, devid, NULL);
+	if (!device) {
+		printk("resizer unable to find device %Lu\n", devid);
+		ret = -EINVAL;
+		goto out_unlock;
+	}
 	if (!strcmp(sizestr, "max"))
-		new_size = root->fs_info->sb->s_bdev->bd_inode->i_size;
+		new_size = device->bdev->bd_inode->i_size;
 	else {
 		if (sizestr[0] == '-') {
 			mod = -1;
@@ -2923,12 +2943,11 @@ static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg)
 		new_size = btrfs_parse_size(sizestr);
 		if (new_size == 0) {
 			ret = -EINVAL;
-			goto out;
+			goto out_unlock;
 		}
 	}
 
-	mutex_lock(&root->fs_info->fs_mutex);
-	old_size = btrfs_super_total_bytes(&root->fs_info->super_copy);
+	old_size = device->total_bytes;
 
 	if (mod < 0) {
 		if (new_size > old_size) {
@@ -2944,7 +2963,7 @@ static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg)
 		ret = -EINVAL;
 		goto out_unlock;
 	}
-	if (new_size > root->fs_info->sb->s_bdev->bd_inode->i_size) {
+	if (new_size > device->bdev->bd_inode->i_size) {
 		ret = -EFBIG;
 		goto out_unlock;
 	}
@@ -2952,13 +2971,14 @@ static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg)
 	do_div(new_size, root->sectorsize);
 	new_size *= root->sectorsize;
 
-printk("new size is %Lu\n", new_size);
+printk("new size for %s is %llu\n", device->name, (unsigned long long)new_size);
+
 	if (new_size > old_size) {
 		trans = btrfs_start_transaction(root, 1);
-		ret = btrfs_grow_extent_tree(trans, root, new_size);
+		ret = btrfs_grow_device(trans, device, new_size);
 		btrfs_commit_transaction(trans, root);
 	} else {
-		ret = btrfs_shrink_extent_tree(root, new_size);
+		ret = btrfs_shrink_device(device, new_size);
 	}
 
 out_unlock:
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index c63a982e31d0..a2c56de1548a 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -77,7 +77,7 @@ static struct btrfs_device *__find_device(struct list_head *head, u64 devid,
 	list_for_each(cur, head) {
 		dev = list_entry(cur, struct btrfs_device, dev_list);
 		if (dev->devid == devid &&
-		    !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE)) {
+		    (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
 			return dev;
 		}
 	}
@@ -293,6 +293,10 @@ static int find_free_dev_extent(struct btrfs_trans_handle *trans,
 	 * so we make sure to start at an offset of at least 1MB
 	 */
 	search_start = max((u64)1024 * 1024, search_start);
+
+	if (root->fs_info->alloc_start + num_bytes <= device->total_bytes)
+		search_start = max(root->fs_info->alloc_start, search_start);
+
 	key.objectid = device->devid;
 	key.offset = search_start;
 	key.type = BTRFS_DEV_EXTENT_KEY;
@@ -380,6 +384,33 @@ error:
 	return ret;
 }
 
+/*
+ * remove the dev extent item of 'device' that starts at byte 'start'.
+ * Returns -ENOMEM if no path is available; a failed lookup or delete BUGs.
+ */
+int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
+			  struct btrfs_device *device, u64 start)
+{
+	struct btrfs_root *dev_root = device->dev_root;
+	struct btrfs_path *path = btrfs_alloc_path();
+	struct btrfs_key key;
+	int err;
+
+	if (!path)
+		return -ENOMEM;
+
+	key.objectid = device->devid;
+	key.type = BTRFS_DEV_EXTENT_KEY;
+	key.offset = start;
+
+	err = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
+	BUG_ON(err);
+	err = btrfs_del_item(trans, dev_root, path);
+	BUG_ON(err);
+	btrfs_free_path(path);
+	return err;
+}
+
 int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
 			   struct btrfs_device *device,
 			   u64 chunk_tree, u64 chunk_objectid,
@@ -560,6 +591,7 @@ out:
 	btrfs_free_path(path);
 	return ret;
 }
+
 int btrfs_update_device(struct btrfs_trans_handle *trans,
 			struct btrfs_device *device)
 {
@@ -606,6 +638,254 @@ out:
 	return ret;
 }
 
+/* grow 'device' to new_size; the gain is added to the super copy's total */
+int btrfs_grow_device(struct btrfs_trans_handle *trans,
+		      struct btrfs_device *device, u64 new_size)
+{
+	struct btrfs_super_block *sb = &device->dev_root->fs_info->super_copy;
+	u64 diff = new_size - device->total_bytes;
+
+	btrfs_set_super_total_bytes(sb, btrfs_super_total_bytes(sb) + diff);
+	device->total_bytes = new_size;	/* as btrfs_shrink_device() does */
+	return btrfs_update_device(trans, device);
+}
+
+/*
+ * delete the chunk item keyed (chunk_objectid, chunk_offset) from the
+ * chunk tree.  chunk_tree is unused; a failed lookup or delete is a BUG.
+ */
+static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
+			    struct btrfs_root *root, u64 chunk_tree,
+			    u64 chunk_objectid, u64 chunk_offset)
+{
+	struct btrfs_root *chunk_root = root->fs_info->chunk_root;
+	struct btrfs_path *path = btrfs_alloc_path();
+	struct btrfs_key key;
+	int err;
+
+	if (!path)
+		return -ENOMEM;
+
+	key.objectid = chunk_objectid;
+	key.type = BTRFS_CHUNK_ITEM_KEY;
+	key.offset = chunk_offset;
+
+	err = btrfs_search_slot(trans, chunk_root, &key, path, -1, 1);
+	BUG_ON(err);
+	err = btrfs_del_item(trans, chunk_root, path);
+	BUG_ON(err);
+	btrfs_free_path(path);
+	return 0;
+}
+
+/*
+ * drop every copy of the chunk item (chunk_objectid, chunk_offset) from
+ * the sys_chunk_array of the in-memory super block copy, shrinking the
+ * recorded array size as entries go.  Returns -EIO as soon as an entry
+ * whose key type is not BTRFS_CHUNK_ITEM_KEY is met, 0 otherwise.
+ */
+int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
+			chunk_offset)
+{
+	struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
+	u32 array_size = btrfs_super_sys_array_size(super_copy);
+	u8 *entry = super_copy->sys_chunk_array;
+	u32 pos = 0;
+
+	while (pos < array_size) {
+		struct btrfs_chunk *chunk;
+		struct btrfs_key key;
+		u32 num_stripes;
+		u32 entry_len;
+
+		btrfs_disk_key_to_cpu(&key, (struct btrfs_disk_key *)entry);
+		if (key.type != BTRFS_CHUNK_ITEM_KEY)
+			return -EIO;
+
+		/* an entry is a disk key immediately followed by its chunk */
+		entry_len = sizeof(struct btrfs_disk_key);
+		chunk = (struct btrfs_chunk *)(entry + entry_len);
+		num_stripes = btrfs_stack_chunk_num_stripes(chunk);
+		entry_len += btrfs_chunk_item_size(num_stripes);
+
+		if (key.objectid != chunk_objectid ||
+		    key.offset != chunk_offset) {
+			entry += entry_len;
+			pos += entry_len;
+			continue;
+		}
+
+		/* close the gap; the entry now at 'pos' is checked next */
+		memmove(entry, entry + entry_len,
+			array_size - (pos + entry_len));
+		array_size -= entry_len;
+		btrfs_set_super_sys_array_size(super_copy, array_size);
+	}
+	return 0;
+}
+
+
+/*
+ * relocate every extent out of the chunk at logical address chunk_offset,
+ * then delete its dev extents, its chunk tree item, its sys_chunk_array
+ * entry (system chunks only) and its extent mapping.  Any failure trips
+ * BUG_ON, so the only value returned is 0.
+ */
+int btrfs_relocate_chunk(struct btrfs_root *root,
+			 u64 chunk_tree, u64 chunk_objectid, u64 chunk_offset)
+{
+	struct extent_map_tree *em_tree;
+	struct btrfs_root *extent_root;
+	struct btrfs_trans_handle *trans;
+	struct extent_map *em;
+	struct map_lookup *map;
+	int ret;
+	int i;
+
+	root = root->fs_info->chunk_root;
+	extent_root = root->fs_info->extent_root;
+	em_tree = &root->fs_info->mapping_tree.map_tree;
+
+	/* step one, relocate all the extents inside this chunk */
+	ret = btrfs_shrink_extent_tree(extent_root, chunk_offset);
+	BUG_ON(ret);
+
+	trans = btrfs_start_transaction(root, 1);
+	BUG_ON(!trans);
+
+	/* step two, delete the device extents and the chunk tree entries */
+	spin_lock(&em_tree->lock);
+	em = lookup_extent_mapping(em_tree, chunk_offset, 1);
+	spin_unlock(&em_tree->lock);
+
+	/* a missing mapping must BUG here instead of being dereferenced */
+	BUG_ON(!em);
+	BUG_ON(em->start > chunk_offset || em->start + em->len < chunk_offset);
+	map = (struct map_lookup *)em->bdev;
+
+	for (i = 0; i < map->num_stripes; i++) {
+		ret = btrfs_free_dev_extent(trans, map->stripes[i].dev,
+					    map->stripes[i].physical);
+		BUG_ON(ret);
+	}
+	ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid,
+			       chunk_offset);
+	BUG_ON(ret);
+
+	if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
+		ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset);
+		BUG_ON(ret);
+	}
+
+	/* system chunks drop their mapping too, the chunk no longer exists */
+	spin_lock(&em_tree->lock);
+	remove_extent_mapping(em_tree, em);
+	kfree(map);
+	em->bdev = NULL;
+
+	/* once for the tree */
+	free_extent_map(em);
+	spin_unlock(&em_tree->lock);
+
+	/* once for us */
+	free_extent_map(em);
+
+	btrfs_end_transaction(trans, root);
+	return 0;
+}
+
+/*
+ * shrink 'device' to new_size: the new size is recorded, then each device
+ * extent ending past it has its chunk relocated through the back ref (the
+ * relocation code frees the device extent).  Returns 0 or a negative errno
+ */
+int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
+{
+	struct btrfs_trans_handle *trans;
+	struct btrfs_root *root = device->dev_root;
+	struct btrfs_dev_extent *dev_extent = NULL;
+	struct btrfs_path *path;
+	u64 length;
+	u64 chunk_tree;
+	u64 chunk_objectid;
+	u64 chunk_offset;
+	int ret;
+	int slot;
+	struct extent_buffer *l;
+	struct btrfs_key key;
+	struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
+	u64 old_total = btrfs_super_total_bytes(super_copy);
+	u64 diff = device->total_bytes - new_size;	/* bytes given up */
+
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	trans = btrfs_start_transaction(root, 1);
+	if (!trans) {
+		ret = -ENOMEM;
+		goto done;
+	}
+
+	path->reada = 2;
+	/* record the new size on the device and in the super copy first */
+	device->total_bytes = new_size;
+	ret = btrfs_update_device(trans, device);
+	if (ret) {
+		btrfs_end_transaction(trans, root);
+		goto done;
+	}
+	WARN_ON(diff > old_total);
+	btrfs_set_super_total_bytes(super_copy, old_total - diff);
+	btrfs_end_transaction(trans, root);
+
+	key.objectid = device->devid;
+	key.offset = (u64)-1;	/* begin at the highest possible offset */
+	key.type = BTRFS_DEV_EXTENT_KEY;
+
+	while (1) {
+		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+		if (ret < 0)
+			goto done;
+
+		ret = btrfs_previous_item(root, path, 0, key.type);
+		if (ret < 0)
+			goto done;
+		if (ret) {
+			ret = 0;	/* no item found is not an error */
+			goto done;
+		}
+
+		l = path->nodes[0];
+		slot = path->slots[0];
+		btrfs_item_key_to_cpu(l, &key, path->slots[0]);
+		/* 'key' now holds the found item and seeds the next search */
+		if (key.objectid != device->devid)
+			goto done;
+
+		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
+		length = btrfs_dev_extent_length(l, dev_extent);
+		/* this extent already ends inside the new size, stop */
+		if (key.offset + length <= new_size)
+			goto done;
+
+		chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
+		chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
+		chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
+		btrfs_release_path(root, path);
+		/* btrfs_relocate_chunk() deletes the dev extent just read */
+		ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid,
+					   chunk_offset);
+		if (ret)
+			goto done;
+	}
+
+done:
+	btrfs_free_path(path);
+	return ret;
+}
+
 int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root,
 			   struct btrfs_key *key,
@@ -658,6 +938,7 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	u64 dev_offset;
 	struct btrfs_fs_info *info = extent_root->fs_info;
 	struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
+	struct btrfs_path *path;
 	struct btrfs_stripe *stripes;
 	struct btrfs_device *device = NULL;
 	struct btrfs_chunk *chunk;
@@ -724,6 +1005,10 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 		min_stripe_size = 1 * 1024 * 1024;
 	}
 
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
 	/* we don't want a chunk larger than 10% of the FS */
 	percent_max = div_factor(btrfs_super_total_bytes(&info->super_copy), 1);
 	max_chunk_size = min(percent_max, max_chunk_size);
@@ -759,11 +1044,19 @@ again:
 
 		avail = device->total_bytes - device->bytes_used;
 		cur = cur->next;
+
 		if (avail >= min_free) {
-			list_move_tail(&device->dev_alloc_list, &private_devs);
-			index++;
-			if (type & BTRFS_BLOCK_GROUP_DUP)
+			u64 ignored_start = 0;
+			ret = find_free_dev_extent(trans, device, path,
+						   min_free,
+						   &ignored_start);
+			if (ret == 0) {
+				list_move_tail(&device->dev_alloc_list,
+					       &private_devs);
 				index++;
+				if (type & BTRFS_BLOCK_GROUP_DUP)
+					index++;
+			}
 		} else if (avail > max_avail)
 			max_avail = avail;
 		if (cur == dev_list)
@@ -785,30 +1078,37 @@ again:
 			calc_size = max_avail;
 			goto again;
 		}
+		btrfs_free_path(path);
 		return -ENOSPC;
 	}
 	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
 	key.type = BTRFS_CHUNK_ITEM_KEY;
 	ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID,
 			      &key.offset);
-	if (ret)
+	if (ret) {
+		btrfs_free_path(path);
 		return ret;
+	}
 
 	chunk = kmalloc(btrfs_chunk_item_size(num_stripes), GFP_NOFS);
-	if (!chunk)
+	if (!chunk) {
+		btrfs_free_path(path);
 		return -ENOMEM;
+	}
 
 	map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
 	if (!map) {
 		kfree(chunk);
+		btrfs_free_path(path);
 		return -ENOMEM;
 	}
+	btrfs_free_path(path);
+	path = NULL;
 
 	stripes = &chunk->stripe;
 	*num_bytes = chunk_bytes_by_type(type, calc_size,
 					 num_stripes, sub_stripes);
 
-
 	index = 0;
 printk("new chunk type %Lu start %Lu size %Lu\n", type, key.offset, *num_bytes);
 	while(index < num_stripes) {
@@ -874,6 +1174,11 @@ printk("alloc chunk start %Lu size %Lu from dev %Lu type %Lu\n", key.offset, cal
 	em->len = *num_bytes;
 	em->block_start = 0;
 
+	if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
+		ret = btrfs_add_system_chunk(trans, chunk_root, &key,
+				    chunk, btrfs_chunk_item_size(num_stripes));
+		BUG_ON(ret);
+	}
 	kfree(chunk);
 
 	em_tree = &extent_root->fs_info->mapping_tree.map_tree;
@@ -1376,11 +1681,6 @@ int btrfs_read_sys_array(struct btrfs_root *root)
 
 	array_size = btrfs_super_sys_array_size(super_copy);
 
-	/*
-	 * we do this loop twice, once for the device items and
-	 * once for all of the chunks.  This way there are device
-	 * structs filled in for every chunk
-	 */
 	ptr = super_copy->sys_chunk_array;
 	sb_ptr = offsetof(struct btrfs_super_block, sys_chunk_array);
 	cur = 0;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index a2660d2b6b34..6fe8440b37a7 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -128,4 +128,9 @@ int btrfs_cleanup_fs_uuids(void);
 int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len);
 int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree,
 		      u64 logical, struct page *page);
+int btrfs_grow_device(struct btrfs_trans_handle *trans,
+		      struct btrfs_device *device, u64 new_size);
+struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
+				       u8 *uuid);
+int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
 #endif
-- 
cgit v1.2.3


From 788f20eb5affef584e75ea84bb80a4c3352a2c0e Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 28 Apr 2008 15:29:42 -0400
Subject: Btrfs: Add new ioctl to add devices

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h   |  2 ++
 fs/btrfs/inode.c   | 23 +++++++++++++++++
 fs/btrfs/ioctl.h   |  6 +++++
 fs/btrfs/super.c   |  1 +
 fs/btrfs/volumes.c | 75 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/volumes.h |  1 +
 6 files changed, 108 insertions(+)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index ac7106ec5357..b9257b37bb96 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -551,6 +551,8 @@ struct btrfs_fs_info {
 	u64 data_alloc_profile;
 	u64 metadata_alloc_profile;
 	u64 system_alloc_profile;
+
+	void *bdev_holder;
 };
 
 /*
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index ab707c0930df..1a74b5018699 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3070,6 +3070,27 @@ static int btrfs_ioctl_defrag(struct file *file)
 	return 0;
 }
 
+/*
+ * BTRFS_IOC_ADD_DEV: add the block device whose path userland passed in
+ * 'arg' to the filesystem.  Returns 0 or a negative errno.
+ */
+long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
+{
+	struct btrfs_ioctl_vol_args *vol_args;
+	int ret = -EFAULT;
+
+	vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS);
+	if (!vol_args)
+		return -ENOMEM;
+	if (!copy_from_user(vol_args, arg, sizeof(*vol_args))) {
+		/* userland may not have terminated the path */
+		vol_args->name[sizeof(vol_args->name) - 1] = '\0';
+		ret = btrfs_init_new_device(root, vol_args->name);
+	}
+	kfree(vol_args);
+	return ret;
+}
+
 long btrfs_ioctl(struct file *file, unsigned int
 		cmd, unsigned long arg)
 {
@@ -3082,6 +3103,8 @@ long btrfs_ioctl(struct file *file, unsigned int
 		return btrfs_ioctl_defrag(file);
 	case BTRFS_IOC_RESIZE:
 		return btrfs_ioctl_resize(root, (void __user *)arg);
+	case BTRFS_IOC_ADD_DEV:
+		return btrfs_ioctl_add_dev(root, (void __user *)arg);
 	}
 
 	return -ENOTTY;
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index 4551e82013c8..8ad35fc4ba56 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -36,4 +36,10 @@ struct btrfs_ioctl_vol_args {
 				   struct btrfs_ioctl_vol_args)
 #define BTRFS_IOC_SCAN_DEV _IOW(BTRFS_IOCTL_MAGIC, 4, \
 				   struct btrfs_ioctl_vol_args)
+#define BTRFS_IOC_ADD_DEV _IOW(BTRFS_IOCTL_MAGIC, 10, \
+				   struct btrfs_ioctl_vol_args)
+#define BTRFS_IOC_RM_DEV _IOW(BTRFS_IOCTL_MAGIC, 11, \
+				   struct btrfs_ioctl_vol_args)
+#define BTRFS_IOC_BALANCE _IOW(BTRFS_IOCTL_MAGIC, 12, \
+				   struct btrfs_ioctl_vol_args)
 #endif
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 892864906880..7153dfaa3404 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -388,6 +388,7 @@ int btrfs_get_sb_bdev(struct file_system_type *fs_type,
 			goto error;
 		}
 
+		btrfs_sb(s)->fs_info->bdev_holder = fs_type;
 		s->s_flags |= MS_ACTIVE;
 	}
 
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index a2c56de1548a..b93c15aa17db 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -19,6 +19,7 @@
 #include <linux/bio.h>
 #include <linux/buffer_head.h>
 #include <linux/blkdev.h>
+#include <linux/random.h>
 #include <asm/div64.h>
 #include "ctree.h"
 #include "extent_map.h"
@@ -592,6 +593,80 @@ out:
 	return ret;
 }
 
+/*
+ * open the block device at device_path, add it to the filesystem and
+ * count it in the super block copy.  Returns 0 or a negative errno.
+ */
+int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
+{
+	struct btrfs_trans_handle *trans;
+	struct btrfs_device *device;
+	struct block_device *bdev;
+	struct btrfs_super_block *sb = &root->fs_info->super_copy;
+	struct list_head *cur;
+	u64 total_bytes;
+	int ret = 0;
+
+	/* NOTE(review): confirm open_bdev_excl() reports failure as NULL */
+	bdev = open_bdev_excl(device_path, 0, root->fs_info->bdev_holder);
+	if (!bdev)
+		return -EIO;
+	mutex_lock(&root->fs_info->fs_mutex);
+	trans = btrfs_start_transaction(root, 1);
+	list_for_each(cur, &root->fs_info->fs_devices->devices) {
+		device = list_entry(cur, struct btrfs_device, dev_list);
+		if (device->bdev == bdev) {
+			ret = -EEXIST;
+			goto out_close_bdev;	/* undo the open done above */
+		}
+	}
+
+	device = kzalloc(sizeof(*device), GFP_NOFS);
+	if (!device) {
+		ret = -ENOMEM;
+		goto out_close_bdev;
+	}
+
+	device->barriers = 1;
+	generate_random_uuid(device->uuid);
+	spin_lock_init(&device->io_lock);
+	device->name = kstrdup(device_path, GFP_NOFS);
+	if (!device->name) {
+		ret = -ENOMEM;	/* was left at 0, reporting success */
+		goto out_free_device;
+	}
+	device->io_width = root->sectorsize;
+	device->io_align = root->sectorsize;
+	device->sector_size = root->sectorsize;
+	device->total_bytes = i_size_read(bdev->bd_inode);
+	device->dev_root = root->fs_info->dev_root;
+	device->bdev = bdev;
+
+	ret = btrfs_add_device(trans, root, device);
+	if (ret)
+		goto out_free_device;
+
+	total_bytes = btrfs_super_total_bytes(sb);
+	btrfs_set_super_total_bytes(sb, total_bytes + device->total_bytes);
+	btrfs_set_super_num_devices(sb, btrfs_super_num_devices(sb) + 1);
+
+	list_add(&device->dev_list, &root->fs_info->fs_devices->devices);
+	list_add(&device->dev_alloc_list,
+		 &root->fs_info->fs_devices->alloc_list);
+	root->fs_info->fs_devices->num_devices++;
+out:
+	btrfs_end_transaction(trans, root);
+	mutex_unlock(&root->fs_info->fs_mutex);
+	return ret;
+
+out_free_device:
+	kfree(device->name);
+	kfree(device);
+out_close_bdev:
+	close_bdev_excl(bdev);
+	goto out;
+}
+
 int btrfs_update_device(struct btrfs_trans_handle *trans,
 			struct btrfs_device *device)
 {
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 6fe8440b37a7..6f173450378b 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -133,4 +133,5 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans,
 struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
 				       u8 *uuid);
 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
+int btrfs_init_new_device(struct btrfs_root *root, char *path);
 #endif
-- 
cgit v1.2.3


From ec44a35cbeb26ab2da84cb280d778260f2312feb Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 28 Apr 2008 15:29:52 -0400
Subject: Btrfs: Add balance ioctl to restripe the chunks

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       |   2 +-
 fs/btrfs/extent-tree.c | 106 +++++++++++++++++++++++++++++++++++++--------
 fs/btrfs/inode.c       |  11 +++++
 fs/btrfs/volumes.c     | 115 +++++++++++++++++++++++++++++++++++++++++++++----
 fs/btrfs/volumes.h     |   1 +
 5 files changed, 208 insertions(+), 27 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index b9257b37bb96..73b92dd150ff 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1364,7 +1364,7 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 		       u64 root_objectid, u64 ref_generation,
 		       u64 owner, u64 owner_offset,
 		       u64 empty_size, u64 hint_byte,
-		       u64 search_end, struct btrfs_key *ins, int data);
+		       u64 search_end, struct btrfs_key *ins, u64 data);
 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		  struct extent_buffer *buf);
 int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index fe4fe709c312..95aee5a29375 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -17,6 +17,7 @@
  */
 #include <linux/sched.h>
 #include <linux/pagemap.h>
+#include <linux/writeback.h>
 #include "hash.h"
 #include "crc32c.h"
 #include "ctree.h"
@@ -1058,6 +1059,26 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
 	}
 }
 
+/*
+ * when several raid bits are set keep only one: RAID10 beats RAID1,
+ * either of those beats DUP, and any of the three beats RAID0
+ */
+static u64 reduce_alloc_profile(u64 flags)
+{
+	const u64 mirrored = BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
+
+	if ((flags & BTRFS_BLOCK_GROUP_DUP) && (flags & mirrored))
+		flags &= ~BTRFS_BLOCK_GROUP_DUP;
+	if ((flags & BTRFS_BLOCK_GROUP_RAID1) &&
+	    (flags & BTRFS_BLOCK_GROUP_RAID10))
+		flags &= ~BTRFS_BLOCK_GROUP_RAID1;
+	if ((flags & BTRFS_BLOCK_GROUP_RAID0) &&
+	    (flags & (mirrored | BTRFS_BLOCK_GROUP_DUP)))
+		flags &= ~BTRFS_BLOCK_GROUP_RAID0;
+	return flags;
+}
+
+
 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *extent_root, u64 alloc_bytes,
 			  u64 flags)
@@ -1068,6 +1089,8 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
 	u64 num_bytes;
 	int ret;
 
+	flags = reduce_alloc_profile(flags);
+
 	space_info = __find_space_info(extent_root->fs_info, flags);
 	if (!space_info) {
 		ret = update_space_info(extent_root->fs_info, flags,
@@ -1684,6 +1707,7 @@ enospc:
 error:
 	return ret;
 }
+
 /*
  * finds a free extent and does all the dirty work required for allocation
  * returns the key for the extent through ins, and a tree buffer for
@@ -1697,7 +1721,7 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 		       u64 root_objectid, u64 ref_generation,
 		       u64 owner, u64 owner_offset,
 		       u64 empty_size, u64 hint_byte,
-		       u64 search_end, struct btrfs_key *ins, int data)
+		       u64 search_end, struct btrfs_key *ins, u64 data)
 {
 	int ret;
 	int pending_ret;
@@ -1727,6 +1751,7 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 		data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
 	}
 again:
+	data = reduce_alloc_profile(data);
 	if (root->ref_cows) {
 		if (!(data & BTRFS_BLOCK_GROUP_METADATA)) {
 			ret = do_chunk_alloc(trans, root->fs_info->extent_root,
@@ -1752,6 +1777,9 @@ again:
 		num_bytes = max(num_bytes, min_alloc_size);
 		goto again;
 	}
+	if (ret) {
+		printk("allocation failed flags %Lu\n", data);
+	}
 	BUG_ON(ret);
 	if (ret)
 		return ret;
@@ -2274,8 +2302,6 @@ static int noinline relocate_inode_pages(struct inode *inode, u64 start,
 {
 	u64 page_start;
 	u64 page_end;
-	u64 delalloc_start;
-	u64 existing_delalloc;
 	unsigned long last_index;
 	unsigned long i;
 	struct page *page;
@@ -2293,7 +2319,6 @@ static int noinline relocate_inode_pages(struct inode *inode, u64 start,
 	ra_pages = BTRFS_I(inode)->root->fs_info->bdi.ra_pages;
 
 	file_ra_state_init(ra, inode->i_mapping);
-	kfree(ra);
 
 	for (; i <= last_index; i++) {
 		if (total_read % ra_pages == 0) {
@@ -2313,26 +2338,30 @@ static int noinline relocate_inode_pages(struct inode *inode, u64 start,
 				goto out_unlock;
 			}
 		}
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
+		ClearPageDirty(page);
+#else
+		cancel_dirty_page(page, PAGE_CACHE_SIZE);
+#endif
+		wait_on_page_writeback(page);
+		set_page_extent_mapped(page);
 		page_start = (u64)page->index << PAGE_CACHE_SHIFT;
 		page_end = page_start + PAGE_CACHE_SIZE - 1;
 
 		lock_extent(io_tree, page_start, page_end, GFP_NOFS);
 
-		delalloc_start = page_start;
-		existing_delalloc = count_range_bits(io_tree,
-					     &delalloc_start, page_end,
-					     PAGE_CACHE_SIZE, EXTENT_DELALLOC);
-
+		set_page_dirty(page);
 		set_extent_delalloc(io_tree, page_start,
 				    page_end, GFP_NOFS);
 
 		unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
-		set_page_dirty(page);
 		unlock_page(page);
 		page_cache_release(page);
+		balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1);
 	}
 
 out_unlock:
+	kfree(ra);
 	mutex_unlock(&inode->i_mutex);
 	return 0;
 }
@@ -2397,8 +2426,6 @@ static int noinline relocate_one_reference(struct btrfs_root *extent_root,
 			goto out;
 		}
 		relocate_inode_pages(inode, ref_offset, extent_key->offset);
-		/* FIXME, data=ordered will help get rid of this */
-		filemap_fdatawrite(inode->i_mapping);
 		iput(inode);
 		mutex_lock(&extent_root->fs_info->fs_mutex);
 	} else {
@@ -2486,6 +2513,47 @@ out:
 	return ret;
 }
 
+static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
+{
+	u64 num_devices;
+	u64 stripped = BTRFS_BLOCK_GROUP_RAID0 |
+		BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
+
+	num_devices = btrfs_super_num_devices(&root->fs_info->super_copy);
+	if (num_devices == 1) {
+		stripped |= BTRFS_BLOCK_GROUP_DUP;
+		stripped = flags & ~stripped;
+
+		/* turn raid0 into single device chunks */
+		if (flags & BTRFS_BLOCK_GROUP_RAID0)
+			return stripped;
+
+		/* turn mirroring into duplication */
+		if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
+			     BTRFS_BLOCK_GROUP_RAID10))
+			return stripped | BTRFS_BLOCK_GROUP_DUP;
+		return flags;
+	} else {
+		/* they already had raid on here, just return */
+		if ((flags & BTRFS_BLOCK_GROUP_DUP) &&
+		    (flags & BTRFS_BLOCK_GROUP_RAID1)) {
+		}
+		if (flags & stripped)
+			return flags;
+
+		stripped |= BTRFS_BLOCK_GROUP_DUP;
+		stripped = flags & ~stripped;
+
+		/* switch duplicated blocks with raid1 */
+		if (flags & BTRFS_BLOCK_GROUP_DUP)
+			return stripped | BTRFS_BLOCK_GROUP_RAID1;
+
+		/* turn single device chunks into raid0 */
+		return stripped | BTRFS_BLOCK_GROUP_RAID0;
+	}
+	return flags;
+}
+
 int btrfs_shrink_extent_tree(struct btrfs_root *root, u64 shrink_start)
 {
 	struct btrfs_trans_handle *trans;
@@ -2494,6 +2562,7 @@ int btrfs_shrink_extent_tree(struct btrfs_root *root, u64 shrink_start)
 	u64 cur_byte;
 	u64 total_found;
 	u64 shrink_last_byte;
+	u64 new_alloc_flags;
 	struct btrfs_block_group_cache *shrink_block_group;
 	struct btrfs_fs_info *info = root->fs_info;
 	struct btrfs_key key;
@@ -2511,17 +2580,20 @@ int btrfs_shrink_extent_tree(struct btrfs_root *root, u64 shrink_start)
 
 	shrink_block_group->space_info->total_bytes -=
 		shrink_block_group->key.offset;
-printk("shrink_extent_tree %Lu -> %Lu type %Lu\n", shrink_start, shrink_last_byte, shrink_block_group->flags);
 	path = btrfs_alloc_path();
 	root = root->fs_info->extent_root;
 	path->reada = 2;
 
 again:
-	trans = btrfs_start_transaction(root, 1);
-	do_chunk_alloc(trans, root->fs_info->extent_root,
+	if (btrfs_block_group_used(&shrink_block_group->item) > 0) {
+		trans = btrfs_start_transaction(root, 1);
+		new_alloc_flags = update_block_group_flags(root,
+						   shrink_block_group->flags);
+		do_chunk_alloc(trans, root->fs_info->extent_root,
 			btrfs_block_group_used(&shrink_block_group->item) +
-			2 * 1024 * 1024, shrink_block_group->flags);
-	btrfs_end_transaction(trans, root);
+			2 * 1024 * 1024, new_alloc_flags);
+		btrfs_end_transaction(trans, root);
+	}
 	shrink_block_group->ro = 1;
 
 	total_found = 0;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1a74b5018699..994834474590 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2864,6 +2864,15 @@ int btrfs_defrag_file(struct file *file) {
 				goto out_unlock;
 			}
 		}
+
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
+		ClearPageDirty(page);
+#else
+		cancel_dirty_page(page, PAGE_CACHE_SIZE);
+#endif
+		wait_on_page_writeback(page);
+		set_page_extent_mapped(page);
+
 		page_start = (u64)page->index << PAGE_CACHE_SHIFT;
 		page_end = page_start + PAGE_CACHE_SIZE - 1;
 
@@ -3105,6 +3114,8 @@ long btrfs_ioctl(struct file *file, unsigned int
 		return btrfs_ioctl_resize(root, (void __user *)arg);
 	case BTRFS_IOC_ADD_DEV:
 		return btrfs_ioctl_add_dev(root, (void __user *)arg);
+	case BTRFS_IOC_BALANCE:
+		return btrfs_balance(root->fs_info->dev_root);
 	}
 
 	return -ENOTTY;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index b93c15aa17db..6476ecbf132e 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -869,6 +869,107 @@ out:
 	return 0;
 }
 
+static u64 div_factor(u64 num, int factor)
+{
+	if (factor == 10)
+		return num;
+	num *= factor;
+	do_div(num, 10);
+	return num;
+}
+
+
+int btrfs_balance(struct btrfs_root *dev_root)
+{
+	int ret;
+	struct list_head *cur;
+	struct list_head *devices = &dev_root->fs_info->fs_devices->devices;
+	struct btrfs_device *device;
+	u64 old_size;
+	u64 size_to_free;
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	struct btrfs_chunk *chunk;
+	struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root;
+	struct btrfs_trans_handle *trans;
+	struct btrfs_key found_key;
+
+
+	dev_root = dev_root->fs_info->dev_root;
+
+	mutex_lock(&dev_root->fs_info->fs_mutex);
+	/* step one make some room on all the devices */
+	list_for_each(cur, devices) {
+		device = list_entry(cur, struct btrfs_device, dev_list);
+		old_size = device->total_bytes;
+		size_to_free = div_factor(old_size, 1);
+		size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
+		if (device->total_bytes - device->bytes_used > size_to_free)
+			continue;
+
+		ret = btrfs_shrink_device(device, old_size - size_to_free);
+		BUG_ON(ret);
+
+		trans = btrfs_start_transaction(dev_root, 1);
+		BUG_ON(!trans);
+
+		ret = btrfs_grow_device(trans, device, old_size);
+		BUG_ON(ret);
+
+		btrfs_end_transaction(trans, dev_root);
+	}
+
+	/* step two, relocate all the chunks */
+	path = btrfs_alloc_path();
+	BUG_ON(!path);
+
+	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
+	key.offset = (u64)-1;
+	key.type = BTRFS_CHUNK_ITEM_KEY;
+
+	while(1) {
+		ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
+		if (ret < 0)
+			goto error;
+
+		/*
+		 * this shouldn't happen, it means the last relocate
+		 * failed
+		 */
+		if (ret == 0)
+			break;
+
+		ret = btrfs_previous_item(chunk_root, path, 0,
+					  BTRFS_CHUNK_ITEM_KEY);
+		if (ret) {
+			break;
+		}
+		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
+				      path->slots[0]);
+		if (found_key.objectid != key.objectid)
+			break;
+		chunk = btrfs_item_ptr(path->nodes[0],
+				       path->slots[0],
+				       struct btrfs_chunk);
+		key.offset = found_key.offset;
+		/* chunk zero is special */
+		if (key.offset == 0)
+			break;
+
+		ret = btrfs_relocate_chunk(chunk_root,
+					   chunk_root->root_key.objectid,
+					   found_key.objectid,
+					   found_key.offset);
+		BUG_ON(ret);
+		btrfs_release_path(chunk_root, path);
+	}
+	ret = 0;
+error:
+	btrfs_free_path(path);
+	mutex_unlock(&dev_root->fs_info->fs_mutex);
+	return ret;
+}
+
 /*
  * shrinking a device means finding all of the device extents past
  * the new size, and then following the back refs to the chunks.
@@ -985,15 +1086,6 @@ int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-static u64 div_factor(u64 num, int factor)
-{
-	if (factor == 10)
-		return num;
-	num *= factor;
-	do_div(num, 10);
-	return num;
-}
-
 static u64 chunk_bytes_by_type(u64 type, u64 calc_size, int num_stripes,
 			       int sub_stripes)
 {
@@ -1040,6 +1132,11 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	int stripe_len = 64 * 1024;
 	struct btrfs_key key;
 
+	if ((type & BTRFS_BLOCK_GROUP_RAID1) &&
+	    (type & BTRFS_BLOCK_GROUP_DUP)) {
+		WARN_ON(1);
+		type &= ~BTRFS_BLOCK_GROUP_DUP;
+	}
 	dev_list = &extent_root->fs_info->fs_devices->alloc_list;
 	if (list_empty(dev_list))
 		return -ENOSPC;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 6f173450378b..3f9a17f2e41b 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -134,4 +134,5 @@ struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
 				       u8 *uuid);
 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
 int btrfs_init_new_device(struct btrfs_root *root, char *path);
+int btrfs_balance(struct btrfs_root *dev_root);
 #endif
-- 
cgit v1.2.3


From 4235298e4fc3c1a09f659cfe2fd285024eeb2241 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 28 Apr 2008 16:40:52 -0400
Subject: Btrfs: Drop some verbose printks

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c   | 18 +++++-------------
 fs/btrfs/extent_io.c | 13 +++++--------
 fs/btrfs/volumes.c   |  2 --
 3 files changed, 10 insertions(+), 23 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index a9ce491d279b..1c7d84aff864 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -214,25 +214,18 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
 	while (1) {
 		ret = read_extent_buffer_pages(io_tree, eb, start, 1,
 					       btree_get_extent, mirror_num);
-		if (!ret) {
-			if (mirror_num)
-printk("good read %Lu mirror %d total %d\n", eb->start, mirror_num, num_copies);
+		if (!ret)
 			return ret;
-		}
+
 		num_copies = btrfs_num_copies(&root->fs_info->mapping_tree,
 					      eb->start, eb->len);
-printk("failed to read %Lu mirror %d total %d\n", eb->start, mirror_num, num_copies);
-		if (num_copies == 1) {
-printk("reading %Lu failed only one copy\n", eb->start);
+		if (num_copies == 1)
 			return ret;
-		}
+
 		mirror_num++;
-		if (mirror_num > num_copies) {
-printk("bailing at mirror %d of %d\n", mirror_num, num_copies);
+		if (mirror_num > num_copies)
 			return ret;
-		}
 	}
-printk("read extent buffer page last\n");
 	return -EIO;
 }
 
@@ -322,7 +315,6 @@ int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 	btrfs_clear_buffer_defrag(eb);
 	found_start = btrfs_header_bytenr(eb);
 	if (found_start != start) {
-printk("bad start on %Lu found %Lu\n", eb->start, found_start);
 		ret = -EIO;
 		goto err;
 	}
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 2368536629e7..dd403b426ff5 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3003,17 +3003,18 @@ int extent_buffer_uptodate(struct extent_io_tree *tree,
 			   struct extent_buffer *eb)
 {
 	int ret = 0;
-	int ret2;
 	unsigned long num_pages;
 	unsigned long i;
 	struct page *page;
 	int pg_uptodate = 1;
 
 	if (eb->flags & EXTENT_UPTODATE)
-		ret = 1;
+		return 1;
 
-	ret2  = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
+	ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
 			   EXTENT_UPTODATE, 1);
+	if (ret)
+		return ret;
 
 	num_pages = num_extent_pages(eb->start, eb->len);
 	for (i = 0; i < num_pages; i++) {
@@ -3023,11 +3024,7 @@ int extent_buffer_uptodate(struct extent_io_tree *tree,
 			break;
 		}
 	}
-	if ((ret || ret2) && !pg_uptodate) {
-printk("uptodate error2 eb %Lu ret %d ret2 %d pg_uptodate %d\n", eb->start, ret, ret2, pg_uptodate);
-		WARN_ON(1);
-	}
-	return (ret || ret2);
+	return pg_uptodate;
 }
 EXPORT_SYMBOL(extent_buffer_uptodate);
 
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 6476ecbf132e..9a7241134560 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1282,7 +1282,6 @@ again:
 					 num_stripes, sub_stripes);
 
 	index = 0;
-printk("new chunk type %Lu start %Lu size %Lu\n", type, key.offset, *num_bytes);
 	while(index < num_stripes) {
 		struct btrfs_stripe *stripe;
 		BUG_ON(list_empty(&private_devs));
@@ -1299,7 +1298,6 @@ printk("new chunk type %Lu start %Lu size %Lu\n", type, key.offset, *num_bytes);
 			     BTRFS_FIRST_CHUNK_TREE_OBJECTID, key.offset,
 			     calc_size, &dev_offset);
 		BUG_ON(ret);
-printk("alloc chunk start %Lu size %Lu from dev %Lu type %Lu\n", key.offset, calc_size, device->devid, type);
 		device->bytes_used += calc_size;
 		ret = btrfs_update_device(trans, device);
 		BUG_ON(ret);
-- 
cgit v1.2.3


From a236aed14ccb0661611d4416f6b573d892bdc60a Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 29 Apr 2008 09:38:00 -0400
Subject: Btrfs: Deal with failed writes in mirrored configurations

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c     | 17 +++++++++++++++--
 fs/btrfs/extent-tree.c |  4 ++--
 fs/btrfs/volumes.c     | 17 ++++++++++++++---
 fs/btrfs/volumes.h     |  3 ++-
 4 files changed, 33 insertions(+), 8 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 1c7d84aff864..e35e70165b53 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1385,7 +1385,10 @@ int write_all_supers(struct btrfs_root *root)
 	struct buffer_head *bh;
 	int ret;
 	int do_barriers;
+	int max_errors;
+	int total_errors = 0;
 
+	max_errors = btrfs_super_num_devices(&root->fs_info->super_copy) - 1;
 	do_barriers = !btrfs_test_opt(root, NOBARRIER);
 
 	sb = root->fs_info->sb_buffer;
@@ -1433,8 +1436,14 @@ int write_all_supers(struct btrfs_root *root)
 		} else {
 			ret = submit_bh(WRITE, bh);
 		}
-		BUG_ON(ret);
+		if (ret)
+			total_errors++;
 	}
+	if (total_errors > max_errors) {
+		printk("btrfs: %d errors while writing supers\n", total_errors);
+		BUG();
+	}
+	total_errors = 0;
 
 	list_for_each(cur, head) {
 		dev = list_entry(cur, struct btrfs_device, dev_list);
@@ -1454,13 +1463,17 @@ int write_all_supers(struct btrfs_root *root)
 				wait_on_buffer(bh);
 				BUG_ON(!buffer_uptodate(bh));
 			} else {
-				BUG();
+				total_errors++;
 			}
 
 		}
 		dev->pending_io = NULL;
 		brelse(bh);
 	}
+	if (total_errors > max_errors) {
+		printk("btrfs: %d errors while writing supers\n", total_errors);
+		BUG();
+	}
 	return 0;
 }
 
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 95aee5a29375..f94794a99329 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -315,8 +315,8 @@ struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
 	block_group_cache = &info->block_group_cache;
 	total_fs_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
 
-	if (!owner)
-		factor = 10;
+	if (data & BTRFS_BLOCK_GROUP_METADATA)
+		factor = 9;
 
 	bit = block_group_state_bits(data);
 
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 9a7241134560..57ab755aca76 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1425,6 +1425,7 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 	int stripe_index;
 	int i;
 	int num_stripes;
+	int max_errors = 0;
 	struct btrfs_multi_bio *multi = NULL;
 
 	if (multi_ret && !(rw & (1 << BIO_RW))) {
@@ -1436,6 +1437,8 @@ again:
 				GFP_NOFS);
 		if (!multi)
 			return -ENOMEM;
+
+		atomic_set(&multi->error, 0);
 	}
 
 	spin_lock(&em_tree->lock);
@@ -1462,8 +1465,10 @@ again:
 		if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
 				 BTRFS_BLOCK_GROUP_DUP)) {
 			stripes_required = map->num_stripes;
+			max_errors = 1;
 		} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
 			stripes_required = map->sub_stripes;
+			max_errors = 1;
 		}
 	}
 	if (multi_ret && rw == WRITE &&
@@ -1561,6 +1566,7 @@ again:
 	if (multi_ret) {
 		*multi_ret = multi;
 		multi->num_stripes = num_stripes;
+		multi->max_errors = max_errors;
 	}
 out:
 	free_extent_map(em);
@@ -1598,14 +1604,19 @@ static int end_bio_multi_stripe(struct bio *bio,
 		return 1;
 #endif
 	if (err)
-		multi->error = err;
+		atomic_inc(&multi->error);
 
 	if (atomic_dec_and_test(&multi->stripes_pending)) {
 		bio->bi_private = multi->private;
 		bio->bi_end_io = multi->end_io;
 
-		if (!err && multi->error)
-			err = multi->error;
+		/* only send an error to the higher layers if it is
+		 * beyond the tolerance of the multi-bio
+		 */
+		if (atomic_read(&multi->error) > multi->max_errors)
+			err = -EIO;
+		else
+			err = 0;
 		kfree(multi);
 
 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 3f9a17f2e41b..a9663e92bb14 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -90,7 +90,8 @@ struct btrfs_multi_bio {
 	atomic_t stripes_pending;
 	bio_end_io_t *end_io;
 	void *private;
-	int error;
+	atomic_t error;
+	int max_errors;
 	int num_stripes;
 	struct btrfs_bio_stripe stripes[];
 };
-- 
cgit v1.2.3


From 2fff734fafa742236aecbdcdc8b1ff4d221cbaca Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 29 Apr 2008 14:12:09 -0400
Subject: Btrfs: Tune stripe selection for raid1 and raid10

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 57ab755aca76..6d8dd4438c76 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1509,17 +1509,17 @@ again:
 	if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
 		if (unplug_page || (rw & (1 << BIO_RW)))
 			num_stripes = map->num_stripes;
-		else if (mirror_num) {
+		else if (mirror_num)
 			stripe_index = mirror_num - 1;
-		} else {
-			u64 orig_stripe_nr = stripe_nr;
-			stripe_index = do_div(orig_stripe_nr, num_stripes);
-		}
+		else
+			stripe_index = current->pid % map->num_stripes;
+
 	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
 		if (rw & (1 << BIO_RW))
 			num_stripes = map->num_stripes;
 		else if (mirror_num)
 			stripe_index = mirror_num - 1;
+
 	} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
 		int factor = map->num_stripes / map->sub_stripes;
 
@@ -1530,11 +1530,8 @@ again:
 			num_stripes = map->sub_stripes;
 		else if (mirror_num)
 			stripe_index += mirror_num - 1;
-		else {
-			u64 orig_stripe_nr = stripe_nr;
-			stripe_index += do_div(orig_stripe_nr,
-					       map->sub_stripes);
-		}
+		else
+			stripe_index += current->pid % map->sub_stripes;
 	} else {
 		/*
 		 * after this do_div call, stripe_nr is the number of stripes
-- 
cgit v1.2.3


From 006a58a25fe8aaa7559be80dcff94d98aee84309 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 2 May 2008 14:43:15 -0400
Subject: Btrfs: Compile warning fixup in volume.c

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 6d8dd4438c76..b38187573108 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -547,7 +547,7 @@ int btrfs_add_device(struct btrfs_trans_handle *trans,
 	struct extent_buffer *leaf;
 	struct btrfs_key key;
 	unsigned long ptr;
-	u64 free_devid;
+	u64 free_devid = 0;
 
 	root = root->fs_info->chunk_root;
 
-- 
cgit v1.2.3


From a061fc8da7b990faa41ca503e66faef3ecdeead0 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 7 May 2008 11:43:44 -0400
Subject: Btrfs: Add support for online device removal

This required a few structural changes to the code that manages bdev pointers:

The VFS super block now gets an anon-bdev instead of a pointer to the
lowest bdev.  This allows us to avoid swapping the super block bdev pointer
around at run time.

The code to read in the super block no longer goes through the extent
buffer interface.  Things got ugly keeping the mapping constant.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       |   3 +-
 fs/btrfs/disk-io.c     | 101 ++++++++++------------
 fs/btrfs/extent-tree.c | 139 +++++++++++++++++++++++++------
 fs/btrfs/inode.c       |  27 +++++-
 fs/btrfs/super.c       |  35 ++------
 fs/btrfs/transaction.c |   5 +-
 fs/btrfs/volumes.c     | 221 +++++++++++++++++++++++++++++++++++++++++++++++--
 fs/btrfs/volumes.h     |   3 +
 8 files changed, 412 insertions(+), 122 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 3b6f8524a4ad..33ab165591c5 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -505,7 +505,7 @@ struct btrfs_fs_info {
 	u64 alloc_start;
 	struct btrfs_transaction *running_transaction;
 	struct btrfs_super_block super_copy;
-	struct extent_buffer *sb_buffer;
+	struct btrfs_super_block super_for_commit;
 	struct block_device *__bdev;
 	struct super_block *sb;
 	struct inode *btree_inode;
@@ -1208,6 +1208,7 @@ BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, byte_limit, 64);
 
 /* struct btrfs_super_block */
 BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64);
+BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64);
 BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block,
 			 generation, 64);
 BTRFS_SETGET_STACK_FUNCS(super_root, struct btrfs_super_block, root, 64);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index fabc31b334b6..9d5424ad01a3 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -78,9 +78,13 @@ struct extent_map *btree_get_extent(struct inode *inode, struct page *page,
 
 	spin_lock(&em_tree->lock);
 	em = lookup_extent_mapping(em_tree, start, len);
-	spin_unlock(&em_tree->lock);
-	if (em)
+	if (em) {
+		em->bdev =
+			BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
+		spin_unlock(&em_tree->lock);
 		goto out;
+	}
+	spin_unlock(&em_tree->lock);
 
 	em = alloc_extent_map(GFP_NOFS);
 	if (!em) {
@@ -90,7 +94,7 @@ struct extent_map *btree_get_extent(struct inode *inode, struct page *page,
 	em->start = 0;
 	em->len = (u64)-1;
 	em->block_start = 0;
-	em->bdev = inode->i_sb->s_bdev;
+	em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
 
 	spin_lock(&em_tree->lock);
 	ret = add_extent_mapping(em_tree, em);
@@ -435,11 +439,6 @@ static int __btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 	ret = btrfs_bio_wq_end_io(root->fs_info, bio, 1);
 	BUG_ON(ret);
 
-	if (offset == BTRFS_SUPER_INFO_OFFSET) {
-		bio->bi_bdev = root->fs_info->fs_devices->latest_bdev;
-		submit_bio(rw, bio);
-		return 0;
-	}
 	return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num);
 }
 
@@ -587,8 +586,7 @@ static int close_all_devices(struct btrfs_fs_info *fs_info)
 	list = &fs_info->fs_devices->devices;
 	list_for_each(next, list) {
 		device = list_entry(next, struct btrfs_device, dev_list);
-		if (device->bdev && device->bdev != fs_info->sb->s_bdev)
-			close_bdev_excl(device->bdev);
+		close_bdev_excl(device->bdev);
 		device->bdev = NULL;
 	}
 	return 0;
@@ -1118,6 +1116,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	u32 leafsize;
 	u32 blocksize;
 	u32 stripesize;
+	struct buffer_head *bh;
 	struct btrfs_root *extent_root = kmalloc(sizeof(struct btrfs_root),
 						 GFP_NOFS);
 	struct btrfs_root *tree_root = kmalloc(sizeof(struct btrfs_root),
@@ -1153,7 +1152,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	spin_lock_init(&fs_info->new_trans_lock);
 
 	init_completion(&fs_info->kobj_unregister);
-	sb_set_blocksize(sb, BTRFS_SUPER_INFO_SIZE);
 	fs_info->tree_root = tree_root;
 	fs_info->extent_root = extent_root;
 	fs_info->chunk_root = chunk_root;
@@ -1170,6 +1168,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	fs_info->btree_inode->i_ino = 1;
 	fs_info->btree_inode->i_nlink = 1;
 
+	sb->s_blocksize = 4096;
+	sb->s_blocksize_bits = blksize_bits(4096);
+
 	/*
 	 * we set the i_size on the btree inode to the max possible int.
 	 * the real end of the address space is determined by all of
@@ -1229,19 +1230,16 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	__setup_root(4096, 4096, 4096, 4096, tree_root,
 		     fs_info, BTRFS_ROOT_TREE_OBJECTID);
 
-	fs_info->sb_buffer = read_tree_block(tree_root,
-					     BTRFS_SUPER_INFO_OFFSET,
-					     4096);
 
-	if (!fs_info->sb_buffer)
+	bh = __bread(fs_devices->latest_bdev,
+		     BTRFS_SUPER_INFO_OFFSET / 4096, 4096);
+	if (!bh)
 		goto fail_iput;
 
-	read_extent_buffer(fs_info->sb_buffer, &fs_info->super_copy, 0,
-			   sizeof(fs_info->super_copy));
+	memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy));
+	brelse(bh);
 
-	read_extent_buffer(fs_info->sb_buffer, fs_info->fsid,
-			   (unsigned long)btrfs_super_fsid(fs_info->sb_buffer),
-			   BTRFS_FSID_SIZE);
+	memcpy(fs_info->fsid, fs_info->super_copy.fsid, BTRFS_FSID_SIZE);
 
 	disk_super = &fs_info->super_copy;
 	if (!btrfs_super_root(disk_super))
@@ -1263,7 +1261,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	tree_root->leafsize = leafsize;
 	tree_root->sectorsize = sectorsize;
 	tree_root->stripesize = stripesize;
-	sb_set_blocksize(sb, sectorsize);
+
+	sb->s_blocksize = sectorsize;
+	sb->s_blocksize_bits = blksize_bits(sectorsize);
 
 	if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC,
 		    sizeof(disk_super->magic))) {
@@ -1339,7 +1339,6 @@ fail_tree_root:
 fail_sys_array:
 	mutex_unlock(&fs_info->fs_mutex);
 fail_sb_buffer:
-	free_extent_buffer(fs_info->sb_buffer);
 	extent_io_tree_empty_lru(&BTRFS_I(fs_info->btree_inode)->io_tree);
 fail_iput:
 	iput(fs_info->btree_inode);
@@ -1380,41 +1379,44 @@ int write_all_supers(struct btrfs_root *root)
 	struct list_head *cur;
 	struct list_head *head = &root->fs_info->fs_devices->devices;
 	struct btrfs_device *dev;
-	struct extent_buffer *sb;
+	struct btrfs_super_block *sb;
 	struct btrfs_dev_item *dev_item;
 	struct buffer_head *bh;
 	int ret;
 	int do_barriers;
 	int max_errors;
 	int total_errors = 0;
+	u32 crc;
+	u64 flags;
 
 	max_errors = btrfs_super_num_devices(&root->fs_info->super_copy) - 1;
 	do_barriers = !btrfs_test_opt(root, NOBARRIER);
 
-	sb = root->fs_info->sb_buffer;
-	dev_item = (struct btrfs_dev_item *)offsetof(struct btrfs_super_block,
-						      dev_item);
+	sb = &root->fs_info->super_for_commit;
+	dev_item = &sb->dev_item;
 	list_for_each(cur, head) {
 		dev = list_entry(cur, struct btrfs_device, dev_list);
-		btrfs_set_device_type(sb, dev_item, dev->type);
-		btrfs_set_device_id(sb, dev_item, dev->devid);
-		btrfs_set_device_total_bytes(sb, dev_item, dev->total_bytes);
-		btrfs_set_device_bytes_used(sb, dev_item, dev->bytes_used);
-		btrfs_set_device_io_align(sb, dev_item, dev->io_align);
-		btrfs_set_device_io_width(sb, dev_item, dev->io_width);
-		btrfs_set_device_sector_size(sb, dev_item, dev->sector_size);
-		write_extent_buffer(sb, dev->uuid,
-				    (unsigned long)btrfs_device_uuid(dev_item),
-				    BTRFS_UUID_SIZE);
-
-		btrfs_set_header_flag(sb, BTRFS_HEADER_FLAG_WRITTEN);
-		csum_tree_block(root, sb, 0);
-
-		bh = __getblk(dev->bdev, BTRFS_SUPER_INFO_OFFSET /
-			      root->fs_info->sb->s_blocksize,
+		btrfs_set_stack_device_type(dev_item, dev->type);
+		btrfs_set_stack_device_id(dev_item, dev->devid);
+		btrfs_set_stack_device_total_bytes(dev_item, dev->total_bytes);
+		btrfs_set_stack_device_bytes_used(dev_item, dev->bytes_used);
+		btrfs_set_stack_device_io_align(dev_item, dev->io_align);
+		btrfs_set_stack_device_io_width(dev_item, dev->io_width);
+		btrfs_set_stack_device_sector_size(dev_item, dev->sector_size);
+		memcpy(dev_item->uuid, dev->uuid, BTRFS_UUID_SIZE);
+		flags = btrfs_super_flags(sb);
+		btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN);
+
+
+		crc = ~(u32)0;
+		crc = btrfs_csum_data(root, (char *)sb + BTRFS_CSUM_SIZE, crc,
+				      BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE);
+		btrfs_csum_final(crc, sb->csum);
+
+		bh = __getblk(dev->bdev, BTRFS_SUPER_INFO_OFFSET / 4096,
 			      BTRFS_SUPER_INFO_SIZE);
 
-		read_extent_buffer(sb, bh->b_data, 0, BTRFS_SUPER_INFO_SIZE);
+		memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE);
 		dev->pending_io = bh;
 
 		get_bh(bh);
@@ -1483,15 +1485,6 @@ int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root
 	int ret;
 
 	ret = write_all_supers(root);
-#if 0
-	if (!btrfs_test_opt(root, NOBARRIER))
-		blkdev_issue_flush(sb->s_bdev, NULL);
-	set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, super);
-	ret = sync_page_range_nolock(btree_inode, btree_inode->i_mapping,
-				     super->start, super->len);
-	if (!btrfs_test_opt(root, NOBARRIER))
-		blkdev_issue_flush(sb->s_bdev, NULL);
-#endif
 	return ret;
 }
 
@@ -1570,8 +1563,6 @@ int close_ctree(struct btrfs_root *root)
 	if (root->fs_info->dev_root->node);
 		free_extent_buffer(root->fs_info->dev_root->node);
 
-	free_extent_buffer(fs_info->sb_buffer);
-
 	btrfs_free_block_groups(root->fs_info);
 	del_fs_roots(fs_info);
 
@@ -1652,7 +1643,7 @@ void btrfs_throttle(struct btrfs_root *root)
 {
 	struct backing_dev_info *bdi;
 
-	bdi = root->fs_info->sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
+	bdi = &root->fs_info->bdi;
 	if (root->fs_info->throttles && bdi_write_congested(bdi)) {
 #if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,18)
 		congestion_wait(WRITE, HZ/20);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index f94794a99329..c0e67bde8428 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -147,6 +147,8 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group(struct
 	u64 end;
 	int ret;
 
+	bytenr = max_t(u64, bytenr,
+		       BTRFS_SUPER_INFO_OFFSET + BTRFS_SUPER_INFO_SIZE);
 	block_group_cache = &info->block_group_cache;
 	ret = find_first_extent_bit(block_group_cache,
 				    bytenr, &start, &end,
@@ -1059,16 +1061,25 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
 	}
 }
 
-static u64 reduce_alloc_profile(u64 flags)
+static u64 reduce_alloc_profile(struct btrfs_root *root, u64 flags)
 {
+	u64 num_devices = root->fs_info->fs_devices->num_devices;
+
+	if (num_devices == 1)
+		flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
+	if (num_devices < 4)
+		flags &= ~BTRFS_BLOCK_GROUP_RAID10;
+
 	if ((flags & BTRFS_BLOCK_GROUP_DUP) &&
 	    (flags & (BTRFS_BLOCK_GROUP_RAID1 |
-		      BTRFS_BLOCK_GROUP_RAID10)))
+		      BTRFS_BLOCK_GROUP_RAID10))) {
 		flags &= ~BTRFS_BLOCK_GROUP_DUP;
+	}
 
 	if ((flags & BTRFS_BLOCK_GROUP_RAID1) &&
-	    (flags & BTRFS_BLOCK_GROUP_RAID10))
+	    (flags & BTRFS_BLOCK_GROUP_RAID10)) {
 		flags &= ~BTRFS_BLOCK_GROUP_RAID1;
+	}
 
 	if ((flags & BTRFS_BLOCK_GROUP_RAID0) &&
 	    ((flags & BTRFS_BLOCK_GROUP_RAID1) |
@@ -1078,7 +1089,6 @@ static u64 reduce_alloc_profile(u64 flags)
 	return flags;
 }
 
-
 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *extent_root, u64 alloc_bytes,
 			  u64 flags)
@@ -1089,7 +1099,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
 	u64 num_bytes;
 	int ret;
 
-	flags = reduce_alloc_profile(flags);
+	flags = reduce_alloc_profile(extent_root, flags);
 
 	space_info = __find_space_info(extent_root->fs_info, flags);
 	if (!space_info) {
@@ -1169,6 +1179,21 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
+static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
+{
+	u64 start;
+	u64 end;
+	int ret;
+	ret = find_first_extent_bit(&root->fs_info->block_group_cache,
+				    search_start, &start, &end,
+				    BLOCK_GROUP_DATA | BLOCK_GROUP_METADATA |
+				    BLOCK_GROUP_SYSTEM);
+	if (ret)
+		return 0;
+	return start;
+}
+
+
 static int update_pinned_extents(struct btrfs_root *root,
 				u64 bytenr, u64 num, int pin)
 {
@@ -1185,16 +1210,25 @@ static int update_pinned_extents(struct btrfs_root *root,
 	}
 	while (num > 0) {
 		cache = btrfs_lookup_block_group(fs_info, bytenr);
-		WARN_ON(!cache);
-		len = min(num, cache->key.offset -
-			  (bytenr - cache->key.objectid));
+		if (!cache) {
+			u64 first = first_logical_byte(root, bytenr);
+			WARN_ON(first < bytenr);
+			len = min(first - bytenr, num);
+		} else {
+			len = min(num, cache->key.offset -
+				  (bytenr - cache->key.objectid));
+		}
 		if (pin) {
-			cache->pinned += len;
-			cache->space_info->bytes_pinned += len;
+			if (cache) {
+				cache->pinned += len;
+				cache->space_info->bytes_pinned += len;
+			}
 			fs_info->total_pinned += len;
 		} else {
-			cache->pinned -= len;
-			cache->space_info->bytes_pinned -= len;
+			if (cache) {
+				cache->pinned -= len;
+				cache->space_info->bytes_pinned -= len;
+			}
 			fs_info->total_pinned -= len;
 		}
 		bytenr += len;
@@ -1547,7 +1581,7 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans,
 				     int data)
 {
 	int ret;
-	u64 orig_search_start = search_start;
+	u64 orig_search_start;
 	struct btrfs_root * root = orig_root->fs_info->extent_root;
 	struct btrfs_fs_info *info = root->fs_info;
 	u64 total_needed = num_bytes;
@@ -1577,6 +1611,9 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans,
 		}
 	}
 
+	search_start = max(search_start, first_logical_byte(root, 0));
+	orig_search_start = search_start;
+
 	if (search_end == (u64)-1)
 		search_end = btrfs_super_total_bytes(&info->super_copy);
 
@@ -1751,7 +1788,7 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 		data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
 	}
 again:
-	data = reduce_alloc_profile(data);
+	data = reduce_alloc_profile(root, data);
 	if (root->ref_cows) {
 		if (!(data & BTRFS_BLOCK_GROUP_METADATA)) {
 			ret = do_chunk_alloc(trans, root->fs_info->extent_root,
@@ -2309,6 +2346,7 @@ static int noinline relocate_inode_pages(struct inode *inode, u64 start,
 	struct file_ra_state *ra;
 	unsigned long total_read = 0;
 	unsigned long ra_pages;
+	struct btrfs_trans_handle *trans;
 
 	ra = kzalloc(sizeof(*ra), GFP_NOFS);
 
@@ -2326,9 +2364,13 @@ static int noinline relocate_inode_pages(struct inode *inode, u64 start,
 				       calc_ra(i, last_index, ra_pages));
 		}
 		total_read++;
+		if (((u64)i << PAGE_CACHE_SHIFT) > inode->i_size)
+			goto truncate_racing;
+
 		page = grab_cache_page(inode->i_mapping, i);
-		if (!page)
+		if (!page) {
 			goto out_unlock;
+		}
 		if (!PageUptodate(page)) {
 			btrfs_readpage(NULL, page);
 			lock_page(page);
@@ -2350,20 +2392,33 @@ static int noinline relocate_inode_pages(struct inode *inode, u64 start,
 
 		lock_extent(io_tree, page_start, page_end, GFP_NOFS);
 
-		set_page_dirty(page);
 		set_extent_delalloc(io_tree, page_start,
 				    page_end, GFP_NOFS);
+		set_page_dirty(page);
 
 		unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
 		unlock_page(page);
 		page_cache_release(page);
-		balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1);
 	}
+	balance_dirty_pages_ratelimited_nr(inode->i_mapping,
+					   total_read);
 
 out_unlock:
 	kfree(ra);
+	trans = btrfs_start_transaction(BTRFS_I(inode)->root, 1);
+	if (trans) {
+		btrfs_add_ordered_inode(inode);
+		btrfs_end_transaction(trans, BTRFS_I(inode)->root);
+		mark_inode_dirty(inode);
+	}
 	mutex_unlock(&inode->i_mutex);
 	return 0;
+
+truncate_racing:
+	vmtruncate(inode, inode->i_size);
+	balance_dirty_pages_ratelimited_nr(inode->i_mapping,
+					   total_read);
+	goto out_unlock;
 }
 
 /*
@@ -2466,6 +2521,27 @@ out:
 	return 0;
 }
 
+static int noinline del_extent_zero(struct btrfs_root *extent_root,
+				    struct btrfs_path *path,
+				    struct btrfs_key *extent_key)
+{
+	int ret;
+	struct btrfs_trans_handle *trans;
+
+	trans = btrfs_start_transaction(extent_root, 1);
+	ret = btrfs_search_slot(trans, extent_root, extent_key, path, -1, 1);
+	if (ret > 0) {
+		ret = -EIO;
+		goto out;
+	}
+	if (ret < 0)
+		goto out;
+	ret = btrfs_del_item(trans, extent_root, path);
+out:
+	btrfs_end_transaction(trans, extent_root);
+	return ret;
+}
+
 static int noinline relocate_one_extent(struct btrfs_root *extent_root,
 					struct btrfs_path *path,
 					struct btrfs_key *extent_key)
@@ -2477,6 +2553,10 @@ static int noinline relocate_one_extent(struct btrfs_root *extent_root,
 	u32 item_size;
 	int ret = 0;
 
+	if (extent_key->objectid == 0) {
+		ret = del_extent_zero(extent_root, path, extent_key);
+		goto out;
+	}
 	key.objectid = extent_key->objectid;
 	key.type = BTRFS_EXTENT_REF_KEY;
 	key.offset = 0;
@@ -2490,15 +2570,24 @@ static int noinline relocate_one_extent(struct btrfs_root *extent_root,
 		ret = 0;
 		leaf = path->nodes[0];
 		nritems = btrfs_header_nritems(leaf);
-		if (path->slots[0] == nritems)
-			goto out;
+		if (path->slots[0] == nritems) {
+			ret = btrfs_next_leaf(extent_root, path);
+			if (ret > 0) {
+				ret = 0;
+				goto out;
+			}
+			if (ret < 0)
+				goto out;
+		}
 
 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
-		if (found_key.objectid != extent_key->objectid)
+		if (found_key.objectid != extent_key->objectid) {
 			break;
+		}
 
-		if (found_key.type != BTRFS_EXTENT_REF_KEY)
+		if (found_key.type != BTRFS_EXTENT_REF_KEY) {
 			break;
+		}
 
 		key.offset = found_key.offset + 1;
 		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
@@ -2519,7 +2608,7 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
 	u64 stripped = BTRFS_BLOCK_GROUP_RAID0 |
 		BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
 
-	num_devices = btrfs_super_num_devices(&root->fs_info->super_copy);
+	num_devices = root->fs_info->fs_devices->num_devices;
 	if (num_devices == 1) {
 		stripped |= BTRFS_BLOCK_GROUP_DUP;
 		stripped = flags & ~stripped;
@@ -2535,9 +2624,6 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
 		return flags;
 	} else {
 		/* they already had raid on here, just return */
-		if ((flags & BTRFS_BLOCK_GROUP_DUP) &&
-		    (flags & BTRFS_BLOCK_GROUP_RAID1)) {
-		}
 		if (flags & stripped)
 			return flags;
 
@@ -2570,7 +2656,7 @@ int btrfs_shrink_extent_tree(struct btrfs_root *root, u64 shrink_start)
 	struct extent_buffer *leaf;
 	u32 nritems;
 	int ret;
-	int progress = 0;
+	int progress;
 
 	shrink_block_group = btrfs_lookup_block_group(root->fs_info,
 						      shrink_start);
@@ -2597,6 +2683,7 @@ again:
 	shrink_block_group->ro = 1;
 
 	total_found = 0;
+	progress = 0;
 	key.objectid = shrink_start;
 	key.offset = 0;
 	key.type = 0;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index f7beb9b0d37a..b437d3bdf95e 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2194,6 +2194,8 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
 again:
 	spin_lock(&em_tree->lock);
 	em = lookup_extent_mapping(em_tree, start, len);
+	if (em)
+		em->bdev = root->fs_info->fs_devices->latest_bdev;
 	spin_unlock(&em_tree->lock);
 
 	if (em) {
@@ -2212,7 +2214,7 @@ again:
 
 	em->start = EXTENT_MAP_HOLE;
 	em->len = (u64)-1;
-	em->bdev = inode->i_sb->s_bdev;
+	em->bdev = root->fs_info->fs_devices->latest_bdev;
 	ret = btrfs_lookup_file_extent(trans, root, path,
 				       objectid, start, trans != NULL);
 	if (ret < 0) {
@@ -3101,6 +3103,27 @@ out:
 	return ret;
 }
 
+long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg)
+{
+	struct btrfs_ioctl_vol_args *vol_args;
+	int ret;
+
+	vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS);
+
+	if (!vol_args)
+		return -ENOMEM;
+
+	if (copy_from_user(vol_args, arg, sizeof(*vol_args))) {
+		ret = -EFAULT;
+		goto out;
+	}
+	ret = btrfs_rm_device(root, vol_args->name);
+
+out:
+	kfree(vol_args);
+	return ret;
+}
+
 int dup_item_to_inode(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root,
 		       struct btrfs_path *path,
@@ -3294,6 +3317,8 @@ long btrfs_ioctl(struct file *file, unsigned int
 		return btrfs_ioctl_resize(root, (void __user *)arg);
 	case BTRFS_IOC_ADD_DEV:
 		return btrfs_ioctl_add_dev(root, (void __user *)arg);
+	case BTRFS_IOC_RM_DEV:
+		return btrfs_ioctl_rm_dev(root, (void __user *)arg);
 	case BTRFS_IOC_BALANCE:
 		return btrfs_balance(root->fs_info->dev_root);
 	case BTRFS_IOC_CLONE:
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 7153dfaa3404..020e5a83e31f 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -315,24 +315,12 @@ static void btrfs_write_super(struct super_block *sb)
 	sb->s_dirt = 0;
 }
 
-/*
- * This is almost a copy of get_sb_bdev in fs/super.c.
- * We need the local copy to allow direct mounting of
- * subvolumes, but this could be easily integrated back
- * into the generic version.  --hch
- */
-
-/* start copy & paste */
-static int set_bdev_super(struct super_block *s, void *data)
+static int btrfs_test_super(struct super_block *s, void *data)
 {
-	s->s_bdev = data;
-	s->s_dev = s->s_bdev->bd_dev;
-	return 0;
-}
+	struct btrfs_fs_devices *test_fs_devices = data;
+	struct btrfs_root *root = btrfs_sb(s);
 
-static int test_bdev_super(struct super_block *s, void *data)
-{
-	return (void *)s->s_bdev == data;
+	return root->fs_info->fs_devices == test_fs_devices;
 }
 
 int btrfs_get_sb_bdev(struct file_system_type *fs_type,
@@ -354,14 +342,9 @@ int btrfs_get_sb_bdev(struct file_system_type *fs_type,
 		return error;
 
 	bdev = fs_devices->lowest_bdev;
-	/*
-	 * once the super is inserted into the list by sget, s_umount
-	 * will protect the lockfs code from trying to start a snapshot
-	 * while we are mounting
-	 */
-	down(&bdev->bd_mount_sem);
-	s = sget(fs_type, test_bdev_super, set_bdev_super, bdev);
-	up(&bdev->bd_mount_sem);
+	btrfs_lock_volumes();
+	s = sget(fs_type, btrfs_test_super, set_anon_super, fs_devices);
+	btrfs_unlock_volumes();
 	if (IS_ERR(s))
 		goto error_s;
 
@@ -373,13 +356,11 @@ int btrfs_get_sb_bdev(struct file_system_type *fs_type,
 			goto error_bdev;
 		}
 
-		close_bdev_excl(bdev);
 	} else {
 		char b[BDEVNAME_SIZE];
 
 		s->s_flags = flags;
 		strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
-		sb_set_blocksize(s, block_size(bdev));
 		error = btrfs_fill_super(s, fs_devices, data,
 					 flags & MS_SILENT ? 1 : 0);
 		if (error) {
@@ -458,7 +439,7 @@ static struct file_system_type btrfs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "btrfs",
 	.get_sb		= btrfs_get_sb,
-	.kill_sb	= kill_block_super,
+	.kill_sb	= kill_anon_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
 
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 9826942fa18a..57746c11eae3 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -738,9 +738,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 				   chunk_root->node->start);
 	btrfs_set_super_chunk_root_level(&root->fs_info->super_copy,
 					 btrfs_header_level(chunk_root->node));
-	write_extent_buffer(root->fs_info->sb_buffer,
-			    &root->fs_info->super_copy, 0,
-			    sizeof(root->fs_info->super_copy));
+	memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy,
+	       sizeof(root->fs_info->super_copy));
 
 	btrfs_copy_pinned(root, pinned_copy);
 
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index b38187573108..55da5f0c56e3 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -45,6 +45,16 @@ struct map_lookup {
 static DEFINE_MUTEX(uuid_mutex);
 static LIST_HEAD(fs_uuids);
 
+void btrfs_lock_volumes(void)
+{
+	mutex_lock(&uuid_mutex);
+}
+
+void btrfs_unlock_volumes(void)
+{
+	mutex_unlock(&uuid_mutex);
+}
+
 int btrfs_cleanup_fs_uuids(void)
 {
 	struct btrfs_fs_devices *fs_devices;
@@ -193,12 +203,14 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 			ret = PTR_ERR(bdev);
 			goto fail;
 		}
+		set_blocksize(bdev, 4096);
 		if (device->devid == fs_devices->latest_devid)
 			fs_devices->latest_bdev = bdev;
 		if (device->devid == fs_devices->lowest_devid) {
 			fs_devices->lowest_bdev = bdev;
 		}
 		device->bdev = bdev;
+
 	}
 	mutex_unlock(&uuid_mutex);
 	return 0;
@@ -393,6 +405,9 @@ int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
 	struct btrfs_path *path;
 	struct btrfs_root *root = device->dev_root;
 	struct btrfs_key key;
+	struct btrfs_key found_key;
+	struct extent_buffer *leaf = NULL;
+	struct btrfs_dev_extent *extent = NULL;
 
 	path = btrfs_alloc_path();
 	if (!path)
@@ -403,8 +418,25 @@ int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
 	key.type = BTRFS_DEV_EXTENT_KEY;
 
 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+	if (ret > 0) {
+		ret = btrfs_previous_item(root, path, key.objectid,
+					  BTRFS_DEV_EXTENT_KEY);
+		BUG_ON(ret);
+		leaf = path->nodes[0];
+		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+		extent = btrfs_item_ptr(leaf, path->slots[0],
+					struct btrfs_dev_extent);
+		BUG_ON(found_key.offset > start || found_key.offset +
+		       btrfs_dev_extent_length(leaf, extent) < start);
+		ret = 0;
+	} else if (ret == 0) {
+		leaf = path->nodes[0];
+		extent = btrfs_item_ptr(leaf, path->slots[0],
+					struct btrfs_dev_extent);
+	}
 	BUG_ON(ret);
 
+	device->bytes_used -= btrfs_dev_extent_length(leaf, extent);
 	ret = btrfs_del_item(trans, root, path);
 	BUG_ON(ret);
 
@@ -593,6 +625,170 @@ out:
 	return ret;
 }
 
+static int btrfs_rm_dev_item(struct btrfs_root *root,
+			     struct btrfs_device *device)
+{
+	int ret;
+	struct btrfs_path *path;
+	struct block_device *bdev = device->bdev;
+	struct btrfs_device *next_dev;
+	struct btrfs_key key;
+	u64 total_bytes;
+	struct btrfs_fs_devices *fs_devices;
+	struct btrfs_trans_handle *trans;
+
+	root = root->fs_info->chunk_root;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	trans = btrfs_start_transaction(root, 1);
+	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
+	key.type = BTRFS_DEV_ITEM_KEY;
+	key.offset = device->devid;
+
+	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+	if (ret < 0)
+		goto out;
+
+	if (ret > 0) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	ret = btrfs_del_item(trans, root, path);
+	if (ret)
+		goto out;
+
+	/*
+	 * at this point, the device is zero sized.  We want to
+	 * remove it from the devices list and zero out the old super
+	 */
+	list_del_init(&device->dev_list);
+	list_del_init(&device->dev_alloc_list);
+	fs_devices = root->fs_info->fs_devices;
+
+	next_dev = list_entry(fs_devices->devices.next, struct btrfs_device,
+			      dev_list);
+	if (bdev == fs_devices->lowest_bdev)
+		fs_devices->lowest_bdev = next_dev->bdev;
+	if (bdev == root->fs_info->sb->s_bdev)
+		root->fs_info->sb->s_bdev = next_dev->bdev;
+	if (bdev == fs_devices->latest_bdev)
+		fs_devices->latest_bdev = next_dev->bdev;
+
+	total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
+	btrfs_set_super_total_bytes(&root->fs_info->super_copy,
+				    total_bytes - device->total_bytes);
+
+	total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy);
+	btrfs_set_super_num_devices(&root->fs_info->super_copy,
+				    total_bytes - 1);
+out:
+	btrfs_free_path(path);
+	btrfs_commit_transaction(trans, root);
+	return ret;
+}
+
+int btrfs_rm_device(struct btrfs_root *root, char *device_path)
+{
+	struct btrfs_device *device;
+	struct block_device *bdev;
+	struct buffer_head *bh;
+	struct btrfs_super_block *disk_super;
+	u64 all_avail;
+	u64 devid;
+	int ret = 0;
+
+	mutex_lock(&root->fs_info->fs_mutex);
+	mutex_lock(&uuid_mutex);
+
+	all_avail = root->fs_info->avail_data_alloc_bits |
+		root->fs_info->avail_system_alloc_bits |
+		root->fs_info->avail_metadata_alloc_bits;
+
+	if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) &&
+	    root->fs_info->fs_devices->num_devices <= 4) {
+		printk("btrfs: unable to go below four devices on raid10\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) &&
+	    root->fs_info->fs_devices->num_devices <= 2) {
+		printk("btrfs: unable to go below two devices on raid1\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	bdev = open_bdev_excl(device_path, 0, root->fs_info->bdev_holder);
+	if (IS_ERR(bdev)) {
+		ret = PTR_ERR(bdev);
+		goto out;
+	}
+
+	bh = __bread(bdev, BTRFS_SUPER_INFO_OFFSET / 4096, 4096);
+	if (!bh) {
+		ret = -EIO;
+		goto error_close;
+	}
+	disk_super = (struct btrfs_super_block *)bh->b_data;
+	if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC,
+	    sizeof(disk_super->magic))) {
+		ret = -ENOENT;
+		goto error_brelse;
+	}
+	if (memcmp(disk_super->fsid, root->fs_info->fsid, BTRFS_FSID_SIZE)) {
+		ret = -ENOENT;
+		goto error_brelse;
+	}
+	devid = le64_to_cpu(disk_super->dev_item.devid);
+	device = btrfs_find_device(root, devid, NULL);
+	if (!device) {
+		ret = -ENOENT;
+		goto error_brelse;
+	}
+
+	root->fs_info->fs_devices->num_devices--;
+
+	ret = btrfs_shrink_device(device, 0);
+	if (ret)
+		goto error_brelse;
+
+
+	ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device);
+	if (ret)
+		goto error_brelse;
+
+	/* make sure this device isn't detected as part of the FS anymore */
+	memset(&disk_super->magic, 0, sizeof(disk_super->magic));
+	set_buffer_dirty(bh);
+	sync_dirty_buffer(bh);
+
+	brelse(bh);
+
+	/* one close for the device struct or super_block */
+	close_bdev_excl(device->bdev);
+
+	/* one close for us */
+	close_bdev_excl(device->bdev);
+
+	kfree(device->name);
+	kfree(device);
+	ret = 0;
+	goto out;
+
+error_brelse:
+	brelse(bh);
+error_close:
+	close_bdev_excl(bdev);
+out:
+	mutex_unlock(&uuid_mutex);
+	mutex_unlock(&root->fs_info->fs_mutex);
+	return ret;
+}
+
 int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 {
 	struct btrfs_trans_handle *trans;
@@ -831,13 +1027,17 @@ int btrfs_relocate_chunk(struct btrfs_root *root,
 	em = lookup_extent_mapping(em_tree, chunk_offset, 1);
 	spin_unlock(&em_tree->lock);
 
-	BUG_ON(em->start > chunk_offset || em->start + em->len < chunk_offset);
+	BUG_ON(em->start > chunk_offset ||
+	       em->start + em->len < chunk_offset);
 	map = (struct map_lookup *)em->bdev;
 
 	for (i = 0; i < map->num_stripes; i++) {
 		ret = btrfs_free_dev_extent(trans, map->stripes[i].dev,
 					    map->stripes[i].physical);
 		BUG_ON(ret);
+
+		ret = btrfs_update_device(trans, map->stripes[i].dev);
+		BUG_ON(ret);
 	}
 	ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid,
 			       chunk_offset);
@@ -847,11 +1047,8 @@ int btrfs_relocate_chunk(struct btrfs_root *root,
 	if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
 		ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset);
 		BUG_ON(ret);
-		goto out;
 	}
 
-
-
 	spin_lock(&em_tree->lock);
 	remove_extent_mapping(em_tree, em);
 	kfree(map);
@@ -861,7 +1058,6 @@ int btrfs_relocate_chunk(struct btrfs_root *root,
 	free_extent_map(em);
 	spin_unlock(&em_tree->lock);
 
-out:
 	/* once for us */
 	free_extent_map(em);
 
@@ -1449,7 +1645,7 @@ again:
 		return 0;
 
 	if (!em) {
-		printk("unable to find logical %Lu\n", logical);
+		printk("unable to find logical %Lu len %Lu\n", logical, *length);
 		BUG();
 	}
 
@@ -1712,6 +1908,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
 
 	logical = key->offset;
 	length = btrfs_chunk_length(leaf, chunk);
+
 	spin_lock(&map_tree->map_tree.lock);
 	em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
 	spin_unlock(&map_tree->map_tree.lock);
@@ -1845,7 +2042,7 @@ int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf)
 int btrfs_read_sys_array(struct btrfs_root *root)
 {
 	struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
-	struct extent_buffer *sb = root->fs_info->sb_buffer;
+	struct extent_buffer *sb;
 	struct btrfs_disk_key *disk_key;
 	struct btrfs_chunk *chunk;
 	u8 *ptr;
@@ -1857,6 +2054,12 @@ int btrfs_read_sys_array(struct btrfs_root *root)
 	u32 cur;
 	struct btrfs_key key;
 
+	sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET,
+					  BTRFS_SUPER_INFO_SIZE);
+	if (!sb)
+		return -ENOMEM;
+	btrfs_set_buffer_uptodate(sb);
+	write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
 	array_size = btrfs_super_sys_array_size(super_copy);
 
 	ptr = super_copy->sys_chunk_array;
@@ -1867,8 +2070,7 @@ int btrfs_read_sys_array(struct btrfs_root *root)
 		disk_key = (struct btrfs_disk_key *)ptr;
 		btrfs_disk_key_to_cpu(&key, disk_key);
 
-		len = sizeof(*disk_key);
-		ptr += len;
+		len = sizeof(*disk_key); ptr += len;
 		sb_ptr += len;
 		cur += len;
 
@@ -1887,6 +2089,7 @@ int btrfs_read_sys_array(struct btrfs_root *root)
 		sb_ptr += len;
 		cur += len;
 	}
+	free_extent_buffer(sb);
 	return ret;
 }
 
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index a9663e92bb14..0f94a69e6eb6 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -125,6 +125,7 @@ int btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
 int btrfs_add_device(struct btrfs_trans_handle *trans,
 		     struct btrfs_root *root,
 		     struct btrfs_device *device);
+int btrfs_rm_device(struct btrfs_root *root, char *device_path);
 int btrfs_cleanup_fs_uuids(void);
 int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len);
 int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree,
@@ -136,4 +137,6 @@ struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
 int btrfs_init_new_device(struct btrfs_root *root, char *path);
 int btrfs_balance(struct btrfs_root *dev_root);
+void btrfs_unlock_volumes(void);
+void btrfs_lock_volumes(void);
 #endif
-- 
cgit v1.2.3


From c1c4d91c38e6e5b29543b02d4a4fb7f007e44f70 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 8 May 2008 15:05:58 -0400
Subject: Btrfs: Only open block devices once during mount -o subvol=

btrfs_open_devices needed a check to see if the device was already
open.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 55da5f0c56e3..aa001b08ced0 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -196,6 +196,9 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 	mutex_lock(&uuid_mutex);
 	list_for_each(cur, head) {
 		device = list_entry(cur, struct btrfs_device, dev_list);
+		if (device->bdev)
+			continue;
+
 		bdev = open_bdev_excl(device->name, flags, holder);
 
 		if (IS_ERR(bdev)) {
-- 
cgit v1.2.3


From 323da79c9f096ed4da04e5ea00f766f75b28aeaa Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 9 May 2008 11:46:48 -0400
Subject: Btrfs: Chunk relocation fine tuning, and add a few printks to show
 progress

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 20 +++++++++++++++++---
 fs/btrfs/volumes.c     |  2 ++
 2 files changed, 19 insertions(+), 3 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index db996f0edf0b..df95fb660d49 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2493,7 +2493,7 @@ static int find_root_for_ref(struct btrfs_root *root,
 					btrfs_file_extent_disk_bytenr(leaf,
 							       file_extent);
 		       }
-		} else if (ret == 0) {
+		} else if (!file_key) {
 			if (path->nodes[level])
 				found_bytenr = path->nodes[level]->start;
 		}
@@ -2797,14 +2797,25 @@ int btrfs_shrink_extent_tree(struct btrfs_root *root, u64 shrink_start)
 	root = root->fs_info->extent_root;
 	path->reada = 2;
 
+	printk("btrfs relocating block group %llu flags %llu\n",
+	       (unsigned long long)shrink_start,
+	       (unsigned long long)shrink_block_group->flags);
+
 again:
 	if (btrfs_block_group_used(&shrink_block_group->item) > 0) {
+		u64 calc;
+
 		trans = btrfs_start_transaction(root, 1);
 		new_alloc_flags = update_block_group_flags(root,
 						   shrink_block_group->flags);
+		if (new_alloc_flags != shrink_block_group->flags) {
+			calc =
+			     btrfs_block_group_used(&shrink_block_group->item);
+		} else {
+			calc = shrink_block_group->key.offset;
+		}
 		do_chunk_alloc(trans, root->fs_info->extent_root,
-			btrfs_block_group_used(&shrink_block_group->item) +
-			2 * 1024 * 1024, new_alloc_flags);
+			       calc + 2 * 1024 * 1024, new_alloc_flags);
 		btrfs_end_transaction(trans, root);
 	}
 	shrink_block_group->ro = 1;
@@ -2888,6 +2899,9 @@ next:
 	btrfs_release_path(root, path);
 
 	if (total_found > 0) {
+		printk("btrfs relocate found %llu last extent was %llu\n",
+		       (unsigned long long)total_found,
+		       (unsigned long long)found_key.objectid);
 		trans = btrfs_start_transaction(tree_root, 1);
 		btrfs_commit_transaction(trans, tree_root);
 
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index aa001b08ced0..b5d7bd1915b4 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1011,6 +1011,8 @@ int btrfs_relocate_chunk(struct btrfs_root *root,
 	int ret;
 	int i;
 
+	printk("btrfs relocating chunk %llu\n",
+	       (unsigned long long)chunk_offset);
 	root = root->fs_info->chunk_root;
 	extent_root = root->fs_info->extent_root;
 	em_tree = &root->fs_info->mapping_tree.map_tree;
-- 
cgit v1.2.3


From 1259ab75c62462b8ffad90067b5e1f6312786a18 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 12 May 2008 13:39:03 -0400
Subject: Btrfs: Handle write errors on raid1 and raid10

When duplicate copies exist, writes are allowed to fail to one of those
copies.  This changeset includes a few changes that allow the FS to
continue even when some IOs fail.

It also adds verification of the parent generation number for btree blocks.
This generation is stored in the pointer to a block, and it ensures
that missed writes are detected.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c       |  2 +-
 fs/btrfs/disk-io.c     | 56 ++++++++++++++++++++++++++++++++++++++----
 fs/btrfs/disk-io.h     |  2 +-
 fs/btrfs/extent-tree.c |  4 +--
 fs/btrfs/extent_io.c   | 49 +++++++++++++++++++++++++++++++------
 fs/btrfs/extent_io.h   |  7 +++++-
 fs/btrfs/inode.c       | 66 +++++++++++++++++++++++++++++++-------------------
 fs/btrfs/tree-defrag.c |  7 +++---
 fs/btrfs/volumes.c     | 11 ++++++---
 9 files changed, 155 insertions(+), 49 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 289d71d8653a..02e571e6ee62 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -379,7 +379,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 
 		cur = btrfs_find_tree_block(root, blocknr, blocksize);
 		if (cur)
-			uptodate = btrfs_buffer_uptodate(cur);
+			uptodate = btrfs_buffer_uptodate(cur, gen);
 		else
 			uptodate = 0;
 		if (!cur || !uptodate) {
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index edee7a44f861..574b1245964e 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -205,6 +205,33 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
 	return 0;
 }
 
+static int verify_parent_transid(struct extent_io_tree *io_tree,
+				 struct extent_buffer *eb, u64 parent_transid)
+{
+	int ret;
+
+	if (!parent_transid || btrfs_header_generation(eb) == parent_transid)
+		return 0;
+
+	lock_extent(io_tree, eb->start, eb->start + eb->len - 1, GFP_NOFS);
+	if (extent_buffer_uptodate(io_tree, eb) &&
+	    btrfs_header_generation(eb) == parent_transid) {
+		ret = 0;
+		goto out;
+	}
+	printk("parent transid verify failed on %llu wanted %llu found %llu\n",
+	       (unsigned long long)eb->start,
+	       (unsigned long long)parent_transid,
+	       (unsigned long long)btrfs_header_generation(eb));
+	ret = 1;
+out:
+	clear_extent_buffer_uptodate(io_tree, eb);
+	unlock_extent(io_tree, eb->start, eb->start + eb->len - 1,
+		      GFP_NOFS);
+	return ret;
+
+}
+
 static int btree_read_extent_buffer_pages(struct btrfs_root *root,
 					  struct extent_buffer *eb,
 					  u64 start, u64 parent_transid)
@@ -218,7 +245,8 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
 	while (1) {
 		ret = read_extent_buffer_pages(io_tree, eb, start, 1,
 					       btree_get_extent, mirror_num);
-		if (!ret)
+		if (!ret &&
+		    !verify_parent_transid(io_tree, eb, parent_transid))
 			return ret;
 
 		num_copies = btrfs_num_copies(&root->fs_info->mapping_tree,
@@ -330,6 +358,13 @@ int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 		ret = -EIO;
 		goto err;
 	}
+	if (memcmp_extent_buffer(eb, root->fs_info->fsid,
+				 (unsigned long)btrfs_header_fsid(eb),
+				 BTRFS_FSID_SIZE)) {
+		printk("bad fsid on block %Lu\n", eb->start);
+		ret = -EIO;
+		goto err;
+	}
 	found_level = btrfs_header_level(eb);
 
 	ret = csum_tree_block(root, eb, 1);
@@ -1363,7 +1398,9 @@ static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
 					"I/O error on %s\n",
 				       bdevname(bh->b_bdev, b));
 		}
-		set_buffer_write_io_error(bh);
+		/* note, we dont' set_buffer_write_io_error because we have
+		 * our own ways of dealing with the IO errors
+		 */
 		clear_buffer_uptodate(bh);
 	}
 	unlock_buffer(bh);
@@ -1459,7 +1496,8 @@ int write_all_supers(struct btrfs_root *root)
 				ret = submit_bh(WRITE, bh);
 				BUG_ON(ret);
 				wait_on_buffer(bh);
-				BUG_ON(!buffer_uptodate(bh));
+				if (!buffer_uptodate(bh))
+					total_errors++;
 			} else {
 				total_errors++;
 			}
@@ -1607,10 +1645,18 @@ int close_ctree(struct btrfs_root *root)
 	return 0;
 }
 
-int btrfs_buffer_uptodate(struct extent_buffer *buf)
+int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid)
 {
+	int ret;
 	struct inode *btree_inode = buf->first_page->mapping->host;
-	return extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, buf);
+
+	ret = extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, buf);
+	if (!ret)
+		return ret;
+
+	ret = verify_parent_transid(&BTRFS_I(btree_inode)->io_tree, buf,
+				    parent_transid);
+	return !ret;
 }
 
 int btrfs_set_buffer_uptodate(struct extent_buffer *buf)
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index e29c895d5230..30d1ed293c25 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -56,7 +56,7 @@ int btrfs_insert_dev_radix(struct btrfs_root *root,
 void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr);
 int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root);
 void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
-int btrfs_buffer_uptodate(struct extent_buffer *buf);
+int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid);
 int btrfs_set_buffer_uptodate(struct extent_buffer *buf);
 int wait_on_tree_block_writeback(struct btrfs_root *root,
 				 struct extent_buffer *buf);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index db07dde4a870..605018c6045c 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1366,7 +1366,7 @@ static int pin_down_bytes(struct btrfs_root *root, u64 bytenr, u32 num_bytes,
 	if (!pending) {
 		buf = btrfs_find_tree_block(root, bytenr, num_bytes);
 		if (buf) {
-			if (btrfs_buffer_uptodate(buf)) {
+			if (btrfs_buffer_uptodate(buf, 0)) {
 				u64 transid =
 				    root->fs_info->running_transaction->transid;
 				u64 header_transid =
@@ -2151,7 +2151,7 @@ static int noinline walk_down_tree(struct btrfs_trans_handle *trans,
 			continue;
 		}
 		next = btrfs_find_tree_block(root, bytenr, blocksize);
-		if (!next || !btrfs_buffer_uptodate(next)) {
+		if (!next || !btrfs_buffer_uptodate(next, ptr_gen)) {
 			free_extent_buffer(next);
 			reada_walk_down(root, cur, path->slots[*level]);
 
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index dd403b426ff5..2a3624adc0cf 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1366,7 +1366,7 @@ static int end_bio_extent_writepage(struct bio *bio,
 				   unsigned int bytes_done, int err)
 #endif
 {
-	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+	int uptodate = err == 0;
 	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
 	struct extent_state *state = bio->bi_private;
 	struct extent_io_tree *tree = state->tree;
@@ -1375,6 +1375,7 @@ static int end_bio_extent_writepage(struct bio *bio,
 	u64 end;
 	u64 cur;
 	int whole_page;
+	int ret;
 	unsigned long flags;
 
 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
@@ -1395,17 +1396,30 @@ static int end_bio_extent_writepage(struct bio *bio,
 		if (--bvec >= bio->bi_io_vec)
 			prefetchw(&bvec->bv_page->flags);
 
+		if (tree->ops && tree->ops->writepage_end_io_hook) {
+			ret = tree->ops->writepage_end_io_hook(page, start,
+						       end, state);
+			if (ret)
+				uptodate = 0;
+		}
+
+		if (!uptodate && tree->ops &&
+		    tree->ops->writepage_io_failed_hook) {
+			ret = tree->ops->writepage_io_failed_hook(bio, page,
+							 start, end, state);
+			if (ret == 0) {
+				state = NULL;
+				uptodate = (err == 0);
+				continue;
+			}
+		}
+
 		if (!uptodate) {
 			clear_extent_uptodate(tree, start, end, GFP_ATOMIC);
 			ClearPageUptodate(page);
 			SetPageError(page);
 		}
 
-		if (tree->ops && tree->ops->writepage_end_io_hook) {
-			tree->ops->writepage_end_io_hook(page, start, end,
-							 state);
-		}
-
 		/*
 		 * bios can get merged in funny ways, and so we need to
 		 * be careful with the state variable.  We know the
@@ -2073,9 +2087,9 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 		} else {
 			ret = 0;
 		}
-		if (ret)
+		if (ret) {
 			SetPageError(page);
-		else {
+		} else {
 			unsigned long max_nr = end_index + 1;
 			set_range_writeback(tree, cur, cur + iosize - 1);
 			if (!PageWriteback(page)) {
@@ -2948,6 +2962,25 @@ int set_extent_buffer_dirty(struct extent_io_tree *tree,
 }
 EXPORT_SYMBOL(set_extent_buffer_dirty);
 
+/*
+ * clear the uptodate state of an extent buffer: the flag on the eb itself,
+ * the range it covers in the io tree and the flag on each of its pages
+ */
+int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
+				struct extent_buffer *eb)
+{
+	unsigned long nr = num_extent_pages(eb->start, eb->len);
+	u64 last_byte = eb->start + eb->len - 1;
+	unsigned long idx;
+
+	eb->flags &= ~EXTENT_UPTODATE;
+	clear_extent_uptodate(tree, eb->start, last_byte, GFP_NOFS);
+
+	for (idx = 0; idx < nr; idx++)
+		ClearPageUptodate(extent_buffer_page(eb, idx));
+	return 0;
+}
+
 int set_extent_buffer_uptodate(struct extent_io_tree *tree,
 				struct extent_buffer *eb)
 {
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index e48346147514..f1960dafaa19 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -36,9 +36,12 @@ struct extent_io_ops {
 	int (*readpage_io_failed_hook)(struct bio *bio, struct page *page,
 				       u64 start, u64 end,
 				       struct extent_state *state);
+	int (*writepage_io_failed_hook)(struct bio *bio, struct page *page,
+					u64 start, u64 end,
+				       struct extent_state *state);
 	int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end,
 				    struct extent_state *state);
-	void (*writepage_end_io_hook)(struct page *page, u64 start, u64 end,
+	int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end,
 				      struct extent_state *state);
 	int (*set_bit_hook)(struct inode *inode, u64 start, u64 end,
 			    unsigned long old, unsigned long bits);
@@ -212,6 +215,8 @@ int set_extent_buffer_dirty(struct extent_io_tree *tree,
 			     struct extent_buffer *eb);
 int set_extent_buffer_uptodate(struct extent_io_tree *tree,
 			       struct extent_buffer *eb);
+int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
+				struct extent_buffer *eb);
 int extent_buffer_uptodate(struct extent_io_tree *tree,
 			   struct extent_buffer *eb);
 int map_extent_buffer(struct extent_buffer *eb, unsigned long offset,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index a492fd238c88..08760ff9bab7 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -430,9 +430,9 @@ struct io_failure_record {
 	int last_mirror;
 };
 
-int btrfs_readpage_io_failed_hook(struct bio *failed_bio,
-				  struct page *page, u64 start, u64 end,
-				  struct extent_state *state)
+int btrfs_io_failed_hook(struct bio *failed_bio,
+			 struct page *page, u64 start, u64 end,
+			 struct extent_state *state)
 {
 	struct io_failure_record *failrec = NULL;
 	u64 private;
@@ -443,6 +443,7 @@ int btrfs_readpage_io_failed_hook(struct bio *failed_bio,
 	struct bio *bio;
 	int num_copies;
 	int ret;
+	int rw;
 	u64 logical;
 
 	ret = get_state_private(failure_tree, start, &private);
@@ -505,7 +506,41 @@ int btrfs_readpage_io_failed_hook(struct bio *failed_bio,
 	bio->bi_bdev = failed_bio->bi_bdev;
 	bio->bi_size = 0;
 	bio_add_page(bio, page, failrec->len, start - page_offset(page));
-	btrfs_submit_bio_hook(inode, READ, bio, failrec->last_mirror);
+	if (failed_bio->bi_rw & (1 << BIO_RW))
+		rw = WRITE;
+	else
+		rw = READ;
+
+	BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio,
+						      failrec->last_mirror);
+	return 0;
+}
+
+int btrfs_clean_io_failures(struct inode *inode, u64 start)
+{
+	u64 private;
+	u64 private_failure;
+	struct io_failure_record *failure;
+	int ret;
+
+	private = 0;
+	if (count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
+			     (u64)-1, 1, EXTENT_DIRTY)) {
+		ret = get_state_private(&BTRFS_I(inode)->io_failure_tree,
+					start, &private_failure);
+		if (ret == 0) {
+			failure = (struct io_failure_record *)(unsigned long)
+				   private_failure;
+			set_state_private(&BTRFS_I(inode)->io_failure_tree,
+					  failure->start, 0);
+			clear_extent_bits(&BTRFS_I(inode)->io_failure_tree,
+					  failure->start,
+					  failure->start + failure->len - 1,
+					  EXTENT_DIRTY | EXTENT_LOCKED,
+					  GFP_NOFS);
+			kfree(failure);
+		}
+	}
 	return 0;
 }
 
@@ -547,26 +582,7 @@ int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 	/* if the io failure tree for this inode is non-empty,
 	 * check to see if we've recovered from a failed IO
 	 */
-	private = 0;
-	if (count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
-			     (u64)-1, 1, EXTENT_DIRTY)) {
-		u64 private_failure;
-		struct io_failure_record *failure;
-		ret = get_state_private(&BTRFS_I(inode)->io_failure_tree,
-					start, &private_failure);
-		if (ret == 0) {
-			failure = (struct io_failure_record *)(unsigned long)
-				   private_failure;
-			set_state_private(&BTRFS_I(inode)->io_failure_tree,
-					  failure->start, 0);
-			clear_extent_bits(&BTRFS_I(inode)->io_failure_tree,
-					  failure->start,
-					  failure->start + failure->len - 1,
-					  EXTENT_DIRTY | EXTENT_LOCKED,
-					  GFP_NOFS);
-			kfree(failure);
-		}
-	}
+	btrfs_clean_io_failures(inode, start);
 	return 0;
 
 zeroit:
@@ -3657,7 +3673,7 @@ static struct extent_io_ops btrfs_extent_io_ops = {
 	.merge_bio_hook = btrfs_merge_bio_hook,
 	.readpage_io_hook = btrfs_readpage_io_hook,
 	.readpage_end_io_hook = btrfs_readpage_end_io_hook,
-	.readpage_io_failed_hook = btrfs_readpage_io_failed_hook,
+	.readpage_io_failed_hook = btrfs_io_failed_hook,
 	.set_bit_hook = btrfs_set_bit_hook,
 	.clear_bit_hook = btrfs_clear_bit_hook,
 };
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index 5085e9e693b9..c02e2bf2f028 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -51,6 +51,7 @@ static int defrag_walk_down(struct btrfs_trans_handle *trans,
 	struct extent_buffer *next;
 	struct extent_buffer *cur;
 	u64 bytenr;
+	u64 ptr_gen;
 	int ret = 0;
 	int is_extent = 0;
 
@@ -93,11 +94,12 @@ static int defrag_walk_down(struct btrfs_trans_handle *trans,
 			break;
 		}
 		bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
+		ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
 
 		if (cache_only) {
 			next = btrfs_find_tree_block(root, bytenr,
 					   btrfs_level_size(root, *level - 1));
-			if (!next || !btrfs_buffer_uptodate(next) ||
+			if (!next || !btrfs_buffer_uptodate(next, ptr_gen) ||
 			    !btrfs_buffer_defrag(next)) {
 				free_extent_buffer(next);
 				path->slots[*level]++;
@@ -106,8 +108,7 @@ static int defrag_walk_down(struct btrfs_trans_handle *trans,
 		} else {
 			next = read_tree_block(root, bytenr,
 				       btrfs_level_size(root, *level - 1),
-				       btrfs_node_ptr_generation(cur,
-							 path->slots[*level]));
+				       ptr_gen);
 		}
 		ret = btrfs_cow_block(trans, root, next, path->nodes[*level],
 				      path->slots[*level], &next);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index b5d7bd1915b4..5fc7fb481474 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1807,14 +1807,19 @@ static int end_bio_multi_stripe(struct bio *bio,
 	if (atomic_dec_and_test(&multi->stripes_pending)) {
 		bio->bi_private = multi->private;
 		bio->bi_end_io = multi->end_io;
-
 		/* only send an error to the higher layers if it is
 		 * beyond the tolerance of the multi-bio
 		 */
-		if (atomic_read(&multi->error) > multi->max_errors)
+		if (atomic_read(&multi->error) > multi->max_errors) {
 			err = -EIO;
-		else
+		} else if (err) {
+			/*
+			 * this bio is actually up to date, we didn't
+			 * go over the max number of errors
+			 */
+			set_bit(BIO_UPTODATE, &bio->bi_flags);
 			err = 0;
+		}
 		kfree(multi);
 
 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
-- 
cgit v1.2.3


From dfe25020689bb2d318782d2c9c7141203583fc70 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 13 May 2008 13:46:40 -0400
Subject: Btrfs: Add mount -o degraded to allow mounts to continue with missing
 devices

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h   |   3 +
 fs/btrfs/disk-io.c |  49 ++++++----
 fs/btrfs/disk-io.h |   3 +-
 fs/btrfs/super.c   |  25 +++--
 fs/btrfs/volumes.c | 279 ++++++++++++++++++++++++++++++++++++++---------------
 fs/btrfs/volumes.h |   3 +
 6 files changed, 253 insertions(+), 109 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 6c2c2c4e4d2a..a9377fb16c37 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -657,6 +657,7 @@ struct btrfs_root {
 #define BTRFS_MOUNT_NODATACOW		(1 << 1)
 #define BTRFS_MOUNT_NOBARRIER		(1 << 2)
 #define BTRFS_MOUNT_SSD			(1 << 3)
+#define BTRFS_MOUNT_DEGRADED		(1 << 4)
 
 #define btrfs_clear_opt(o, opt)		((o) &= ~BTRFS_MOUNT_##opt)
 #define btrfs_set_opt(o, opt)		((o) |= BTRFS_MOUNT_##opt)
@@ -1606,4 +1607,6 @@ int btrfs_delete_xattrs(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root, struct inode *inode);
 /* super.c */
 u64 btrfs_parse_size(char *str);
+int btrfs_parse_options(char *options, struct btrfs_root *root,
+			char **subvol_name);
 #endif
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 574b1245964e..38b0d9ecda6a 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -614,21 +614,6 @@ int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
 	return ret;
 }
 
-static int close_all_devices(struct btrfs_fs_info *fs_info)
-{
-	struct list_head *list;
-	struct list_head *next;
-	struct btrfs_device *device;
-
-	list = &fs_info->fs_devices->devices;
-	list_for_each(next, list) {
-		device = list_entry(next, struct btrfs_device, dev_list);
-		close_bdev_excl(device->bdev);
-		device->bdev = NULL;
-	}
-	return 0;
-}
-
 struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
 					    u64 bytenr, u32 blocksize)
 {
@@ -927,6 +912,8 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits)
 
 	list_for_each(cur, &info->fs_devices->devices) {
 		device = list_entry(cur, struct btrfs_device, dev_list);
+		if (!device->bdev)
+			continue;
 		bdi = blk_get_backing_dev_info(device->bdev);
 		if (bdi && bdi_congested(bdi, bdi_bits)) {
 			ret = 1;
@@ -1140,7 +1127,8 @@ static void btrfs_async_submit_work(struct work_struct *work)
 }
 
 struct btrfs_root *open_ctree(struct super_block *sb,
-			      struct btrfs_fs_devices *fs_devices)
+			      struct btrfs_fs_devices *fs_devices,
+			      char *options)
 {
 	u32 sectorsize;
 	u32 nodesize;
@@ -1276,12 +1264,19 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	if (!btrfs_super_root(disk_super))
 		goto fail_sb_buffer;
 
-	if (btrfs_super_num_devices(disk_super) != fs_devices->num_devices) {
+	btrfs_parse_options(options, tree_root, NULL);
+
+	if (btrfs_super_num_devices(disk_super) > fs_devices->num_devices) {
 		printk("Btrfs: wanted %llu devices, but found %llu\n",
 		       (unsigned long long)btrfs_super_num_devices(disk_super),
 		       (unsigned long long)fs_devices->num_devices);
-		goto fail_sb_buffer;
+		if (btrfs_test_opt(tree_root, DEGRADED))
+			printk("continuing in degraded mode\n");
+		else {
+			goto fail_sb_buffer;
+		}
 	}
+
 	fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
 
 	nodesize = btrfs_super_nodesize(disk_super);
@@ -1329,6 +1324,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	ret = btrfs_read_chunk_tree(chunk_root);
 	BUG_ON(ret);
 
+	btrfs_close_extra_devices(fs_devices);
+
 	blocksize = btrfs_level_size(tree_root,
 				     btrfs_super_root_level(disk_super));
 
@@ -1374,7 +1371,7 @@ fail_sb_buffer:
 fail_iput:
 	iput(fs_info->btree_inode);
 fail:
-	close_all_devices(fs_info);
+	btrfs_close_devices(fs_info->fs_devices);
 	btrfs_mapping_tree_free(&fs_info->mapping_tree);
 
 	kfree(extent_root);
@@ -1429,6 +1426,13 @@ int write_all_supers(struct btrfs_root *root)
 	dev_item = &sb->dev_item;
 	list_for_each(cur, head) {
 		dev = list_entry(cur, struct btrfs_device, dev_list);
+		if (!dev->bdev) {
+			total_errors++;
+			continue;
+		}
+		if (!dev->in_fs_metadata)
+			continue;
+
 		btrfs_set_stack_device_type(dev_item, dev->type);
 		btrfs_set_stack_device_id(dev_item, dev->devid);
 		btrfs_set_stack_device_total_bytes(dev_item, dev->total_bytes);
@@ -1482,6 +1486,11 @@ int write_all_supers(struct btrfs_root *root)
 
 	list_for_each(cur, head) {
 		dev = list_entry(cur, struct btrfs_device, dev_list);
+		if (!dev->bdev)
+			continue;
+		if (!dev->in_fs_metadata)
+			continue;
+
 		BUG_ON(!dev->pending_io);
 		bh = dev->pending_io;
 		wait_on_buffer(bh);
@@ -1631,7 +1640,7 @@ int close_ctree(struct btrfs_root *root)
 		kfree(hasher);
 	}
 #endif
-	close_all_devices(fs_info);
+	btrfs_close_devices(fs_info->fs_devices);
 	btrfs_mapping_tree_free(&fs_info->mapping_tree);
 
 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,23)
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 30d1ed293c25..2bc64fefe6ea 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -33,7 +33,8 @@ struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
 int clean_tree_block(struct btrfs_trans_handle *trans,
 		     struct btrfs_root *root, struct extent_buffer *buf);
 struct btrfs_root *open_ctree(struct super_block *sb,
-			      struct btrfs_fs_devices *fs_devices);
+			      struct btrfs_fs_devices *fs_devices,
+			      char *options);
 int close_ctree(struct btrfs_root *root);
 int write_ctree_super(struct btrfs_trans_handle *trans,
 		      struct btrfs_root *root);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 020e5a83e31f..273a5b511f50 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -65,11 +65,13 @@ static void btrfs_put_super (struct super_block * sb)
 }
 
 enum {
-	Opt_subvol, Opt_nodatasum, Opt_nodatacow, Opt_max_extent,
-	Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd, Opt_err,
+	Opt_degraded, Opt_subvol, Opt_nodatasum, Opt_nodatacow,
+	Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier,
+	Opt_ssd, Opt_err,
 };
 
 static match_table_t tokens = {
+	{Opt_degraded, "degraded"},
 	{Opt_subvol, "subvol=%s"},
 	{Opt_nodatasum, "nodatasum"},
 	{Opt_nodatacow, "nodatacow"},
@@ -106,9 +108,8 @@ u64 btrfs_parse_size(char *str)
 	return res;
 }
 
-static int parse_options (char * options,
-			  struct btrfs_root *root,
-			  char **subvol_name)
+int btrfs_parse_options(char *options, struct btrfs_root *root,
+			char **subvol_name)
 {
 	char * p;
 	struct btrfs_fs_info *info = NULL;
@@ -135,6 +136,12 @@ static int parse_options (char * options,
 
 		token = match_token(p, tokens, args);
 		switch (token) {
+		case Opt_degraded:
+			if (info) {
+				printk("btrfs: allowing degraded mounts\n");
+				btrfs_set_opt(info->mount_opt, DEGRADED);
+			}
+			break;
 		case Opt_subvol:
 			if (subvol_name) {
 				*subvol_name = match_strdup(&args[0]);
@@ -234,7 +241,7 @@ static int btrfs_fill_super(struct super_block * sb,
 	sb->s_xattr = btrfs_xattr_handlers;
 	sb->s_time_gran = 1;
 
-	tree_root = open_ctree(sb, fs_devices);
+	tree_root = open_ctree(sb, fs_devices, (char *)data);
 
 	if (IS_ERR(tree_root)) {
 		printk("btrfs: open_ctree failed\n");
@@ -267,8 +274,6 @@ static int btrfs_fill_super(struct super_block * sb,
 		goto fail_close;
 	}
 
-	parse_options((char *)data, tree_root, NULL);
-
 	/* this does the super kobj at the same time */
 	err = btrfs_sysfs_add_super(tree_root->fs_info);
 	if (err)
@@ -341,7 +346,7 @@ int btrfs_get_sb_bdev(struct file_system_type *fs_type,
 	if (error)
 		return error;
 
-	bdev = fs_devices->lowest_bdev;
+	bdev = fs_devices->latest_bdev;
 	btrfs_lock_volumes();
 	s = sget(fs_type, btrfs_test_super, set_anon_super, fs_devices);
 	btrfs_unlock_volumes();
@@ -411,7 +416,7 @@ static int btrfs_get_sb(struct file_system_type *fs_type,
 	int ret;
 	char *subvol_name = NULL;
 
-	parse_options((char *)data, NULL, &subvol_name);
+	btrfs_parse_options((char *)data, NULL, &subvol_name);
 	ret = btrfs_get_sb_bdev(fs_type, flags, dev_name, data, mnt,
 			subvol_name ? subvol_name : "default");
 	if (subvol_name)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 5fc7fb481474..43f74d17bcea 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -73,6 +73,7 @@ int btrfs_cleanup_fs_uuids(void)
 				close_bdev_excl(dev->bdev);
 			}
 			list_del(&dev->dev_list);
+			kfree(dev->name);
 			kfree(dev);
 		}
 	}
@@ -127,7 +128,6 @@ static int device_list_add(const char *path,
 		memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
 		fs_devices->latest_devid = devid;
 		fs_devices->latest_trans = found_transid;
-		fs_devices->lowest_devid = (u64)-1;
 		fs_devices->num_devices = 0;
 		device = NULL;
 	} else {
@@ -159,13 +159,35 @@ static int device_list_add(const char *path,
 		fs_devices->latest_devid = devid;
 		fs_devices->latest_trans = found_transid;
 	}
-	if (fs_devices->lowest_devid > devid) {
-		fs_devices->lowest_devid = devid;
-	}
 	*fs_devices_ret = fs_devices;
 	return 0;
 }
 
+/* close and free every device on the list that is not in_fs_metadata */
+int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices)
+{
+	struct list_head *head = &fs_devices->devices;
+	struct list_head *cur;
+	struct list_head *next;
+	struct btrfs_device *device;
+
+	mutex_lock(&uuid_mutex);
+	list_for_each_safe(cur, next, head) {
+		device = list_entry(cur, struct btrfs_device, dev_list);
+		if (device->in_fs_metadata)
+			continue;
+		printk("getting rid of extra dev %s\n", device->name);
+		if (device->bdev)
+			close_bdev_excl(device->bdev);
+		list_del(&device->dev_list);
+		list_del(&device->dev_alloc_list);
+		fs_devices->num_devices--;
+		kfree(device->name);
+		kfree(device);
+	}
+	mutex_unlock(&uuid_mutex);
+	return 0;
+}
 int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
 {
 	struct list_head *head = &fs_devices->devices;
@@ -179,6 +201,7 @@ int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
 			close_bdev_excl(device->bdev);
 		}
 		device->bdev = NULL;
+		device->in_fs_metadata = 0;
 	}
 	mutex_unlock(&uuid_mutex);
 	return 0;
@@ -199,6 +222,9 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 		if (device->bdev)
 			continue;
 
+		if (!device->name)
+			continue;
+
 		bdev = open_bdev_excl(device->name, flags, holder);
 
 		if (IS_ERR(bdev)) {
@@ -209,10 +235,8 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 		set_blocksize(bdev, 4096);
 		if (device->devid == fs_devices->latest_devid)
 			fs_devices->latest_bdev = bdev;
-		if (device->devid == fs_devices->lowest_devid) {
-			fs_devices->lowest_bdev = bdev;
-		}
 		device->bdev = bdev;
+		device->in_fs_metadata = 0;
 
 	}
 	mutex_unlock(&uuid_mutex);
@@ -439,7 +463,8 @@ int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
 	}
 	BUG_ON(ret);
 
-	device->bytes_used -= btrfs_dev_extent_length(leaf, extent);
+	if (device->bytes_used > 0)
+		device->bytes_used -= btrfs_dev_extent_length(leaf, extent);
 	ret = btrfs_del_item(trans, root, path);
 	BUG_ON(ret);
 
@@ -460,6 +485,7 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
 	struct extent_buffer *leaf;
 	struct btrfs_key key;
 
+	WARN_ON(!device->in_fs_metadata);
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
@@ -674,8 +700,6 @@ static int btrfs_rm_dev_item(struct btrfs_root *root,
 
 	next_dev = list_entry(fs_devices->devices.next, struct btrfs_device,
 			      dev_list);
-	if (bdev == fs_devices->lowest_bdev)
-		fs_devices->lowest_bdev = next_dev->bdev;
 	if (bdev == root->fs_info->sb->s_bdev)
 		root->fs_info->sb->s_bdev = next_dev->bdev;
 	if (bdev == fs_devices->latest_bdev)
@@ -698,7 +722,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 {
 	struct btrfs_device *device;
 	struct block_device *bdev;
-	struct buffer_head *bh;
+	struct buffer_head *bh = NULL;
 	struct btrfs_super_block *disk_super;
 	u64 all_avail;
 	u64 devid;
@@ -712,47 +736,73 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 		root->fs_info->avail_metadata_alloc_bits;
 
 	if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) &&
-	    root->fs_info->fs_devices->num_devices <= 4) {
+	    btrfs_super_num_devices(&root->fs_info->super_copy) <= 4) {
 		printk("btrfs: unable to go below four devices on raid10\n");
 		ret = -EINVAL;
 		goto out;
 	}
 
 	if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) &&
-	    root->fs_info->fs_devices->num_devices <= 2) {
+	    btrfs_super_num_devices(&root->fs_info->super_copy) <= 2) {
 		printk("btrfs: unable to go below two devices on raid1\n");
 		ret = -EINVAL;
 		goto out;
 	}
 
-	bdev = open_bdev_excl(device_path, 0, root->fs_info->bdev_holder);
-	if (IS_ERR(bdev)) {
-		ret = PTR_ERR(bdev);
-		goto out;
-	}
+	if (strcmp(device_path, "missing") == 0) {
+		struct list_head *cur;
+		struct list_head *devices;
+		struct btrfs_device *tmp;
 
-	bh = __bread(bdev, BTRFS_SUPER_INFO_OFFSET / 4096, 4096);
-	if (!bh) {
-		ret = -EIO;
-		goto error_close;
-	}
-	disk_super = (struct btrfs_super_block *)bh->b_data;
-	if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC,
-	    sizeof(disk_super->magic))) {
-		ret = -ENOENT;
-		goto error_brelse;
-	}
-	if (memcmp(disk_super->fsid, root->fs_info->fsid, BTRFS_FSID_SIZE)) {
-		ret = -ENOENT;
-		goto error_brelse;
-	}
-	devid = le64_to_cpu(disk_super->dev_item.devid);
-	device = btrfs_find_device(root, devid, NULL);
-	if (!device) {
-		ret = -ENOENT;
-		goto error_brelse;
-	}
+		device = NULL;
+		devices = &root->fs_info->fs_devices->devices;
+		list_for_each(cur, devices) {
+			tmp = list_entry(cur, struct btrfs_device, dev_list);
+			if (tmp->in_fs_metadata && !tmp->bdev) {
+				device = tmp;
+				break;
+			}
+		}
+		bdev = NULL;
+		bh = NULL;
+		disk_super = NULL;
+		if (!device) {
+			printk("btrfs: no missing devices found to remove\n");
+			goto out;
+		}
+
+	} else {
+		bdev = open_bdev_excl(device_path, 0,
+				      root->fs_info->bdev_holder);
+		if (IS_ERR(bdev)) {
+			ret = PTR_ERR(bdev);
+			goto out;
+		}
+
+		bh = __bread(bdev, BTRFS_SUPER_INFO_OFFSET / 4096, 4096);
+		if (!bh) {
+			ret = -EIO;
+			goto error_close;
+		}
+		disk_super = (struct btrfs_super_block *)bh->b_data;
+		if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC,
+		    sizeof(disk_super->magic))) {
+			ret = -ENOENT;
+			goto error_brelse;
+		}
+		if (memcmp(disk_super->fsid, root->fs_info->fsid,
+			   BTRFS_FSID_SIZE)) {
+			ret = -ENOENT;
+			goto error_brelse;
+		}
+		devid = le64_to_cpu(disk_super->dev_item.devid);
+		device = btrfs_find_device(root, devid, NULL);
+		if (!device) {
+			ret = -ENOENT;
+			goto error_brelse;
+		}
 
+	}
 	root->fs_info->fs_devices->num_devices--;
 
 	ret = btrfs_shrink_device(device, 0);
@@ -764,19 +814,25 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 	if (ret)
 		goto error_brelse;
 
-	/* make sure this device isn't detected as part of the FS anymore */
-	memset(&disk_super->magic, 0, sizeof(disk_super->magic));
-	set_buffer_dirty(bh);
-	sync_dirty_buffer(bh);
-
-	brelse(bh);
-
-	/* one close for the device struct or super_block */
-	close_bdev_excl(device->bdev);
+	if (bh) {
+		/* make sure this device isn't detected as part of
+		 * the FS anymore
+		 */
+		memset(&disk_super->magic, 0, sizeof(disk_super->magic));
+		set_buffer_dirty(bh);
+		sync_dirty_buffer(bh);
 
-	/* one close for us */
-	close_bdev_excl(device->bdev);
+		brelse(bh);
+	}
 
+	if (device->bdev) {
+		/* one close for the device struct or super_block */
+		close_bdev_excl(device->bdev);
+	}
+	if (bdev) {
+		/* one close for us */
+		close_bdev_excl(bdev);
+	}
 	kfree(device->name);
 	kfree(device);
 	ret = 0;
@@ -785,7 +841,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 error_brelse:
 	brelse(bh);
 error_close:
-	close_bdev_excl(bdev);
+	if (bdev)
+		close_bdev_excl(bdev);
 out:
 	mutex_unlock(&uuid_mutex);
 	mutex_unlock(&root->fs_info->fs_mutex);
@@ -839,6 +896,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 	device->total_bytes = i_size_read(bdev->bd_inode);
 	device->dev_root = root->fs_info->dev_root;
 	device->bdev = bdev;
+	device->in_fs_metadata = 1;
 
 	ret = btrfs_add_device(trans, root, device);
 	if (ret)
@@ -1041,8 +1099,10 @@ int btrfs_relocate_chunk(struct btrfs_root *root,
 					    map->stripes[i].physical);
 		BUG_ON(ret);
 
-		ret = btrfs_update_device(trans, map->stripes[i].dev);
-		BUG_ON(ret);
+		if (map->stripes[i].dev) {
+			ret = btrfs_update_device(trans, map->stripes[i].dev);
+			BUG_ON(ret);
+		}
 	}
 	ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid,
 			       chunk_offset);
@@ -1415,10 +1475,13 @@ again:
 	while(index < num_stripes) {
 		device = list_entry(cur, struct btrfs_device, dev_alloc_list);
 
-		avail = device->total_bytes - device->bytes_used;
+		if (device->total_bytes > device->bytes_used)
+			avail = device->total_bytes - device->bytes_used;
+		else
+			avail = 0;
 		cur = cur->next;
 
-		if (avail >= min_free) {
+		if (device->in_fs_metadata && avail >= min_free) {
 			u64 ignored_start = 0;
 			ret = find_free_dev_extent(trans, device, path,
 						   min_free,
@@ -1430,7 +1493,7 @@ again:
 				if (type & BTRFS_BLOCK_GROUP_DUP)
 					index++;
 			}
-		} else if (avail > max_avail)
+		} else if (device->in_fs_metadata && avail > max_avail)
 			max_avail = avail;
 		if (cur == dev_list)
 			break;
@@ -1610,6 +1673,22 @@ int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
 	return ret;
 }
 
+/* pick an open mirror: 'optimal' if usable, else the first in the range */
+static int find_live_mirror(struct map_lookup *map, int first, int num,
+			    int optimal)
+{
+	int stripe = first;
+	int end = first + num;
+
+	if (map->stripes[optimal].dev->bdev)
+		return optimal;
+	for (; stripe < end; stripe++)
+		if (map->stripes[stripe].dev->bdev)
+			return stripe;
+	/* nothing is open, the io error handling will deal with 'optimal' */
+	return optimal;
+}
+
 static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 			     u64 logical, u64 *length,
 			     struct btrfs_multi_bio **multi_ret,
@@ -1712,8 +1791,11 @@ again:
 			num_stripes = map->num_stripes;
 		else if (mirror_num)
 			stripe_index = mirror_num - 1;
-		else
-			stripe_index = current->pid % map->num_stripes;
+		else {
+			stripe_index = find_live_mirror(map, 0,
+					    map->num_stripes,
+					    current->pid % map->num_stripes);
+		}
 
 	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
 		if (rw & (1 << BIO_RW))
@@ -1731,8 +1813,11 @@ again:
 			num_stripes = map->sub_stripes;
 		else if (mirror_num)
 			stripe_index += mirror_num - 1;
-		else
-			stripe_index += current->pid % map->sub_stripes;
+		else {
+			stripe_index = find_live_mirror(map, stripe_index,
+					      map->sub_stripes, stripe_index +
+					      current->pid % map->sub_stripes);
+		}
 	} else {
 		/*
 		 * after this do_div call, stripe_nr is the number of stripes
@@ -1749,9 +1834,11 @@ again:
 			struct backing_dev_info *bdi;
 
 			device = map->stripes[stripe_index].dev;
-			bdi = blk_get_backing_dev_info(device->bdev);
-			if (bdi->unplug_io_fn) {
-				bdi->unplug_io_fn(bdi, unplug_page);
+			if (device->bdev) {
+				bdi = blk_get_backing_dev_info(device->bdev);
+				if (bdi->unplug_io_fn) {
+					bdi->unplug_io_fn(bdi, unplug_page);
+				}
 			}
 		} else {
 			multi->stripes[i].physical =
@@ -1880,12 +1967,21 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 		}
 		bio->bi_sector = multi->stripes[dev_nr].physical >> 9;
 		dev = multi->stripes[dev_nr].dev;
-
-		bio->bi_bdev = dev->bdev;
-		spin_lock(&dev->io_lock);
-		dev->total_ios++;
-		spin_unlock(&dev->io_lock);
-		submit_bio(rw, bio);
+		if (dev && dev->bdev) {
+			bio->bi_bdev = dev->bdev;
+			spin_lock(&dev->io_lock);
+			dev->total_ios++;
+			spin_unlock(&dev->io_lock);
+			submit_bio(rw, bio);
+		} else {
+			bio->bi_bdev = root->fs_info->fs_devices->latest_bdev;
+			bio->bi_sector = logical >> 9;
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
+			bio_endio(bio, bio->bi_size, -EIO);
+#else
+			bio_endio(bio, -EIO);
+#endif
+		}
 		dev_nr++;
 	}
 	if (total_devs == 1)
@@ -1901,6 +1997,27 @@ struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
 	return __find_device(head, devid, uuid);
 }
 
+/* add a placeholder for a device that was not found; returns NULL on ENOMEM */
+static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
+					    u64 devid, u8 *dev_uuid)
+{
+	struct btrfs_device *device;
+	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
+
+	device = kzalloc(sizeof(*device), GFP_NOFS);
+	if (!device)
+		return NULL;	/* both callers already handle NULL */
+	list_add(&device->dev_list, &fs_devices->devices);
+	list_add(&device->dev_alloc_list, &fs_devices->alloc_list);
+	device->barriers = 1;
+	device->dev_root = root->fs_info->dev_root;
+	device->devid = devid;
+	fs_devices->num_devices++;
+	spin_lock_init(&device->io_lock);
+	memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE);
+	return device;
+}
+
 static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
 			  struct extent_buffer *leaf,
 			  struct btrfs_chunk *chunk)
@@ -1965,11 +2082,22 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
 				   btrfs_stripe_dev_uuid_nr(chunk, i),
 				   BTRFS_UUID_SIZE);
 		map->stripes[i].dev = btrfs_find_device(root, devid, uuid);
-		if (!map->stripes[i].dev) {
+
+		if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) {
 			kfree(map);
 			free_extent_map(em);
 			return -EIO;
 		}
+		if (!map->stripes[i].dev) {
+			map->stripes[i].dev =
+				add_missing_dev(root, devid, uuid);
+			if (!map->stripes[i].dev) {
+				kfree(map);
+				free_extent_map(em);
+				return -EIO;
+			}
+		}
+		map->stripes[i].dev->in_fs_metadata = 1;
 	}
 
 	spin_lock(&map_tree->map_tree.lock);
@@ -2016,20 +2144,15 @@ static int read_one_dev(struct btrfs_root *root,
 			   BTRFS_UUID_SIZE);
 	device = btrfs_find_device(root, devid, dev_uuid);
 	if (!device) {
-		printk("warning devid %Lu not found already\n", devid);
-		device = kzalloc(sizeof(*device), GFP_NOFS);
+		printk("warning devid %Lu missing\n", devid);
+		device = add_missing_dev(root, devid, dev_uuid);
 		if (!device)
 			return -ENOMEM;
-		list_add(&device->dev_list,
-			 &root->fs_info->fs_devices->devices);
-		list_add(&device->dev_alloc_list,
-			 &root->fs_info->fs_devices->alloc_list);
-		device->barriers = 1;
-		spin_lock_init(&device->io_lock);
 	}
 
 	fill_device_from_item(leaf, dev_item, device);
 	device->dev_root = root->fs_info->dev_root;
+	device->in_fs_metadata = 1;
 	ret = 0;
 #if 0
 	ret = btrfs_open_device(device);
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 0f94a69e6eb6..454fe8103329 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -27,8 +27,10 @@ struct btrfs_device {
 	struct list_head dev_alloc_list;
 	struct btrfs_root *dev_root;
 	struct buffer_head *pending_io;
+	u64 generation;
 
 	int barriers;
+	int in_fs_metadata;
 
 	spinlock_t io_lock;
 
@@ -122,6 +124,7 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 int btrfs_scan_one_device(const char *path, int flags, void *holder,
 			  struct btrfs_fs_devices **fs_devices_ret);
 int btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
+int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices);
 int btrfs_add_device(struct btrfs_trans_handle *trans,
 		     struct btrfs_root *root,
 		     struct btrfs_device *device);
-- 
cgit v1.2.3


From a0af469b58944f6e8c5c8ecbebb42997baf0cb9e Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 13 May 2008 16:03:06 -0400
Subject: Fix btrfs_open_devices to deal with changes since the scan ioctls

Devices can change after the scan ioctls are done, and btrfs_open_devices
needs to be able to verify them as they are opened and used by the FS.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c |  4 ++--
 fs/btrfs/volumes.c | 70 +++++++++++++++++++++++++++++++++++++++++++++---------
 fs/btrfs/volumes.h |  4 ++--
 3 files changed, 63 insertions(+), 15 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 38b0d9ecda6a..264f297260f8 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1266,10 +1266,10 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 
 	btrfs_parse_options(options, tree_root, NULL);
 
-	if (btrfs_super_num_devices(disk_super) > fs_devices->num_devices) {
+	if (btrfs_super_num_devices(disk_super) > fs_devices->open_devices) {
 		printk("Btrfs: wanted %llu devices, but found %llu\n",
 		       (unsigned long long)btrfs_super_num_devices(disk_super),
-		       (unsigned long long)fs_devices->num_devices);
+		       (unsigned long long)fs_devices->open_devices);
 		if (btrfs_test_opt(tree_root, DEGRADED))
 			printk("continuing in degraded mode\n");
 		else {
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 43f74d17bcea..501d23d3ebfd 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -71,6 +71,7 @@ int btrfs_cleanup_fs_uuids(void)
 					 dev_list);
 			if (dev->bdev) {
 				close_bdev_excl(dev->bdev);
+				fs_devices->open_devices--;
 			}
 			list_del(&dev->dev_list);
 			kfree(dev->name);
@@ -174,9 +175,10 @@ again:
 	list_for_each(cur, head) {
 		device = list_entry(cur, struct btrfs_device, dev_list);
 		if (!device->in_fs_metadata) {
-printk("getting rid of extra dev %s\n", device->name);
-			if (device->bdev)
+			if (device->bdev) {
 				close_bdev_excl(device->bdev);
+				fs_devices->open_devices--;
+			}
 			list_del(&device->dev_list);
 			list_del(&device->dev_alloc_list);
 			fs_devices->num_devices--;
@@ -188,6 +190,7 @@ printk("getting rid of extra dev %s\n", device->name);
 	mutex_unlock(&uuid_mutex);
 	return 0;
 }
+
 int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
 {
 	struct list_head *head = &fs_devices->devices;
@@ -199,10 +202,12 @@ int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
 		device = list_entry(cur, struct btrfs_device, dev_list);
 		if (device->bdev) {
 			close_bdev_excl(device->bdev);
+			fs_devices->open_devices--;
 		}
 		device->bdev = NULL;
 		device->in_fs_metadata = 0;
 	}
+	fs_devices->mounted = 0;
 	mutex_unlock(&uuid_mutex);
 	return 0;
 }
@@ -214,9 +219,19 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 	struct list_head *head = &fs_devices->devices;
 	struct list_head *cur;
 	struct btrfs_device *device;
-	int ret;
+	struct block_device *latest_bdev = NULL;
+	struct buffer_head *bh;
+	struct btrfs_super_block *disk_super;
+	u64 latest_devid = 0;
+	u64 latest_transid = 0;
+	u64 transid;
+	u64 devid;
+	int ret = 0;
 
 	mutex_lock(&uuid_mutex);
+	if (fs_devices->mounted)
+		goto out;
+
 	list_for_each(cur, head) {
 		device = list_entry(cur, struct btrfs_device, dev_list);
 		if (device->bdev)
@@ -229,21 +244,52 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 
 		if (IS_ERR(bdev)) {
 			printk("open %s failed\n", device->name);
-			ret = PTR_ERR(bdev);
-			goto fail;
+			goto error;
 		}
 		set_blocksize(bdev, 4096);
-		if (device->devid == fs_devices->latest_devid)
-			fs_devices->latest_bdev = bdev;
+
+		bh = __bread(bdev, BTRFS_SUPER_INFO_OFFSET / 4096, 4096);
+		if (!bh)
+			goto error_close;
+
+		disk_super = (struct btrfs_super_block *)bh->b_data;
+		if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC,
+		    sizeof(disk_super->magic)))
+			goto error_brelse;
+
+		devid = le64_to_cpu(disk_super->dev_item.devid);
+		if (devid != device->devid)
+			goto error_brelse;
+
+		transid = btrfs_super_generation(disk_super);
+		if (transid > latest_transid) {
+			latest_devid = devid;
+			latest_transid = transid;
+			latest_bdev = bdev;
+		}
+
 		device->bdev = bdev;
 		device->in_fs_metadata = 0;
+		fs_devices->open_devices++;
+		continue;
 
+error_brelse:
+		brelse(bh);
+error_close:
+		close_bdev_excl(bdev);
+error:
+		continue;
 	}
+	if (fs_devices->open_devices == 0) {
+		ret = -EIO;
+		goto out;
+	}
+	fs_devices->mounted = 1;
+	fs_devices->latest_bdev = latest_bdev;
+	fs_devices->latest_devid = latest_devid;
+	fs_devices->latest_trans = latest_transid;
+out:
 	mutex_unlock(&uuid_mutex);
-	return 0;
-fail:
-	mutex_unlock(&uuid_mutex);
-	btrfs_close_devices(fs_devices);
 	return ret;
 }
 
@@ -828,6 +874,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 	if (device->bdev) {
 		/* one close for the device struct or super_block */
 		close_bdev_excl(device->bdev);
+		root->fs_info->fs_devices->open_devices--;
 	}
 	if (bdev) {
 		/* one close for us */
@@ -914,6 +961,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 	list_add(&device->dev_alloc_list,
 		 &root->fs_info->fs_devices->alloc_list);
 	root->fs_info->fs_devices->num_devices++;
+	root->fs_info->fs_devices->open_devices++;
 out:
 	btrfs_end_transaction(trans, root);
 	mutex_unlock(&root->fs_info->fs_mutex);
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 454fe8103329..4df6b1608f91 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -71,16 +71,16 @@ struct btrfs_fs_devices {
 	/* the device with this id has the most recent coyp of the super */
 	u64 latest_devid;
 	u64 latest_trans;
-	u64 lowest_devid;
 	u64 num_devices;
+	u64 open_devices;
 	struct block_device *latest_bdev;
-	struct block_device *lowest_bdev;
 	/* all of the devices in the FS */
 	struct list_head devices;
 
 	/* devices not currently being allocated */
 	struct list_head alloc_list;
 	struct list_head list;
+	int mounted;
 };
 
 struct btrfs_bio_stripe {
-- 
cgit v1.2.3


From 6af5ac3c1aede574acde5fc477f1a1c876fb568e Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 16 May 2008 13:14:57 -0400
Subject: Btrfs: Handle transid == 0 while opening devices

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 501d23d3ebfd..3fbf0834e884 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -262,7 +262,7 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 			goto error_brelse;
 
 		transid = btrfs_super_generation(disk_super);
-		if (transid > latest_transid) {
+		if (!latest_transid || transid > latest_transid) {
 			latest_devid = devid;
 			latest_transid = transid;
 			latest_bdev = bdev;
-- 
cgit v1.2.3


From 515dc3228c568f9df8b634f82b3471e20205ac16 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 16 May 2008 13:30:15 -0400
Subject: Btrfs: Use kzalloc on the fs_devices allocation

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 3fbf0834e884..333e5269577a 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -120,7 +120,7 @@ static int device_list_add(const char *path,
 
 	fs_devices = find_fsid(disk_super->fsid);
 	if (!fs_devices) {
-		fs_devices = kmalloc(sizeof(*fs_devices), GFP_NOFS);
+		fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
 		if (!fs_devices)
 			return -ENOMEM;
 		INIT_LIST_HEAD(&fs_devices->devices);
@@ -129,7 +129,6 @@ static int device_list_add(const char *path,
 		memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
 		fs_devices->latest_devid = devid;
 		fs_devices->latest_trans = found_transid;
-		fs_devices->num_devices = 0;
 		device = NULL;
 	} else {
 		device = __find_device(&fs_devices->devices, devid,
-- 
cgit v1.2.3


From 0ef3e66b6700eb8f052daa8b89443ff872fbbdfc Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Sat, 24 May 2008 14:04:53 -0400
Subject: Btrfs: Allocator fix variety pack

* Force chunk allocation when find_free_extent has to do a full scan
* Record the max key at the start of defrag so it doesn't run forever
* Block groups might not be contiguous, make a forward search for the
  next block group in extent-tree.c
* Get rid of extra checks for total fs size
* Fix relocate_one_reference to avoid relocating the same file data block
  twice when referenced by an older transaction
* Use the open device count when allocating chunks so that we don't
  try to allocate from devices that don't exist

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c       |   2 +-
 fs/btrfs/ctree.h       |   2 +
 fs/btrfs/extent-tree.c | 273 +++++++++++++++++++++++++++++++++----------------
 fs/btrfs/tree-defrag.c |  17 +++
 fs/btrfs/volumes.c     |  12 +--
 5 files changed, 209 insertions(+), 97 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 02e571e6ee62..1d404bde3cfa 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -363,7 +363,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 			other = btrfs_node_blockptr(parent, i - 1);
 			close = close_blocks(blocknr, other, blocksize);
 		}
-		if (close && i < end_slot - 2) {
+		if (!close && i < end_slot - 2) {
 			other = btrfs_node_blockptr(parent, i + 1);
 			close = close_blocks(blocknr, other, blocksize);
 		}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 293464c0c6ba..7b73a9c3d868 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -464,6 +464,7 @@ struct btrfs_space_info {
 	u64 bytes_used;
 	u64 bytes_pinned;
 	int full;
+	int force_alloc;
 	struct list_head list;
 };
 
@@ -589,6 +590,7 @@ struct btrfs_root {
 	int ref_cows;
 	int track_dirty;
 	struct btrfs_key defrag_progress;
+	struct btrfs_key defrag_max;
 	int defrag_running;
 	int defrag_level;
 	char *name;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 605018c6045c..41a63462d3eb 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -136,6 +136,35 @@ err:
 	return 0;
 }
 
+/*
+ * forward search of the block group cache, starting at bytenr or at the
+ * end of the super block area, whichever is larger.  Returns NULL when
+ * find_first_extent_bit or get_state_private reports a failure.
+ */
+struct btrfs_block_group_cache *btrfs_lookup_first_block_group(struct
+						       btrfs_fs_info *info,
+							 u64 bytenr)
+{
+	struct extent_io_tree *tree = &info->block_group_cache;
+	u64 min_bytenr = BTRFS_SUPER_INFO_OFFSET + BTRFS_SUPER_INFO_SIZE;
+	u64 first;
+	u64 last;
+	u64 priv;
+
+	if (find_first_extent_bit(tree, max_t(u64, bytenr, min_bytenr),
+				  &first, &last,
+				  BLOCK_GROUP_DATA | BLOCK_GROUP_METADATA |
+				  BLOCK_GROUP_SYSTEM))
+		return NULL;
+
+	if (get_state_private(tree, first, &priv))
+		return NULL;
+
+	/* the private value stored for the range is the block group pointer */
+	return (struct btrfs_block_group_cache *)(unsigned long)priv;
+}
+
+
 struct btrfs_block_group_cache *btrfs_lookup_block_group(struct
 							 btrfs_fs_info *info,
 							 u64 bytenr)
@@ -175,7 +204,7 @@ static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
 
 static int noinline find_search_start(struct btrfs_root *root,
 			      struct btrfs_block_group_cache **cache_ret,
-			      u64 *start_ret, int num, int data)
+			      u64 *start_ret, u64 num, int data)
 {
 	int ret;
 	struct btrfs_block_group_cache *cache = *cache_ret;
@@ -188,21 +217,21 @@ static int noinline find_search_start(struct btrfs_root *root,
 	u64 search_start = *start_ret;
 	int wrapped = 0;
 
-	if (!cache)
-		goto out;
-
 	total_fs_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
 	free_space_cache = &root->fs_info->free_space_cache;
 
+	if (!cache)
+		goto out;
+
 again:
 	ret = cache_block_group(root, cache);
-	if (ret)
+	if (ret) {
 		goto out;
+	}
 
 	last = max(search_start, cache->key.objectid);
-	if (!block_group_bits(cache, data) || cache->ro) {
+	if (!block_group_bits(cache, data) || cache->ro)
 		goto new_group;
-	}
 
 	spin_lock_irq(&free_space_cache->lock);
 	state = find_first_extent_bit_state(free_space_cache, last, EXTENT_DIRTY);
@@ -217,20 +246,17 @@ again:
 		start = max(last, state->start);
 		last = state->end + 1;
 		if (last - start < num) {
-			if (last == cache->key.objectid + cache->key.offset)
-				cache_miss = start;
 			do {
 				state = extent_state_next(state);
 			} while(state && !(state->state & EXTENT_DIRTY));
 			continue;
 		}
 		spin_unlock_irq(&free_space_cache->lock);
-		if (cache->ro)
+		if (cache->ro) {
 			goto new_group;
+		}
 		if (start + num > cache->key.objectid + cache->key.offset)
 			goto new_group;
-		if (start + num  > total_fs_bytes)
-			goto new_group;
 		if (!block_group_bits(cache, data)) {
 			printk("block group bits don't match %Lu %d\n", cache->flags, data);
 		}
@@ -248,7 +274,7 @@ out:
 new_group:
 	last = cache->key.objectid + cache->key.offset;
 wrapped:
-	cache = btrfs_lookup_block_group(root->fs_info, last);
+	cache = btrfs_lookup_first_block_group(root->fs_info, last);
 	if (!cache || cache->key.objectid >= total_fs_bytes) {
 no_cache:
 		if (!wrapped) {
@@ -261,13 +287,13 @@ no_cache:
 	if (cache_miss && !cache->cached) {
 		cache_block_group(root, cache);
 		last = cache_miss;
-		cache = btrfs_lookup_block_group(root->fs_info, last);
+		cache = btrfs_lookup_first_block_group(root->fs_info, last);
 	}
+	cache_miss = 0;
 	cache = btrfs_find_block_group(root, cache, last, data, 0);
 	if (!cache)
 		goto no_cache;
 	*cache_ret = cache;
-	cache_miss = 0;
 	goto again;
 }
 
@@ -303,28 +329,26 @@ struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
 	struct btrfs_fs_info *info = root->fs_info;
 	u64 used;
 	u64 last = 0;
-	u64 hint_last;
 	u64 start;
 	u64 end;
 	u64 free_check;
 	u64 ptr;
-	u64 total_fs_bytes;
 	int bit;
 	int ret;
 	int full_search = 0;
 	int factor = 10;
+	int wrapped = 0;
 
 	block_group_cache = &info->block_group_cache;
-	total_fs_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
 
 	if (data & BTRFS_BLOCK_GROUP_METADATA)
 		factor = 9;
 
 	bit = block_group_state_bits(data);
 
-	if (search_start && search_start < total_fs_bytes) {
+	if (search_start) {
 		struct btrfs_block_group_cache *shint;
-		shint = btrfs_lookup_block_group(info, search_start);
+		shint = btrfs_lookup_first_block_group(info, search_start);
 		if (shint && block_group_bits(shint, data) && !shint->ro) {
 			used = btrfs_block_group_used(&shint->item);
 			if (used + shint->pinned <
@@ -333,24 +357,18 @@ struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
 			}
 		}
 	}
-	if (hint && !hint->ro && block_group_bits(hint, data) &&
-	    hint->key.objectid < total_fs_bytes) {
+	if (hint && !hint->ro && block_group_bits(hint, data)) {
 		used = btrfs_block_group_used(&hint->item);
 		if (used + hint->pinned <
 		    div_factor(hint->key.offset, factor)) {
 			return hint;
 		}
 		last = hint->key.objectid + hint->key.offset;
-		hint_last = last;
 	} else {
 		if (hint)
-			hint_last = max(hint->key.objectid, search_start);
+			last = max(hint->key.objectid, search_start);
 		else
-			hint_last = search_start;
-
-		if (hint_last >= total_fs_bytes)
-			hint_last = search_start;
-		last = hint_last;
+			last = search_start;
 	}
 again:
 	while(1) {
@@ -360,23 +378,17 @@ again:
 			break;
 
 		ret = get_state_private(block_group_cache, start, &ptr);
-		if (ret)
-			break;
+		if (ret) {
+			last = end + 1;
+			continue;
+		}
 
 		cache = (struct btrfs_block_group_cache *)(unsigned long)ptr;
 		last = cache->key.objectid + cache->key.offset;
 		used = btrfs_block_group_used(&cache->item);
 
-		if (cache->key.objectid > total_fs_bytes)
-			break;
-
 		if (!cache->ro && block_group_bits(cache, data)) {
-			if (full_search)
-				free_check = cache->key.offset;
-			else
-				free_check = div_factor(cache->key.offset,
-							factor);
-
+			free_check = div_factor(cache->key.offset, factor);
 			if (used + cache->pinned < free_check) {
 				found_group = cache;
 				goto found;
@@ -384,9 +396,15 @@ again:
 		}
 		cond_resched();
 	}
-	if (!full_search) {
+	if (!wrapped) {
+		last = search_start;
+		wrapped = 1;
+		goto again;
+	}
+	if (!full_search && factor < 10) {
 		last = search_start;
 		full_search = 1;
+		factor = 10;
 		goto again;
 	}
 found:
@@ -1070,6 +1088,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
 	found->bytes_used = bytes_used;
 	found->bytes_pinned = 0;
 	found->full = 0;
+	found->force_alloc = 0;
 	*space_info = found;
 	return 0;
 }
@@ -1120,7 +1139,7 @@ static u64 reduce_alloc_profile(struct btrfs_root *root, u64 flags)
 
 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *extent_root, u64 alloc_bytes,
-			  u64 flags)
+			  u64 flags, int force)
 {
 	struct btrfs_space_info *space_info;
 	u64 thresh;
@@ -1138,11 +1157,16 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
 	}
 	BUG_ON(!space_info);
 
+	if (space_info->force_alloc) {
+		force = 1;
+		space_info->force_alloc = 0;
+	}
 	if (space_info->full)
 		return 0;
 
 	thresh = div_factor(space_info->total_bytes, 6);
-	if ((space_info->bytes_used + space_info->bytes_pinned + alloc_bytes) <
+	if (!force &&
+	   (space_info->bytes_used + space_info->bytes_pinned + alloc_bytes) <
 	    thresh)
 		return 0;
 
@@ -1152,7 +1176,6 @@ printk("space info full %Lu\n", flags);
 		space_info->full = 1;
 		return 0;
 	}
-
 	BUG_ON(ret);
 
 	ret = btrfs_make_block_group(trans, extent_root, 0, flags,
@@ -1619,11 +1642,16 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans,
 	struct btrfs_block_group_cache *block_group;
 	int full_scan = 0;
 	int wrapped = 0;
+	int chunk_alloc_done = 0;
 	int empty_cluster = 2 * 1024 * 1024;
+	int allowed_chunk_alloc = 0;
 
 	WARN_ON(num_bytes < root->sectorsize);
 	btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
 
+	if (orig_root->ref_cows || empty_size)
+		allowed_chunk_alloc = 1;
+
 	if (data & BTRFS_BLOCK_GROUP_METADATA) {
 		last_ptr = &root->fs_info->last_alloc;
 		empty_cluster = 256 * 1024;
@@ -1648,7 +1676,7 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans,
 		search_end = btrfs_super_total_bytes(&info->super_copy);
 
 	if (hint_byte) {
-		block_group = btrfs_lookup_block_group(info, hint_byte);
+		block_group = btrfs_lookup_first_block_group(info, hint_byte);
 		if (!block_group)
 			hint_byte = search_start;
 		block_group = btrfs_find_block_group(root, block_group,
@@ -1666,17 +1694,28 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans,
 
 check_failed:
 	if (!block_group) {
-		block_group = btrfs_lookup_block_group(info, search_start);
+		block_group = btrfs_lookup_first_block_group(info,
+							     search_start);
 		if (!block_group)
-			block_group = btrfs_lookup_block_group(info,
+			block_group = btrfs_lookup_first_block_group(info,
 						       orig_search_start);
 	}
+	if (full_scan && !chunk_alloc_done) {
+		if (allowed_chunk_alloc) {
+			do_chunk_alloc(trans, root,
+				     num_bytes + 2 * 1024 * 1024, data, 1);
+			allowed_chunk_alloc = 0;
+		} else if (block_group && block_group_bits(block_group, data)) {
+			block_group->space_info->force_alloc = 1;
+		}
+		chunk_alloc_done = 1;
+	}
 	ret = find_search_start(root, &block_group, &search_start,
 				total_needed, data);
 	if (ret == -ENOSPC && last_ptr && *last_ptr) {
 		*last_ptr = 0;
-		block_group = btrfs_lookup_block_group(info,
-						       orig_search_start);
+		block_group = btrfs_lookup_first_block_group(info,
+							     orig_search_start);
 		search_start = orig_search_start;
 		ret = find_search_start(root, &block_group, &search_start,
 					total_needed, data);
@@ -1692,7 +1731,7 @@ check_failed:
 			empty_size += empty_cluster;
 			total_needed += empty_size;
 		}
-		block_group = btrfs_lookup_block_group(info,
+		block_group = btrfs_lookup_first_block_group(info,
 						       orig_search_start);
 		search_start = orig_search_start;
 		ret = find_search_start(root, &block_group,
@@ -1765,7 +1804,7 @@ enospc:
 		} else
 			wrapped = 1;
 	}
-	block_group = btrfs_lookup_block_group(info, search_start);
+	block_group = btrfs_lookup_first_block_group(info, search_start);
 	cond_resched();
 	block_group = btrfs_find_block_group(root, block_group,
 					     search_start, data, 0);
@@ -1819,17 +1858,21 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 	}
 again:
 	data = reduce_alloc_profile(root, data);
-	if (root->ref_cows) {
+	/*
+	 * the only place that sets empty_size is btrfs_realloc_node, which
+	 * is not called recursively on allocations
+	 */
+	if (empty_size || root->ref_cows) {
 		if (!(data & BTRFS_BLOCK_GROUP_METADATA)) {
 			ret = do_chunk_alloc(trans, root->fs_info->extent_root,
-					     2 * 1024 * 1024,
-					     BTRFS_BLOCK_GROUP_METADATA |
-					     (info->metadata_alloc_profile &
-					      info->avail_metadata_alloc_bits));
+				     2 * 1024 * 1024,
+				     BTRFS_BLOCK_GROUP_METADATA |
+				     (info->metadata_alloc_profile &
+				      info->avail_metadata_alloc_bits), 0);
 			BUG_ON(ret);
 		}
 		ret = do_chunk_alloc(trans, root->fs_info->extent_root,
-				     num_bytes + 2 * 1024 * 1024, data);
+				     num_bytes + 2 * 1024 * 1024, data, 0);
 		BUG_ON(ret);
 	}
 
@@ -1842,6 +1885,8 @@ again:
 	if (ret == -ENOSPC && num_bytes > min_alloc_size) {
 		num_bytes = num_bytes >> 1;
 		num_bytes = max(num_bytes, min_alloc_size);
+		do_chunk_alloc(trans, root->fs_info->extent_root,
+			       num_bytes, data, 1);
 		goto again;
 	}
 	if (ret) {
@@ -2537,7 +2582,11 @@ out:
  */
 static int noinline relocate_one_reference(struct btrfs_root *extent_root,
 				  struct btrfs_path *path,
-				  struct btrfs_key *extent_key)
+				  struct btrfs_key *extent_key,
+				  u64 *last_file_objectid,
+				  u64 *last_file_offset,
+				  u64 *last_file_root,
+				  u64 last_extent)
 {
 	struct inode *inode;
 	struct btrfs_root *found_root;
@@ -2576,6 +2625,12 @@ static int noinline relocate_one_reference(struct btrfs_root *extent_root,
 		found_key.offset = ref_offset;
 		level = 0;
 
+		if (last_extent == extent_key->objectid &&
+		    *last_file_objectid == ref_objectid &&
+		    *last_file_offset == ref_offset &&
+		    *last_file_root == ref_root)
+			goto out;
+
 		ret = find_root_for_ref(extent_root, path, &found_key,
 					level, 1, &found_root,
 					extent_key->objectid);
@@ -2583,6 +2638,12 @@ static int noinline relocate_one_reference(struct btrfs_root *extent_root,
 		if (ret)
 			goto out;
 
+		if (last_extent == extent_key->objectid &&
+		    *last_file_objectid == ref_objectid &&
+		    *last_file_offset == ref_offset &&
+		    *last_file_root == ref_root)
+			goto out;
+
 		mutex_unlock(&extent_root->fs_info->fs_mutex);
 		inode = btrfs_iget_locked(extent_root->fs_info->sb,
 					  ref_objectid, found_root);
@@ -2603,6 +2664,10 @@ static int noinline relocate_one_reference(struct btrfs_root *extent_root,
 			mutex_lock(&extent_root->fs_info->fs_mutex);
 			goto out;
 		}
+		*last_file_objectid = inode->i_ino;
+		*last_file_root = found_root->root_key.objectid;
+		*last_file_offset = ref_offset;
+
 		relocate_inode_pages(inode, ref_offset, extent_key->offset);
 		iput(inode);
 		mutex_lock(&extent_root->fs_info->fs_mutex);
@@ -2643,6 +2708,8 @@ static int noinline relocate_one_reference(struct btrfs_root *extent_root,
 			path->nodes[i] = NULL;
 		}
 		btrfs_release_path(found_root, path);
+		if (found_root == found_root->fs_info->extent_root)
+			btrfs_extent_post_op(trans, found_root);
 		btrfs_end_transaction(trans, found_root);
 	}
 
@@ -2678,6 +2745,10 @@ static int noinline relocate_one_extent(struct btrfs_root *extent_root,
 	struct btrfs_key key;
 	struct btrfs_key found_key;
 	struct extent_buffer *leaf;
+	u64 last_file_objectid = 0;
+	u64 last_file_root = 0;
+	u64 last_file_offset = (u64)-1;
+	u64 last_extent = 0;
 	u32 nritems;
 	u32 item_size;
 	int ret = 0;
@@ -2722,9 +2793,13 @@ static int noinline relocate_one_extent(struct btrfs_root *extent_root,
 		key.offset = found_key.offset + 1;
 		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
 
-		ret = relocate_one_reference(extent_root, path, extent_key);
+		ret = relocate_one_reference(extent_root, path, extent_key,
+					     &last_file_objectid,
+					     &last_file_offset,
+					     &last_file_root, last_extent);
 		if (ret)
 			goto out;
+		last_extent = extent_key->objectid;
 	}
 	ret = 0;
 out:
@@ -2770,6 +2845,32 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
 	return flags;
 }
 
+/* pre-allocate chunk space for data leaving the block group being shrunk */
+static int __alloc_chunk_for_shrink(struct btrfs_root *root,
+		     struct btrfs_block_group_cache *shrink_block_group,
+		     int force)
+{
+	struct btrfs_trans_handle *trans;
+	u64 new_alloc_flags;
+	u64 calc;
+
+	if (btrfs_block_group_used(&shrink_block_group->item) == 0)
+		return 0;
+
+	trans = btrfs_start_transaction(root, 1);
+	new_alloc_flags = update_block_group_flags(root,
+						   shrink_block_group->flags);
+	/* flags changed: used bytes suffice, else size for the whole group */
+	if (new_alloc_flags != shrink_block_group->flags)
+		calc = btrfs_block_group_used(&shrink_block_group->item);
+	else
+		calc = shrink_block_group->key.offset;
+	do_chunk_alloc(trans, root->fs_info->extent_root,
+		       calc + 2 * 1024 * 1024, new_alloc_flags, force);
+	btrfs_end_transaction(trans, root);
+	return 0;
+}
+
 int btrfs_shrink_extent_tree(struct btrfs_root *root, u64 shrink_start)
 {
 	struct btrfs_trans_handle *trans;
@@ -2778,7 +2879,6 @@ int btrfs_shrink_extent_tree(struct btrfs_root *root, u64 shrink_start)
 	u64 cur_byte;
 	u64 total_found;
 	u64 shrink_last_byte;
-	u64 new_alloc_flags;
 	struct btrfs_block_group_cache *shrink_block_group;
 	struct btrfs_fs_info *info = root->fs_info;
 	struct btrfs_key key;
@@ -2792,7 +2892,8 @@ int btrfs_shrink_extent_tree(struct btrfs_root *root, u64 shrink_start)
 						      shrink_start);
 	BUG_ON(!shrink_block_group);
 
-	shrink_last_byte = shrink_start + shrink_block_group->key.offset;
+	shrink_last_byte = shrink_block_group->key.objectid +
+		shrink_block_group->key.offset;
 
 	shrink_block_group->space_info->total_bytes -=
 		shrink_block_group->key.offset;
@@ -2804,23 +2905,10 @@ int btrfs_shrink_extent_tree(struct btrfs_root *root, u64 shrink_start)
 	       (unsigned long long)shrink_start,
 	       (unsigned long long)shrink_block_group->flags);
 
+	__alloc_chunk_for_shrink(root, shrink_block_group, 1);
+
 again:
-	if (btrfs_block_group_used(&shrink_block_group->item) > 0) {
-		u64 calc;
 
-		trans = btrfs_start_transaction(root, 1);
-		new_alloc_flags = update_block_group_flags(root,
-						   shrink_block_group->flags);
-		if (new_alloc_flags != shrink_block_group->flags) {
-			calc =
-			     btrfs_block_group_used(&shrink_block_group->item);
-		} else {
-			calc = shrink_block_group->key.offset;
-		}
-		do_chunk_alloc(trans, root->fs_info->extent_root,
-			       calc + 2 * 1024 * 1024, new_alloc_flags);
-		btrfs_end_transaction(trans, root);
-	}
 	shrink_block_group->ro = 1;
 
 	total_found = 0;
@@ -2888,6 +2976,8 @@ next:
 
 		if (btrfs_key_type(&found_key) != BTRFS_EXTENT_ITEM_KEY ||
 		    found_key.objectid + found_key.offset <= cur_byte) {
+			memcpy(&key, &found_key, sizeof(key));
+			key.offset++;
 			path->slots[0]++;
 			goto next;
 		}
@@ -2897,6 +2987,7 @@ next:
 		key.objectid = cur_byte;
 		btrfs_release_path(root, path);
 		ret = relocate_one_extent(root, path, &found_key);
+		__alloc_chunk_for_shrink(root, shrink_block_group, 0);
 	}
 
 	btrfs_release_path(root, path);
@@ -2930,20 +3021,27 @@ next:
 	if (ret < 0)
 		goto out;
 
-	leaf = path->nodes[0];
-	nritems = btrfs_header_nritems(leaf);
-	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
-	kfree(shrink_block_group);
-
-	clear_extent_bits(&info->block_group_cache, found_key.objectid,
-			  found_key.objectid + found_key.offset - 1,
+	clear_extent_bits(&info->block_group_cache, key.objectid,
+			  key.objectid + key.offset - 1,
 			  (unsigned int)-1, GFP_NOFS);
 
+
+	clear_extent_bits(&info->free_space_cache,
+			   key.objectid, key.objectid + key.offset - 1,
+			   (unsigned int)-1, GFP_NOFS);
+
+	memset(shrink_block_group, 0, sizeof(*shrink_block_group));
+	kfree(shrink_block_group);
+
 	btrfs_del_item(trans, root, path);
-	clear_extent_dirty(&info->free_space_cache,
-			   shrink_start, shrink_last_byte - 1,
-			   GFP_NOFS);
 	btrfs_commit_transaction(trans, root);
+
+	/* the code to unpin extents might set a few bits in the free
+	 * space cache for this range again
+	 */
+	clear_extent_bits(&info->free_space_cache,
+			   key.objectid, key.objectid + key.offset - 1,
+			   (unsigned int)-1, GFP_NOFS);
 out:
 	btrfs_free_path(path);
 	return ret;
@@ -3081,9 +3179,8 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 	BUG_ON(!cache);
 	cache->key.objectid = chunk_offset;
 	cache->key.offset = size;
-
 	btrfs_set_key_type(&cache->key, BTRFS_BLOCK_GROUP_ITEM_KEY);
-	memset(&cache->item, 0, sizeof(cache->item));
+
 	btrfs_set_block_group_used(&cache->item, bytes_used);
 	btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
 	cache->flags = type;
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index c02e2bf2f028..155961c7b4d5 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -198,6 +198,13 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 		goto out;
 	}
 	if (root->defrag_progress.objectid == 0) {
+		u32 nritems;
+
+		nritems = btrfs_header_nritems(root->node);
+		root->defrag_max.objectid = 0;
+		/* from above we know this is not a leaf */
+		btrfs_node_key_to_cpu(root->node, &root->defrag_max,
+				      nritems - 1);
 		extent_buffer_get(root->node);
 		ret = btrfs_cow_block(trans, root, root->node, NULL, 0, &tmp);
 		BUG_ON(ret);
@@ -254,6 +261,16 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 out:
 	if (path)
 		btrfs_free_path(path);
+	if (ret == -EAGAIN) {
+		if (root->defrag_max.objectid > root->defrag_progress.objectid)
+			goto done;
+		if (root->defrag_max.type > root->defrag_progress.type)
+			goto done;
+		if (root->defrag_max.offset > root->defrag_progress.offset)
+			goto done;
+		ret = 0;
+	}
+done:
 	if (ret != -EAGAIN) {
 		memset(&root->defrag_progress, 0,
 		       sizeof(root->defrag_progress));
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 333e5269577a..722eb4550154 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -750,10 +750,6 @@ static int btrfs_rm_dev_item(struct btrfs_root *root,
 	if (bdev == fs_devices->latest_bdev)
 		fs_devices->latest_bdev = next_dev->bdev;
 
-	total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
-	btrfs_set_super_total_bytes(&root->fs_info->super_copy,
-				    total_bytes - device->total_bytes);
-
 	total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy);
 	btrfs_set_super_num_devices(&root->fs_info->super_copy,
 				    total_bytes - 1);
@@ -849,6 +845,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 
 	}
 	root->fs_info->fs_devices->num_devices--;
+	root->fs_info->fs_devices->open_devices--;
 
 	ret = btrfs_shrink_device(device, 0);
 	if (ret)
@@ -873,7 +870,6 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 	if (device->bdev) {
 		/* one close for the device struct or super_block */
 		close_bdev_excl(device->bdev);
-		root->fs_info->fs_devices->open_devices--;
 	}
 	if (bdev) {
 		/* one close for us */
@@ -1450,7 +1446,7 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 		return -ENOSPC;
 
 	if (type & (BTRFS_BLOCK_GROUP_RAID0)) {
-		num_stripes = btrfs_super_num_devices(&info->super_copy);
+		num_stripes = extent_root->fs_info->fs_devices->open_devices;
 		min_stripes = 2;
 	}
 	if (type & (BTRFS_BLOCK_GROUP_DUP)) {
@@ -1459,13 +1455,13 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	}
 	if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
 		num_stripes = min_t(u64, 2,
-				  btrfs_super_num_devices(&info->super_copy));
+			    extent_root->fs_info->fs_devices->open_devices);
 		if (num_stripes < 2)
 			return -ENOSPC;
 		min_stripes = 2;
 	}
 	if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
-		num_stripes = btrfs_super_num_devices(&info->super_copy);
+		num_stripes = extent_root->fs_info->fs_devices->open_devices;
 		if (num_stripes < 4)
 			return -ENOSPC;
 		num_stripes &= ~(u32)1;
-- 
cgit v1.2.3


From 8b7128429235d9bd72cfd5ed20c77c4f3118f744 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 11 Jun 2008 16:50:36 -0400
Subject: Btrfs: Add async worker threads for pre and post IO checksumming

Btrfs has been using workqueues to spread the checksumming load across
other CPUs in the system.  But, workqueues only schedule work on the
same CPU that queued the work, giving them a limited benefit for systems with
higher CPU counts.

This code adds a generic facility to schedule work with pools of kthreads,
and changes the bio submission code to queue bios up.  The queueing is
important to make sure large numbers of procs on the system don't
turn streaming workloads into random workloads by sending IO down
concurrently.

The end result of all of this is much higher performance (and CPU usage) when
doing checksumming on large machines.  Two worker pools are created,
one for writes and one for endio processing.  The two could deadlock if
we tried to service both from a single pool.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/Makefile       |   2 +-
 fs/btrfs/async-thread.c | 288 ++++++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/async-thread.h |  78 +++++++++++++
 fs/btrfs/ctree.h        |  14 ++-
 fs/btrfs/disk-io.c      | 200 ++++++++++++++-------------------
 fs/btrfs/inode.c        |   4 +-
 fs/btrfs/volumes.c      | 162 ++++++++++++++++++++++++++-
 fs/btrfs/volumes.h      |  10 +-
 8 files changed, 626 insertions(+), 132 deletions(-)
 create mode 100644 fs/btrfs/async-thread.c
 create mode 100644 fs/btrfs/async-thread.h

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index e5fc3cfea0ab..9dcfc2fe3332 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -6,7 +6,7 @@ btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   hash.o file-item.o inode-item.o inode-map.o disk-io.o \
 	   transaction.o bit-radix.o inode.o file.o tree-defrag.o \
 	   extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
-	   extent_io.o volumes.o
+	   extent_io.o volumes.o async-thread.o
 
 btrfs-$(CONFIG_FS_POSIX_ACL)	+= acl.o
 else
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
new file mode 100644
index 000000000000..2911b67bd6f7
--- /dev/null
+++ b/fs/btrfs/async-thread.c
@@ -0,0 +1,288 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/kthread.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/freezer.h>
+#include "async-thread.h"
+
+/*
+ * container for the kthread task pointer and the list of pending work
+ * One of these is allocated per thread.
+ */
+struct btrfs_worker_thread {
+	/* list of struct btrfs_work that are waiting for service */
+	struct list_head pending;
+
+	/* list of worker threads from struct btrfs_workers */
+	struct list_head worker_list;
+
+	/* kthread */
+	struct task_struct *task;
+
+	/* number of things on the pending list */
+	atomic_t num_pending;
+
+	/* protects the pending list. */
+	spinlock_t lock;
+
+	/* set to non-zero when this thread is already awake and kicking */
+	int working;
+};
+
+/*
+ * main loop for servicing work items
+ */
+static int worker_loop(void *arg)
+{
+	struct btrfs_worker_thread *worker = arg;
+	struct list_head *cur;
+	struct btrfs_work *work;
+	do {
+		spin_lock_irq(&worker->lock);
+		while(!list_empty(&worker->pending)) {
+			cur = worker->pending.next;
+			work = list_entry(cur, struct btrfs_work, list);
+			list_del(&work->list);
+			clear_bit(0, &work->flags);
+
+			work->worker = worker;
+			spin_unlock_irq(&worker->lock);
+
+			work->func(work);
+
+			atomic_dec(&worker->num_pending);
+			spin_lock_irq(&worker->lock);
+		}
+		worker->working = 0;
+		if (freezing(current)) {
+			refrigerator();
+		} else {
+			set_current_state(TASK_INTERRUPTIBLE);
+			spin_unlock_irq(&worker->lock);
+			schedule();
+			__set_current_state(TASK_RUNNING);
+		}
+	} while (!kthread_should_stop());
+	return 0;
+}
+
+/*
+ * this will wait for all the worker threads to shutdown
+ */
+int btrfs_stop_workers(struct btrfs_workers *workers)
+{
+	struct list_head *cur;
+	struct btrfs_worker_thread *worker;
+
+	while(!list_empty(&workers->worker_list)) {
+		cur = workers->worker_list.next;
+		worker = list_entry(cur, struct btrfs_worker_thread,
+				    worker_list);
+		kthread_stop(worker->task);
+		list_del(&worker->worker_list);
+		kfree(worker);
+	}
+	return 0;
+}
+
+/*
+ * simple init on struct btrfs_workers
+ */
+void btrfs_init_workers(struct btrfs_workers *workers, int max)
+{
+	workers->num_workers = 0;
+	INIT_LIST_HEAD(&workers->worker_list);
+	workers->last = NULL;
+	spin_lock_init(&workers->lock);
+	workers->max_workers = max;
+}
+
+/*
+ * starts new worker threads.  This does not enforce the max worker
+ * count in case you need to temporarily go past it.
+ */
+int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
+{
+	struct btrfs_worker_thread *worker;
+	int ret = 0;
+	int i;
+
+	for (i = 0; i < num_workers; i++) {
+		worker = kzalloc(sizeof(*worker), GFP_NOFS);
+		if (!worker) {
+			ret = -ENOMEM;
+			goto fail;
+		}
+
+		INIT_LIST_HEAD(&worker->pending);
+		INIT_LIST_HEAD(&worker->worker_list);
+		spin_lock_init(&worker->lock);
+		atomic_set(&worker->num_pending, 0);
+		worker->task = kthread_run(worker_loop, worker, "btrfs");
+		if (IS_ERR(worker->task)) {
+			ret = PTR_ERR(worker->task);
+			goto fail;
+		}
+
+		spin_lock_irq(&workers->lock);
+		list_add_tail(&worker->worker_list, &workers->worker_list);
+		workers->last = worker;
+		workers->num_workers++;
+		spin_unlock_irq(&workers->lock);
+	}
+	return 0;
+fail:
+	btrfs_stop_workers(workers);
+	return ret;
+}
+
+/*
+ * run through the list and find a worker thread that doesn't have a lot
+ * to do right now.  This can return null if we aren't yet at the thread
+ * count limit and all of the threads are busy.
+ */
+static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers)
+{
+	struct btrfs_worker_thread *worker;
+	struct list_head *next;
+	struct list_head *start;
+	int enforce_min = workers->num_workers < workers->max_workers;
+
+	/* start with the last thread if it isn't busy */
+	worker = workers->last;
+	if (atomic_read(&worker->num_pending) < 64)
+		goto done;
+
+	next = worker->worker_list.next;
+	start = &worker->worker_list;
+
+	/*
+	 * check all the workers for someone that is bored.  FIXME, do
+	 * something smart here
+	 */
+	while(next != start) {
+		if (next == &workers->worker_list) {
+			next = workers->worker_list.next;
+			continue;
+		}
+		worker = list_entry(next, struct btrfs_worker_thread,
+				    worker_list);
+		if (atomic_read(&worker->num_pending) < 64 || !enforce_min)
+			goto done;
+		next = next->next;
+	}
+	/*
+	 * nobody was bored, if we're already at the max thread count,
+	 * use the last thread
+	 */
+	if (!enforce_min || atomic_read(&workers->last->num_pending) < 64) {
+		return workers->last;
+	}
+	return NULL;
+done:
+	workers->last = worker;
+	return worker;
+}
+
+static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers)
+{
+	struct btrfs_worker_thread *worker;
+	unsigned long flags;
+
+again:
+	spin_lock_irqsave(&workers->lock, flags);
+	worker = next_worker(workers);
+	spin_unlock_irqrestore(&workers->lock, flags);
+
+	if (!worker) {
+		spin_lock_irqsave(&workers->lock, flags);
+		if (workers->num_workers >= workers->max_workers) {
+			/*
+			 * we have failed to find any workers, just
+			 * return the first one
+			 */
+			worker = list_entry(workers->worker_list.next,
+				  struct btrfs_worker_thread, worker_list);
+			spin_unlock_irqrestore(&workers->lock, flags);
+		} else {
+			spin_unlock_irqrestore(&workers->lock, flags);
+			/* we're below the limit, start another worker */
+			btrfs_start_workers(workers, 1);
+			goto again;
+		}
+	}
+	return worker;
+}
+
+/*
+ * btrfs_requeue_work just puts the work item back on the tail of the list
+ * it was taken from.  It is intended for use with long running work functions
+ * that make some progress and want to give the cpu up for others.
+ */
+int btrfs_requeue_work(struct btrfs_work *work)
+{
+	struct btrfs_worker_thread *worker = work->worker;
+	unsigned long flags;
+
+	if (test_and_set_bit(0, &work->flags))
+		goto out;
+
+	spin_lock_irqsave(&worker->lock, flags);
+	atomic_inc(&worker->num_pending);
+	list_add_tail(&work->list, &worker->pending);
+	spin_unlock_irqrestore(&worker->lock, flags);
+out:
+	return 0;
+}
+
+/*
+ * places a struct btrfs_work into the pending queue of one of the kthreads
+ */
+int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
+{
+	struct btrfs_worker_thread *worker;
+	unsigned long flags;
+	int wake = 0;
+
+	/* don't requeue something already on a list */
+	if (test_and_set_bit(0, &work->flags))
+		goto out;
+
+	worker = find_worker(workers);
+
+	spin_lock_irqsave(&worker->lock, flags);
+	atomic_inc(&worker->num_pending);
+	list_add_tail(&work->list, &worker->pending);
+
+	/*
+	 * avoid calling into wake_up_process if this thread has already
+	 * been kicked
+	 */
+	if (!worker->working)
+		wake = 1;
+	worker->working = 1;
+
+	spin_unlock_irqrestore(&worker->lock, flags);
+
+	if (wake)
+		wake_up_process(worker->task);
+out:
+	return 0;
+}
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
new file mode 100644
index 000000000000..52fc9da0f9e7
--- /dev/null
+++ b/fs/btrfs/async-thread.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __BTRFS_ASYNC_THREAD_
+#define __BTRFS_ASYNC_THREAD_
+
+struct btrfs_worker_thread;
+
+/*
+ * This is similar to a workqueue, but it is meant to spread the operations
+ * across all available cpus instead of just the CPU that was used to
+ * queue the work.  There is also some batching introduced to try and
+ * cut down on context switches.
+ *
+ * By default threads are added on demand up to 2 * the number of cpus.
+ * Changing struct btrfs_workers->max_workers is one way to prevent
+ * demand creation of kthreads.
+ *
+ * the basic model of these worker threads is to embed a btrfs_work
+ * structure in your own data struct, and use container_of in a
+ * work function to get back to your data struct.
+ */
+struct btrfs_work {
+	/*
+	 * only func should be set to the function you want called
+	 * your work struct is passed as the only arg
+	 */
+	void (*func)(struct btrfs_work *work);
+
+	/*
+	 * flags should be set to zero.  It is used to make sure the
+	 * struct is only inserted once into the list.
+	 */
+	unsigned long flags;
+
+	/* don't touch these */
+	struct btrfs_worker_thread *worker;
+	struct list_head list;
+};
+
+struct btrfs_workers {
+	/* current number of running workers */
+	int num_workers;
+
+	/* max number of workers allowed.  set by btrfs_init_workers */
+	int max_workers;
+
+	/* list with all the work threads */
+	struct list_head worker_list;
+
+	/* the last worker thread to have something queued */
+	struct btrfs_worker_thread *last;
+
+	/* lock for finding the next worker thread to queue on */
+	spinlock_t lock;
+};
+
+int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work);
+int btrfs_start_workers(struct btrfs_workers *workers, int num_workers);
+int btrfs_stop_workers(struct btrfs_workers *workers);
+void btrfs_init_workers(struct btrfs_workers *workers, int max);
+int btrfs_requeue_work(struct btrfs_work *work);
+#endif
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 49cbc62b42f9..6c91a510c965 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -30,6 +30,7 @@
 #include "bit-radix.h"
 #include "extent_io.h"
 #include "extent_map.h"
+#include "async-thread.h"
 
 struct btrfs_trans_handle;
 struct btrfs_transaction;
@@ -518,13 +519,20 @@ struct btrfs_fs_info {
 	struct list_head hashers;
 	struct list_head dead_roots;
 	struct list_head end_io_work_list;
-	struct list_head async_submit_work_list;
 	struct work_struct end_io_work;
-	struct work_struct async_submit_work;
 	spinlock_t end_io_work_lock;
-	spinlock_t async_submit_work_lock;
 	atomic_t nr_async_submits;
 
+	/*
+	 * there is a pool of worker threads for checksumming during writes
+	 * and a pool for checksumming after reads.  This is because readers
+	 * can run with FS locks held, and the writers may be waiting for
+	 * those locks.  We don't want ordering in the pending list to cause
+	 * deadlocks, and so the two are serviced separately.
+	 */
+	struct btrfs_workers workers;
+	struct btrfs_workers endio_workers;
+
 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
 	struct work_struct trans_work;
 #else
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index b9a53646ceb2..98ff4fbcb386 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -31,6 +31,7 @@
 #include "btrfs_inode.h"
 #include "volumes.h"
 #include "print-tree.h"
+#include "async-thread.h"
 
 #if 0
 static int check_tree_block(struct btrfs_root *root, struct extent_buffer *buf)
@@ -46,8 +47,7 @@ static int check_tree_block(struct btrfs_root *root, struct extent_buffer *buf)
 #endif
 
 static struct extent_io_ops btree_extent_io_ops;
-static struct workqueue_struct *end_io_workqueue;
-static struct workqueue_struct *async_submit_workqueue;
+static void end_workqueue_fn(struct btrfs_work *work);
 
 struct end_io_wq {
 	struct bio *bio;
@@ -57,6 +57,7 @@ struct end_io_wq {
 	int error;
 	int metadata;
 	struct list_head list;
+	struct btrfs_work work;
 };
 
 struct async_submit_bio {
@@ -66,6 +67,7 @@ struct async_submit_bio {
 	extent_submit_bio_hook_t *submit_bio_hook;
 	int rw;
 	int mirror_num;
+	struct btrfs_work work;
 };
 
 struct extent_map *btree_get_extent(struct inode *inode, struct page *page,
@@ -389,7 +391,6 @@ static int end_workqueue_bio(struct bio *bio,
 {
 	struct end_io_wq *end_io_wq = bio->bi_private;
 	struct btrfs_fs_info *fs_info;
-	unsigned long flags;
 
 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
 	if (bio->bi_size)
@@ -397,11 +398,10 @@ static int end_workqueue_bio(struct bio *bio,
 #endif
 
 	fs_info = end_io_wq->info;
-	spin_lock_irqsave(&fs_info->end_io_work_lock, flags);
 	end_io_wq->error = err;
-	list_add_tail(&end_io_wq->list, &fs_info->end_io_work_list);
-	spin_unlock_irqrestore(&fs_info->end_io_work_lock, flags);
-	queue_work(end_io_workqueue, &fs_info->end_io_work);
+	end_io_wq->work.func = end_workqueue_fn;
+	end_io_wq->work.flags = 0;
+	btrfs_queue_worker(&fs_info->endio_workers, &end_io_wq->work);
 
 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
 	return 0;
@@ -428,6 +428,19 @@ int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
 	return 0;
 }
 
+static void run_one_async_submit(struct btrfs_work *work)
+{
+	struct btrfs_fs_info *fs_info;
+	struct async_submit_bio *async;
+
+	async = container_of(work, struct  async_submit_bio, work);
+	fs_info = BTRFS_I(async->inode)->root->fs_info;
+	atomic_dec(&fs_info->nr_async_submits);
+	async->submit_bio_hook(async->inode, async->rw, async->bio,
+			       async->mirror_num);
+	kfree(async);
+}
+
 int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 			int rw, struct bio *bio, int mirror_num,
 			extent_submit_bio_hook_t *submit_bio_hook)
@@ -443,13 +456,10 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 	async->bio = bio;
 	async->mirror_num = mirror_num;
 	async->submit_bio_hook = submit_bio_hook;
-
-	spin_lock(&fs_info->async_submit_work_lock);
-	list_add_tail(&async->list, &fs_info->async_submit_work_list);
+	async->work.func = run_one_async_submit;
+	async->work.flags = 0;
 	atomic_inc(&fs_info->nr_async_submits);
-	spin_unlock(&fs_info->async_submit_work_lock);
-
-	queue_work(async_submit_workqueue, &fs_info->async_submit_work);
+	btrfs_queue_worker(&fs_info->workers, &async->work);
 	return 0;
 }
 
@@ -462,19 +472,32 @@ static int __btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 
 	offset = bio->bi_sector << 9;
 
+	/*
+	 * when we're called for a write, we're already in the async
+	 * submission context.  Just jump into btrfs_map_bio
+	 */
 	if (rw & (1 << BIO_RW)) {
-		return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num);
+		return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
+				     mirror_num, 0);
 	}
 
+	/*
+	 * called for a read, do the setup so that checksum validation
+	 * can happen in the async kernel threads
+	 */
 	ret = btrfs_bio_wq_end_io(root->fs_info, bio, 1);
 	BUG_ON(ret);
 
-	return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num);
+	return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1);
 }
 
 static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 				 int mirror_num)
 {
+	/*
+	 * kthread helpers are used to submit writes so that checksumming
+	 * can happen in parallel across all CPUs
+	 */
 	if (!(rw & (1 << BIO_RW))) {
 		return __btree_submit_bio_hook(inode, rw, bio, mirror_num);
 	}
@@ -1036,95 +1059,40 @@ static int bio_ready_for_csum(struct bio *bio)
 	return ret;
 }
 
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
-static void btrfs_end_io_csum(void *p)
-#else
-static void btrfs_end_io_csum(struct work_struct *work)
-#endif
+/*
+ * called by the kthread helper functions to finally call the bio end_io
+ * functions.  This is where read checksum verification actually happens
+ */
+static void end_workqueue_fn(struct btrfs_work *work)
 {
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
-	struct btrfs_fs_info *fs_info = p;
-#else
-	struct btrfs_fs_info *fs_info = container_of(work,
-						     struct btrfs_fs_info,
-						     end_io_work);
-#endif
-	unsigned long flags;
-	struct end_io_wq *end_io_wq;
 	struct bio *bio;
-	struct list_head *next;
+	struct end_io_wq *end_io_wq;
+	struct btrfs_fs_info *fs_info;
 	int error;
-	int was_empty;
 
-	while(1) {
-		spin_lock_irqsave(&fs_info->end_io_work_lock, flags);
-		if (list_empty(&fs_info->end_io_work_list)) {
-			spin_unlock_irqrestore(&fs_info->end_io_work_lock,
-					       flags);
-			return;
-		}
-		next = fs_info->end_io_work_list.next;
-		list_del(next);
-		spin_unlock_irqrestore(&fs_info->end_io_work_lock, flags);
-
-		end_io_wq = list_entry(next, struct end_io_wq, list);
-
-		bio = end_io_wq->bio;
-		if (end_io_wq->metadata && !bio_ready_for_csum(bio)) {
-			spin_lock_irqsave(&fs_info->end_io_work_lock, flags);
-			was_empty = list_empty(&fs_info->end_io_work_list);
-			list_add_tail(&end_io_wq->list,
-				      &fs_info->end_io_work_list);
-			spin_unlock_irqrestore(&fs_info->end_io_work_lock,
-					       flags);
-			if (was_empty)
-				return;
-			continue;
-		}
-		error = end_io_wq->error;
-		bio->bi_private = end_io_wq->private;
-		bio->bi_end_io = end_io_wq->end_io;
-		kfree(end_io_wq);
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
-		bio_endio(bio, bio->bi_size, error);
-#else
-		bio_endio(bio, error);
-#endif
-	}
-}
+	end_io_wq = container_of(work, struct end_io_wq, work);
+	bio = end_io_wq->bio;
+	fs_info = end_io_wq->info;
 
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
-static void btrfs_async_submit_work(void *p)
-#else
-static void btrfs_async_submit_work(struct work_struct *work)
-#endif
-{
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
-	struct btrfs_fs_info *fs_info = p;
+	/* metadata bios are special because the whole tree block must
+	 * be checksummed at once.  This makes sure the entire block is in
+	 * ram and up to date before trying to verify things.  For
+	 * blocksize <= pagesize, it is basically a noop
+	 */
+	if (end_io_wq->metadata && !bio_ready_for_csum(bio)) {
+		btrfs_queue_worker(&fs_info->endio_workers,
+				   &end_io_wq->work);
+		return;
+	}
+	error = end_io_wq->error;
+	bio->bi_private = end_io_wq->private;
+	bio->bi_end_io = end_io_wq->end_io;
+	kfree(end_io_wq);
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
+	bio_endio(bio, bio->bi_size, error);
 #else
-	struct btrfs_fs_info *fs_info = container_of(work,
-						     struct btrfs_fs_info,
-						     async_submit_work);
+	bio_endio(bio, error);
 #endif
-	struct async_submit_bio *async;
-	struct list_head *next;
-
-	while(1) {
-		spin_lock(&fs_info->async_submit_work_lock);
-		if (list_empty(&fs_info->async_submit_work_list)) {
-			spin_unlock(&fs_info->async_submit_work_lock);
-			return;
-		}
-		next = fs_info->async_submit_work_list.next;
-		list_del(next);
-		atomic_dec(&fs_info->nr_async_submits);
-		spin_unlock(&fs_info->async_submit_work_lock);
-
-		async = list_entry(next, struct async_submit_bio, list);
-		async->submit_bio_hook(async->inode, async->rw, async->bio,
-				       async->mirror_num);
-		kfree(async);
-	}
 }
 
 struct btrfs_root *open_ctree(struct super_block *sb,
@@ -1155,19 +1123,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 		err = -ENOMEM;
 		goto fail;
 	}
-	end_io_workqueue = create_workqueue("btrfs-end-io");
-	BUG_ON(!end_io_workqueue);
-	async_submit_workqueue = create_workqueue("btrfs-async-submit");
-
 	INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_NOFS);
 	INIT_LIST_HEAD(&fs_info->trans_list);
 	INIT_LIST_HEAD(&fs_info->dead_roots);
 	INIT_LIST_HEAD(&fs_info->hashers);
-	INIT_LIST_HEAD(&fs_info->end_io_work_list);
-	INIT_LIST_HEAD(&fs_info->async_submit_work_list);
 	spin_lock_init(&fs_info->hash_lock);
-	spin_lock_init(&fs_info->end_io_work_lock);
-	spin_lock_init(&fs_info->async_submit_work_lock);
 	spin_lock_init(&fs_info->delalloc_lock);
 	spin_lock_init(&fs_info->new_trans_lock);
 
@@ -1222,13 +1182,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	fs_info->do_barriers = 1;
 
 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
-	INIT_WORK(&fs_info->end_io_work, btrfs_end_io_csum, fs_info);
-	INIT_WORK(&fs_info->async_submit_work, btrfs_async_submit_work,
-		  fs_info);
 	INIT_WORK(&fs_info->trans_work, btrfs_transaction_cleaner, fs_info);
 #else
-	INIT_WORK(&fs_info->end_io_work, btrfs_end_io_csum);
-	INIT_WORK(&fs_info->async_submit_work, btrfs_async_submit_work);
 	INIT_DELAYED_WORK(&fs_info->trans_work, btrfs_transaction_cleaner);
 #endif
 	BTRFS_I(fs_info->btree_inode)->root = tree_root;
@@ -1240,6 +1195,19 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	mutex_init(&fs_info->trans_mutex);
 	mutex_init(&fs_info->fs_mutex);
 
+	/* we need to start all the end_io workers up front because the
+	 * queue work function gets called at interrupt time.  The endio
+	 * workers don't normally start IO, so some number of them <= the
+	 * number of cpus is fine.  They handle checksumming after a read.
+	 *
+	 * The other worker threads do start IO, so the max is larger than
+	 * the number of CPUs.  FIXME, tune this for huge machines
+	 */
+	btrfs_init_workers(&fs_info->workers, num_online_cpus() * 2);
+	btrfs_init_workers(&fs_info->endio_workers, num_online_cpus());
+	btrfs_start_workers(&fs_info->workers, 1);
+	btrfs_start_workers(&fs_info->endio_workers, num_online_cpus());
+
 #if 0
 	ret = add_hasher(fs_info, "crc32c");
 	if (ret) {
@@ -1375,6 +1343,8 @@ fail_sb_buffer:
 	extent_io_tree_empty_lru(&BTRFS_I(fs_info->btree_inode)->io_tree);
 fail_iput:
 	iput(fs_info->btree_inode);
+	btrfs_stop_workers(&fs_info->workers);
+	btrfs_stop_workers(&fs_info->endio_workers);
 fail:
 	btrfs_close_devices(fs_info->fs_devices);
 	btrfs_mapping_tree_free(&fs_info->mapping_tree);
@@ -1623,16 +1593,10 @@ int close_ctree(struct btrfs_root *root)
 	extent_io_tree_empty_lru(&fs_info->extent_ins);
 	extent_io_tree_empty_lru(&BTRFS_I(fs_info->btree_inode)->io_tree);
 
-	flush_workqueue(async_submit_workqueue);
-	flush_workqueue(end_io_workqueue);
-
 	truncate_inode_pages(fs_info->btree_inode->i_mapping, 0);
 
-	flush_workqueue(async_submit_workqueue);
-	destroy_workqueue(async_submit_workqueue);
-
-	flush_workqueue(end_io_workqueue);
-	destroy_workqueue(end_io_workqueue);
+	btrfs_stop_workers(&fs_info->workers);
+	btrfs_stop_workers(&fs_info->endio_workers);
 
 	iput(fs_info->btree_inode);
 #if 0
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 0f14697becef..7daef8d37006 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -359,7 +359,7 @@ int __btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 
 	kfree(sums);
 
-	return btrfs_map_bio(root, rw, bio, mirror_num);
+	return btrfs_map_bio(root, rw, bio, mirror_num, 1);
 }
 
 int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
@@ -383,7 +383,7 @@ int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 				   inode, rw, bio, mirror_num,
 				   __btrfs_submit_bio_hook);
 mapit:
-	return btrfs_map_bio(root, rw, bio, mirror_num);
+	return btrfs_map_bio(root, rw, bio, mirror_num, 0);
 }
 
 int btrfs_readpage_io_hook(struct page *page, u64 start, u64 end)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 722eb4550154..c57458ce6339 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -27,6 +27,7 @@
 #include "transaction.h"
 #include "print-tree.h"
 #include "volumes.h"
+#include "async-thread.h"
 
 struct map_lookup {
 	u64 type;
@@ -110,6 +111,101 @@ static struct btrfs_fs_devices *find_fsid(u8 *fsid)
 	return NULL;
 }
 
+/*
+ * we try to collect pending bios for a device so we don't get a large
+ * number of procs sending bios down to the same device.  This greatly
+ * improves the scheduler's ability to collect and merge the bios.
+ *
+ * But, it also turns into a long list of bios to process and that is sure
+ * to eventually make the worker thread block.  The solution here is to
+ * make some progress and then put this work struct back at the end of
+ * the list if the block device is congested.  This way, multiple devices
+ * can make progress from a single worker thread.
+ */
+int run_scheduled_bios(struct btrfs_device *device)
+{
+	struct bio *pending;
+	struct backing_dev_info *bdi;
+	struct bio *tail;
+	struct bio *cur;
+	int again = 0;
+	unsigned long num_run = 0;
+
+	bdi = device->bdev->bd_inode->i_mapping->backing_dev_info;
+loop:
+	spin_lock(&device->io_lock);
+
+	/* take all the bios off the list at once and process them
+	 * later on (without the lock held).  But, remember the
+	 * tail and other pointers so the bios can be properly reinserted
+	 * into the list if we hit congestion
+	 */
+	pending = device->pending_bios;
+	tail = device->pending_bio_tail;
+	WARN_ON(pending && !tail);
+	device->pending_bios = NULL;
+	device->pending_bio_tail = NULL;
+
+	/*
+	 * if pending was null this time around, no bios need processing
+	 * at all and we can stop.  Otherwise it'll loop back up again
+	 * and do an additional check so no bios are missed.
+	 *
+	 * device->running_pending is used to synchronize with the
+	 * schedule_bio code.
+	 */
+	if (pending) {
+		again = 1;
+		device->running_pending = 1;
+	} else {
+		again = 0;
+		device->running_pending = 0;
+	}
+	spin_unlock(&device->io_lock);
+
+	while(pending) {
+		cur = pending;
+		pending = pending->bi_next;
+		cur->bi_next = NULL;
+		atomic_dec(&device->dev_root->fs_info->nr_async_submits);
+		submit_bio(cur->bi_rw, cur);
+		num_run++;
+
+		/*
+		 * we made progress, there is more work to do and the bdi
+		 * is now congested.  Back off and let other work structs
+		 * run instead
+		 */
+		if (pending && num_run && bdi_write_congested(bdi)) {
+			struct bio *old_head;
+
+			spin_lock(&device->io_lock);
+			old_head = device->pending_bios;
+			device->pending_bios = pending;
+			if (device->pending_bio_tail)
+				tail->bi_next = old_head;
+			else
+				device->pending_bio_tail = tail;
+
+			spin_unlock(&device->io_lock);
+			btrfs_requeue_work(&device->work);
+			goto done;
+		}
+	}
+	if (again)
+		goto loop;
+done:
+	return 0;
+}
+
+void pending_bios_fn(struct btrfs_work *work)
+{
+	struct btrfs_device *device;
+
+	device = container_of(work, struct btrfs_device, work);
+	run_scheduled_bios(device);
+}
+
 static int device_list_add(const char *path,
 			   struct btrfs_super_block *disk_super,
 			   u64 devid, struct btrfs_fs_devices **fs_devices_ret)
@@ -141,6 +237,7 @@ static int device_list_add(const char *path,
 			return -ENOMEM;
 		}
 		device->devid = devid;
+		device->work.func = pending_bios_fn;
 		memcpy(device->uuid, disk_super->dev_item.uuid,
 		       BTRFS_UUID_SIZE);
 		device->barriers = 1;
@@ -925,6 +1022,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 	}
 
 	device->barriers = 1;
+	device->work.func = pending_bios_fn;
 	generate_random_uuid(device->uuid);
 	spin_lock_init(&device->io_lock);
 	device->name = kstrdup(device_path, GFP_NOFS);
@@ -1965,8 +2063,61 @@ static int end_bio_multi_stripe(struct bio *bio,
 #endif
 }
 
+struct async_sched {
+	struct bio *bio;
+	int rw;
+	struct btrfs_fs_info *info;
+	struct btrfs_work work;
+};
+
+/*
+ * see run_scheduled_bios for a description of why bios are collected for
+ * async submit.
+ *
+ * This will add one bio to the pending list for a device and make sure
+ * the work struct is scheduled.
+ */
+int schedule_bio(struct btrfs_root *root, struct btrfs_device *device,
+		 int rw, struct bio *bio)
+{
+	int should_queue = 1;
+
+	/* don't bother with additional async steps for reads, right now */
+	if (!(rw & (1 << BIO_RW))) {
+		submit_bio(rw, bio);
+		return 0;
+	}
+
+	/*
+	 * nr_async_submits allows us to reliably return congestion to the
+	 * higher layers.  Otherwise, the async bio makes it appear we have
+	 * made progress against dirty pages when we've really just put it
+	 * on a queue for later
+	 */
+	atomic_inc(&root->fs_info->nr_async_submits);
+	bio->bi_next = NULL;
+	bio->bi_rw |= rw;
+
+	spin_lock(&device->io_lock);
+
+	if (device->pending_bio_tail)
+		device->pending_bio_tail->bi_next = bio;
+
+	device->pending_bio_tail = bio;
+	if (!device->pending_bios)
+		device->pending_bios = bio;
+	if (device->running_pending)
+		should_queue = 0;
+
+	spin_unlock(&device->io_lock);
+
+	if (should_queue)
+		btrfs_queue_worker(&root->fs_info->workers, &device->work);
+	return 0;
+}
+
 int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
-		  int mirror_num)
+		  int mirror_num, int async_submit)
 {
 	struct btrfs_mapping_tree *map_tree;
 	struct btrfs_device *dev;
@@ -2012,10 +2163,10 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 		dev = multi->stripes[dev_nr].dev;
 		if (dev && dev->bdev) {
 			bio->bi_bdev = dev->bdev;
-			spin_lock(&dev->io_lock);
-			dev->total_ios++;
-			spin_unlock(&dev->io_lock);
-			submit_bio(rw, bio);
+			if (async_submit)
+				schedule_bio(root, dev, rw, bio);
+			else
+				submit_bio(rw, bio);
 		} else {
 			bio->bi_bdev = root->fs_info->fs_devices->latest_bdev;
 			bio->bi_sector = logical >> 9;
@@ -2054,6 +2205,7 @@ static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
 	device->barriers = 1;
 	device->dev_root = root->fs_info->dev_root;
 	device->devid = devid;
+	device->work.func = pending_bios_fn;
 	fs_devices->num_devices++;
 	spin_lock_init(&device->io_lock);
 	memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE);
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 4df6b1608f91..48a44f7a9385 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -20,6 +20,7 @@
 #define __BTRFS_VOLUMES_
 
 #include <linux/bio.h>
+#include "async-thread.h"
 
 struct buffer_head;
 struct btrfs_device {
@@ -27,6 +28,9 @@ struct btrfs_device {
 	struct list_head dev_alloc_list;
 	struct btrfs_root *dev_root;
 	struct buffer_head *pending_io;
+	struct bio *pending_bios;
+	struct bio *pending_bio_tail;
+	int running_pending;
 	u64 generation;
 
 	int barriers;
@@ -36,8 +40,6 @@ struct btrfs_device {
 
 	struct block_device *bdev;
 
-	u64 total_ios;
-
 	char *name;
 
 	/* the internal btrfs device id */
@@ -63,6 +65,8 @@ struct btrfs_device {
 
 	/* physical drive uuid (or lvm uuid) */
 	u8 uuid[BTRFS_UUID_SIZE];
+
+	struct btrfs_work work;
 };
 
 struct btrfs_fs_devices {
@@ -117,7 +121,7 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 void btrfs_mapping_init(struct btrfs_mapping_tree *tree);
 void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree);
 int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
-		  int mirror_num);
+		  int mirror_num, int async_submit);
 int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf);
 int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 		       int flags, void *holder);
-- 
cgit v1.2.3


From 1cc127b5d1b71453091859301de4a7dd6ee96fa8 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 12 Jun 2008 14:46:17 -0400
Subject: Btrfs: Add a thread pool just for submit_bio

If a bio submission is queued behind a lock holder that is waiting for
that bio on the same work queue, it is possible to deadlock.  Move the
bios into their own pool.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h   | 4 ++++
 fs/btrfs/disk-io.c | 4 ++++
 fs/btrfs/volumes.c | 3 ++-
 3 files changed, 10 insertions(+), 1 deletion(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 181c81d29897..dcea9d706d9b 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -529,9 +529,13 @@ struct btrfs_fs_info {
 	 * can run with FS locks held, and the writers may be waiting for
 	 * those locks.  We don't want ordering in the pending list to cause
 	 * deadlocks, and so the two are serviced separately.
+	 *
+	 * A third pool does submit_bio to avoid deadlocking with the other
+	 * two
 	 */
 	struct btrfs_workers workers;
 	struct btrfs_workers endio_workers;
+	struct btrfs_workers submit_workers;
 	int thread_pool_size;
 
 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index c6a710a668cb..e5c758e306d5 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1233,8 +1233,10 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	 * cannot dynamically grow.
 	 */
 	btrfs_init_workers(&fs_info->workers, fs_info->thread_pool_size);
+	btrfs_init_workers(&fs_info->submit_workers, fs_info->thread_pool_size);
 	btrfs_init_workers(&fs_info->endio_workers, fs_info->thread_pool_size);
 	btrfs_start_workers(&fs_info->workers, 1);
+	btrfs_start_workers(&fs_info->submit_workers, 1);
 	btrfs_start_workers(&fs_info->endio_workers, fs_info->thread_pool_size);
 
 
@@ -1343,6 +1345,7 @@ fail_sb_buffer:
 	extent_io_tree_empty_lru(&BTRFS_I(fs_info->btree_inode)->io_tree);
 	btrfs_stop_workers(&fs_info->workers);
 	btrfs_stop_workers(&fs_info->endio_workers);
+	btrfs_stop_workers(&fs_info->submit_workers);
 fail_iput:
 	iput(fs_info->btree_inode);
 fail:
@@ -1597,6 +1600,7 @@ int close_ctree(struct btrfs_root *root)
 
 	btrfs_stop_workers(&fs_info->workers);
 	btrfs_stop_workers(&fs_info->endio_workers);
+	btrfs_stop_workers(&fs_info->submit_workers);
 
 	iput(fs_info->btree_inode);
 #if 0
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index c57458ce6339..ba3968571024 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2112,7 +2112,8 @@ int schedule_bio(struct btrfs_root *root, struct btrfs_device *device,
 	spin_unlock(&device->io_lock);
 
 	if (should_queue)
-		btrfs_queue_worker(&root->fs_info->workers, &device->work);
+		btrfs_queue_worker(&root->fs_info->submit_workers,
+				   &device->work);
 	return 0;
 }
 
-- 
cgit v1.2.3


From a213501153fd66e2359e091b1612841305ba6551 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 25 Jun 2008 16:01:30 -0400
Subject: Btrfs: Replace the big fs_mutex with a collection of other locks

Extent allocations are still protected by a large alloc_mutex.
Objectid allocations are covered by an objectid mutex
Other btree operations are protected by a lock on individual btree nodes

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c       | 40 +++++++++++++++++++++-----
 fs/btrfs/ctree.h       |  5 ++--
 fs/btrfs/disk-io.c     | 15 +++++-----
 fs/btrfs/extent-tree.c | 18 +++++-------
 fs/btrfs/file.c        |  7 +----
 fs/btrfs/inode-map.c   |  8 ++++++
 fs/btrfs/inode.c       | 76 ++++----------------------------------------------
 fs/btrfs/ioctl.c       | 24 ++++------------
 fs/btrfs/super.c       |  2 --
 fs/btrfs/transaction.c | 42 +++++++++++-----------------
 fs/btrfs/volumes.c     | 19 +++++++++----
 fs/btrfs/xattr.c       | 10 +------
 12 files changed, 101 insertions(+), 165 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 75625c68fdd8..dff4da082d06 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -63,6 +63,9 @@ void btrfs_free_path(struct btrfs_path *p)
 void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p)
 {
 	int i;
+	int skip = p->skip_locking;
+	int keep = p->keep_locks;
+
 	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
 		if (!p->nodes[i])
 			continue;
@@ -73,6 +76,8 @@ void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p)
 		free_extent_buffer(p->nodes[i]);
 	}
 	memset(p, 0, sizeof(*p));
+	p->skip_locking = skip;
+	p->keep_locks = keep;
 }
 
 struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
@@ -1202,13 +1207,19 @@ static void unlock_up(struct btrfs_path *path, int level, int lowest_unlock)
 			u32 nritems;
 			t = path->nodes[i];
 			nritems = btrfs_header_nritems(t);
-			if (path->slots[i] >= nritems - 1) {
+			if (nritems < 2 || path->slots[i] >= nritems - 2) {
+if (path->keep_locks) {
+//printk("path %p skip level now %d\n", path, skip_level);
+}
 				skip_level = i + 1;
 				continue;
 			}
 		}
 		t = path->nodes[i];
 		if (i >= lowest_unlock && i > skip_level && path->locks[i]) {
+if (path->keep_locks) {
+//printk("path %p unlocking level %d slot %d nritems %d skip_level %d\n", path, i, path->slots[i], btrfs_header_nritems(t), skip_level);
+}
 			btrfs_tree_unlock(t);
 			path->locks[i] = 0;
 		}
@@ -1243,7 +1254,6 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
 	lowest_level = p->lowest_level;
 	WARN_ON(lowest_level && ins_len);
 	WARN_ON(p->nodes[0] != NULL);
-	// WARN_ON(!mutex_is_locked(&root->fs_info->fs_mutex));
 	WARN_ON(root == root->fs_info->extent_root &&
 		!mutex_is_locked(&root->fs_info->alloc_mutex));
 	WARN_ON(root == root->fs_info->chunk_root &&
@@ -1321,7 +1331,7 @@ again:
 			b = read_node_slot(root, b, slot);
 			if (!p->skip_locking)
 				btrfs_tree_lock(b);
-			unlock_up(p, level, lowest_unlock);
+			unlock_up(p, level + 1, lowest_unlock);
 		} else {
 			p->slots[level] = slot;
 			if (ins_len > 0 && btrfs_leaf_free_space(root, b) <
@@ -1804,6 +1814,8 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
 	if (slot >= btrfs_header_nritems(upper) - 1)
 		return 1;
 
+	WARN_ON(!btrfs_tree_locked(path->nodes[1]));
+
 	right = read_node_slot(root, upper, slot + 1);
 	btrfs_tree_lock(right);
 	free_space = btrfs_leaf_free_space(root, right);
@@ -1981,6 +1993,8 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
 		return 1;
 	}
 
+	WARN_ON(!btrfs_tree_locked(path->nodes[1]));
+
 	left = read_node_slot(root, path->nodes[1], slot - 1);
 	btrfs_tree_lock(left);
 	free_space = btrfs_leaf_free_space(root, left);
@@ -2957,15 +2971,16 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 
 	btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
 
-	path->keep_locks = 1;
 	btrfs_release_path(root, path);
+	path->keep_locks = 1;
 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 	path->keep_locks = 0;
 
 	if (ret < 0)
 		return ret;
 
-	if (path->slots[0] < nritems - 1) {
+	nritems = btrfs_header_nritems(path->nodes[0]);
+	if (nritems > 0 && path->slots[0] < nritems - 1) {
 		goto done;
 	}
 
@@ -2992,8 +3007,17 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 			reada_for_search(root, path, level, slot, 0);
 
 		next = read_node_slot(root, c, slot);
-		if (!path->skip_locking)
+		if (!path->skip_locking) {
+			if (!btrfs_tree_locked(c)) {
+				int i;
+				WARN_ON(1);
+printk("path %p no lock on level %d\n", path, level);
+for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
+printk("path %p level %d slot %d nritems %d\n", path, i, path->slots[i], btrfs_header_nritems(path->nodes[i]));
+}
+			}
 			btrfs_tree_lock(next);
+		}
 		break;
 	}
 	path->slots[level] = slot;
@@ -3011,8 +3035,10 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 		if (level == 1 && path->locks[1] && path->reada)
 			reada_for_search(root, path, level, slot, 0);
 		next = read_node_slot(root, next, 0);
-		if (!path->skip_locking)
+		if (!path->skip_locking) {
+			WARN_ON(!btrfs_tree_locked(path->nodes[level]));
 			btrfs_tree_lock(next);
+		}
 	}
 done:
 	unlock_up(path, 0, 1);
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 50891b39f366..692b8ea42de1 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -519,9 +519,9 @@ struct btrfs_fs_info {
 	struct backing_dev_info bdi;
 	spinlock_t hash_lock;
 	struct mutex trans_mutex;
-	struct mutex fs_mutex;
 	struct mutex alloc_mutex;
 	struct mutex chunk_mutex;
+	struct mutex drop_mutex;
 	struct list_head trans_list;
 	struct list_head hashers;
 	struct list_head dead_roots;
@@ -554,7 +554,7 @@ struct btrfs_fs_info {
 	struct completion kobj_unregister;
 	int do_barriers;
 	int closing;
-	unsigned long throttles;
+	atomic_t throttles;
 
 	u64 total_pinned;
 	struct list_head dirty_cowonly_roots;
@@ -594,6 +594,7 @@ struct btrfs_root {
 	struct inode *inode;
 	struct kobject root_kobj;
 	struct completion kobj_unregister;
+	struct mutex objectid_mutex;
 	u64 objectid;
 	u64 last_trans;
 
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index fe40bdd984ff..f638803549e0 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -724,6 +724,7 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 
 	INIT_LIST_HEAD(&root->dirty_list);
 	spin_lock_init(&root->node_lock);
+	mutex_init(&root->objectid_mutex);
 	memset(&root->root_key, 0, sizeof(root->root_key));
 	memset(&root->root_item, 0, sizeof(root->root_item));
 	memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
@@ -1146,6 +1147,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	INIT_LIST_HEAD(&fs_info->space_info);
 	btrfs_mapping_init(&fs_info->mapping_tree);
 	atomic_set(&fs_info->nr_async_submits, 0);
+	atomic_set(&fs_info->throttles, 0);
 	fs_info->sb = sb;
 	fs_info->max_extent = (u64)-1;
 	fs_info->max_inline = 8192 * 1024;
@@ -1199,7 +1201,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
 
 	mutex_init(&fs_info->trans_mutex);
-	mutex_init(&fs_info->fs_mutex);
+	mutex_init(&fs_info->drop_mutex);
 	mutex_init(&fs_info->alloc_mutex);
 	mutex_init(&fs_info->chunk_mutex);
 
@@ -1278,8 +1280,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 		goto fail_sb_buffer;
 	}
 
-	mutex_lock(&fs_info->fs_mutex);
-
 	mutex_lock(&fs_info->chunk_mutex);
 	ret = btrfs_read_sys_array(tree_root);
 	mutex_unlock(&fs_info->chunk_mutex);
@@ -1342,7 +1342,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	fs_info->metadata_alloc_profile = (u64)-1;
 	fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
 
-	mutex_unlock(&fs_info->fs_mutex);
 	return tree_root;
 
 fail_extent_root:
@@ -1350,7 +1349,6 @@ fail_extent_root:
 fail_tree_root:
 	free_extent_buffer(tree_root->node);
 fail_sys_array:
-	mutex_unlock(&fs_info->fs_mutex);
 fail_sb_buffer:
 	extent_io_tree_empty_lru(&BTRFS_I(fs_info->btree_inode)->io_tree);
 	btrfs_stop_workers(&fs_info->workers);
@@ -1562,8 +1560,9 @@ int close_ctree(struct btrfs_root *root)
 	struct btrfs_fs_info *fs_info = root->fs_info;
 
 	fs_info->closing = 1;
+	smp_mb();
+
 	btrfs_transaction_flush_work(root);
-	mutex_lock(&fs_info->fs_mutex);
 	btrfs_defrag_dirty_roots(root->fs_info);
 	trans = btrfs_start_transaction(root, 1);
 	ret = btrfs_commit_transaction(trans, root);
@@ -1574,7 +1573,6 @@ int close_ctree(struct btrfs_root *root)
 	BUG_ON(ret);
 
 	write_ctree_super(NULL, root);
-	mutex_unlock(&fs_info->fs_mutex);
 
 	btrfs_transaction_flush_work(root);
 
@@ -1679,7 +1677,8 @@ void btrfs_throttle(struct btrfs_root *root)
 	struct backing_dev_info *bdi;
 
 	bdi = &root->fs_info->bdi;
-	if (root->fs_info->throttles && bdi_write_congested(bdi)) {
+	if (atomic_read(&root->fs_info->throttles) &&
+	    bdi_write_congested(bdi)) {
 #if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,18)
 		congestion_wait(WRITE, HZ/20);
 #else
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 7e40c516fe62..890b9e9d8e27 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1577,9 +1577,11 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 		}
 
 		/* block accounting for super block */
+		spin_lock_irq(&info->delalloc_lock);
 		super_used = btrfs_super_bytes_used(&info->super_copy);
 		btrfs_set_super_bytes_used(&info->super_copy,
 					   super_used - num_bytes);
+		spin_unlock_irq(&info->delalloc_lock);
 
 		/* block accounting for root item */
 		root_used = btrfs_root_used(&root->root_item);
@@ -1968,8 +1970,10 @@ again:
 	}
 
 	/* block accounting for super block */
+	spin_lock_irq(&info->delalloc_lock);
 	super_used = btrfs_super_bytes_used(&info->super_copy);
 	btrfs_set_super_bytes_used(&info->super_copy, super_used + num_bytes);
+	spin_unlock_irq(&info->delalloc_lock);
 
 	/* block accounting for root item */
 	root_used = btrfs_root_used(&root->root_item);
@@ -2172,12 +2176,12 @@ static void noinline reada_walk_down(struct btrfs_root *root,
 				continue;
 			}
 		}
-		mutex_unlock(&root->fs_info->fs_mutex);
+		mutex_unlock(&root->fs_info->alloc_mutex);
 		ret = readahead_tree_block(root, bytenr, blocksize,
 					   btrfs_node_ptr_generation(node, i));
 		last = bytenr + blocksize;
 		cond_resched();
-		mutex_lock(&root->fs_info->fs_mutex);
+		mutex_lock(&root->fs_info->alloc_mutex);
 		if (ret)
 			break;
 	}
@@ -2254,11 +2258,9 @@ static int noinline walk_down_tree(struct btrfs_trans_handle *trans,
 			free_extent_buffer(next);
 			reada_walk_down(root, cur, path->slots[*level]);
 
-			mutex_unlock(&root->fs_info->fs_mutex);
 			mutex_unlock(&root->fs_info->alloc_mutex);
 			next = read_tree_block(root, bytenr, blocksize,
 					       ptr_gen);
-			mutex_lock(&root->fs_info->fs_mutex);
 			mutex_lock(&root->fs_info->alloc_mutex);
 
 			/* we've dropped the lock, double check */
@@ -2381,6 +2383,7 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
 	int orig_level;
 	struct btrfs_root_item *root_item = &root->root_item;
 
+	WARN_ON(!mutex_is_locked(&root->fs_info->drop_mutex));
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
 
@@ -2710,7 +2713,6 @@ static int noinline relocate_one_reference(struct btrfs_root *extent_root,
 		    *last_file_root == ref_root)
 			goto out;
 
-		mutex_unlock(&extent_root->fs_info->fs_mutex);
 		inode = btrfs_iget_locked(extent_root->fs_info->sb,
 					  ref_objectid, found_root);
 		if (inode->i_state & I_NEW) {
@@ -2727,7 +2729,6 @@ static int noinline relocate_one_reference(struct btrfs_root *extent_root,
 		 * the latest version of the tree root
 		 */
 		if (is_bad_inode(inode)) {
-			mutex_lock(&extent_root->fs_info->fs_mutex);
 			goto out;
 		}
 		*last_file_objectid = inode->i_ino;
@@ -2736,7 +2737,6 @@ static int noinline relocate_one_reference(struct btrfs_root *extent_root,
 
 		relocate_inode_pages(inode, ref_offset, extent_key->offset);
 		iput(inode);
-		mutex_lock(&extent_root->fs_info->fs_mutex);
 	} else {
 		struct btrfs_trans_handle *trans;
 		struct extent_buffer *eb;
@@ -3033,9 +3033,7 @@ next:
 
 		if (progress && need_resched()) {
 			memcpy(&key, &found_key, sizeof(key));
-			mutex_unlock(&root->fs_info->fs_mutex);
 			cond_resched();
-			mutex_lock(&root->fs_info->fs_mutex);
 			btrfs_release_path(root, path);
 			btrfs_search_slot(NULL, root, &key, path, 0, 0);
 			progress = 0;
@@ -3068,9 +3066,7 @@ next:
 		trans = btrfs_start_transaction(tree_root, 1);
 		btrfs_commit_transaction(trans, tree_root);
 
-		mutex_unlock(&root->fs_info->fs_mutex);
 		btrfs_clean_old_snapshots(tree_root);
-		mutex_lock(&root->fs_info->fs_mutex);
 
 		trans = btrfs_start_transaction(tree_root, 1);
 		btrfs_commit_transaction(trans, tree_root);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 73c6d085bd90..18bbe108a0e6 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -252,7 +252,6 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
 	end_of_last_block = start_pos + num_bytes - 1;
 
 	lock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS);
-	mutex_lock(&root->fs_info->fs_mutex);
 	trans = btrfs_start_transaction(root, 1);
 	if (!trans) {
 		err = -ENOMEM;
@@ -341,7 +340,6 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
 failed:
 	err = btrfs_end_transaction(trans, root);
 out_unlock:
-	mutex_unlock(&root->fs_info->fs_mutex);
 	unlock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS);
 	return err;
 }
@@ -905,9 +903,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 		WARN_ON(num_pages > nrptrs);
 		memset(pages, 0, sizeof(pages));
 
-		mutex_lock(&root->fs_info->fs_mutex);
 		ret = btrfs_check_free_space(root, write_bytes, 0);
-		mutex_unlock(&root->fs_info->fs_mutex);
 		if (ret)
 			goto out;
 
@@ -998,9 +994,9 @@ static int btrfs_sync_file(struct file *file,
 	 * check the transaction that last modified this inode
 	 * and see if its already been committed
 	 */
-	mutex_lock(&root->fs_info->fs_mutex);
 	if (!BTRFS_I(inode)->last_trans)
 		goto out;
+
 	mutex_lock(&root->fs_info->trans_mutex);
 	if (BTRFS_I(inode)->last_trans <=
 	    root->fs_info->last_trans_committed) {
@@ -1023,7 +1019,6 @@ static int btrfs_sync_file(struct file *file,
 	}
 	ret = btrfs_commit_transaction(trans, root);
 out:
-	mutex_unlock(&root->fs_info->fs_mutex);
 	return ret > 0 ? EIO : ret;
 }
 
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index a0925eabdaa2..298346ae1481 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -69,6 +69,12 @@ int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
 	struct btrfs_key search_key;
 	u64 search_start = dirid;
 
+	mutex_lock(&root->objectid_mutex);
+	if (root->last_inode_alloc) {
+		*objectid = ++root->last_inode_alloc;
+		mutex_unlock(&root->objectid_mutex);
+		return 0;
+	}
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
 	search_start = root->last_inode_alloc;
@@ -124,9 +130,11 @@ found:
 	btrfs_release_path(root, path);
 	btrfs_free_path(path);
 	BUG_ON(*objectid < search_start);
+	mutex_unlock(&root->objectid_mutex);
 	return 0;
 error:
 	btrfs_release_path(root, path);
 	btrfs_free_path(path);
+	mutex_unlock(&root->objectid_mutex);
 	return ret;
 }
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 61bd8953a683..b2251e27ac84 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -79,12 +79,15 @@ static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
 int btrfs_check_free_space(struct btrfs_root *root, u64 num_required,
 			   int for_del)
 {
-	u64 total = btrfs_super_total_bytes(&root->fs_info->super_copy);
-	u64 used = btrfs_super_bytes_used(&root->fs_info->super_copy);
+	u64 total;
+	u64 used;
 	u64 thresh;
 	unsigned long flags;
 	int ret = 0;
 
+	spin_lock_irqsave(&root->fs_info->delalloc_lock, flags);
+	total = btrfs_super_total_bytes(&root->fs_info->super_copy);
+	used = btrfs_super_bytes_used(&root->fs_info->super_copy);
 	if (for_del)
 		thresh = total * 90;
 	else
@@ -92,7 +95,6 @@ int btrfs_check_free_space(struct btrfs_root *root, u64 num_required,
 
 	do_div(thresh, 100);
 
-	spin_lock_irqsave(&root->fs_info->delalloc_lock, flags);
 	if (used + root->fs_info->delalloc_bytes + num_required > thresh)
 		ret = -ENOSPC;
 	spin_unlock_irqrestore(&root->fs_info->delalloc_lock, flags);
@@ -115,7 +117,6 @@ static int cow_file_range(struct inode *inode, u64 start, u64 end)
 	trans = btrfs_start_transaction(root, 1);
 	BUG_ON(!trans);
 	btrfs_set_trans_block_group(trans, inode);
-	mutex_unlock(&root->fs_info->fs_mutex);
 
 	num_bytes = (end - start + blocksize) & ~(blocksize - 1);
 	num_bytes = max(blocksize,  num_bytes);
@@ -160,7 +161,6 @@ static int cow_file_range(struct inode *inode, u64 start, u64 end)
 	btrfs_add_ordered_inode(inode);
 	btrfs_update_inode(trans, root, inode);
 out:
-	mutex_lock(&root->fs_info->fs_mutex);
 	btrfs_end_transaction(trans, root);
 	return ret;
 }
@@ -269,14 +269,13 @@ static int run_delalloc_range(struct inode *inode, u64 start, u64 end)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	int ret;
-	mutex_lock(&root->fs_info->fs_mutex);
+
 	if (btrfs_test_opt(root, NODATACOW) ||
 	    btrfs_test_flag(inode, NODATACOW))
 		ret = run_delalloc_nocow(inode, start, end);
 	else
 		ret = cow_file_range(inode, start, end);
 
-	mutex_unlock(&root->fs_info->fs_mutex);
 	return ret;
 }
 
@@ -349,17 +348,13 @@ int __btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 	ret = btrfs_csum_one_bio(root, bio, &sums);
 	BUG_ON(ret);
 
-	mutex_lock(&root->fs_info->fs_mutex);
 	trans = btrfs_start_transaction(root, 1);
-	mutex_unlock(&root->fs_info->fs_mutex);
 
 	btrfs_set_trans_block_group(trans, inode);
 	btrfs_csum_file_blocks(trans, root, inode, bio, sums);
 
-	mutex_lock(&root->fs_info->fs_mutex);
 	ret = btrfs_end_transaction(trans, root);
 	BUG_ON(ret);
-	mutex_unlock(&root->fs_info->fs_mutex);
 
 	kfree(sums);
 
@@ -404,7 +399,6 @@ int btrfs_readpage_io_hook(struct page *page, u64 start, u64 end)
 	    btrfs_test_flag(inode, NODATASUM))
 		return 0;
 
-	mutex_lock(&root->fs_info->fs_mutex);
 	path = btrfs_alloc_path();
 	item = btrfs_lookup_csum(NULL, root, path, inode->i_ino, start, 0);
 	if (IS_ERR(item)) {
@@ -422,7 +416,6 @@ int btrfs_readpage_io_hook(struct page *page, u64 start, u64 end)
 out:
 	if (path)
 		btrfs_free_path(path);
-	mutex_unlock(&root->fs_info->fs_mutex);
 	return ret;
 }
 
@@ -616,7 +609,6 @@ void btrfs_read_locked_inode(struct inode *inode)
 
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
-	mutex_lock(&root->fs_info->fs_mutex);
 	memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
 
 	ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
@@ -662,8 +654,6 @@ void btrfs_read_locked_inode(struct inode *inode)
 	btrfs_free_path(path);
 	inode_item = NULL;
 
-	mutex_unlock(&root->fs_info->fs_mutex);
-
 	switch (inode->i_mode & S_IFMT) {
 	case S_IFREG:
 		inode->i_mapping->a_ops = &btrfs_aops;
@@ -691,9 +681,7 @@ void btrfs_read_locked_inode(struct inode *inode)
 	return;
 
 make_bad:
-	btrfs_release_path(root, path);
 	btrfs_free_path(path);
-	mutex_unlock(&root->fs_info->fs_mutex);
 	make_bad_inode(inode);
 }
 
@@ -758,7 +746,6 @@ int btrfs_update_inode(struct btrfs_trans_handle *trans,
 	btrfs_set_inode_last_trans(trans, inode);
 	ret = 0;
 failed:
-	btrfs_release_path(root, path);
 	btrfs_free_path(path);
 	return ret;
 }
@@ -849,7 +836,6 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
 	unsigned long nr = 0;
 
 	root = BTRFS_I(dir)->root;
-	mutex_lock(&root->fs_info->fs_mutex);
 
 	ret = btrfs_check_free_space(root, 1, 1);
 	if (ret)
@@ -871,7 +857,6 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
 
 	btrfs_end_transaction(trans, root);
 fail:
-	mutex_unlock(&root->fs_info->fs_mutex);
 	btrfs_btree_balance_dirty(root, nr);
 	btrfs_throttle(root);
 	return ret;
@@ -890,7 +875,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 		return -ENOTEMPTY;
 	}
 
-	mutex_lock(&root->fs_info->fs_mutex);
 	ret = btrfs_check_free_space(root, 1, 1);
 	if (ret)
 		goto fail;
@@ -907,7 +891,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 	nr = trans->blocks_used;
 	ret = btrfs_end_transaction(trans, root);
 fail:
-	mutex_unlock(&root->fs_info->fs_mutex);
 	btrfs_btree_balance_dirty(root, nr);
 	btrfs_throttle(root);
 
@@ -1129,7 +1112,6 @@ error:
 		ret = btrfs_del_items(trans, root, path, pending_del_slot,
 				      pending_del_nr);
 	}
-	btrfs_release_path(root, path);
 	btrfs_free_path(path);
 	inode->i_sb->s_dirt = 1;
 	return ret;
@@ -1234,9 +1216,7 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
 		if (attr->ia_size <= hole_start)
 			goto out;
 
-		mutex_lock(&root->fs_info->fs_mutex);
 		err = btrfs_check_free_space(root, 1, 0);
-		mutex_unlock(&root->fs_info->fs_mutex);
 		if (err)
 			goto fail;
 
@@ -1245,7 +1225,6 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
 		lock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
 		hole_size = block_end - hole_start;
 
-		mutex_lock(&root->fs_info->fs_mutex);
 		trans = btrfs_start_transaction(root, 1);
 		btrfs_set_trans_block_group(trans, inode);
 		err = btrfs_drop_extents(trans, root, inode,
@@ -1262,7 +1241,6 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
 			btrfs_check_file(root, inode);
 		}
 		btrfs_end_transaction(trans, root);
-		mutex_unlock(&root->fs_info->fs_mutex);
 		unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
 		if (err)
 			return err;
@@ -1286,7 +1264,6 @@ void btrfs_delete_inode(struct inode *inode)
 	}
 
 	inode->i_size = 0;
-	mutex_lock(&root->fs_info->fs_mutex);
 	trans = btrfs_start_transaction(root, 1);
 
 	btrfs_set_trans_block_group(trans, inode);
@@ -1298,7 +1275,6 @@ void btrfs_delete_inode(struct inode *inode)
 	clear_inode(inode);
 
 	btrfs_end_transaction(trans, root);
-	mutex_unlock(&root->fs_info->fs_mutex);
 	btrfs_btree_balance_dirty(root, nr);
 	btrfs_throttle(root);
 	return;
@@ -1306,7 +1282,6 @@ void btrfs_delete_inode(struct inode *inode)
 no_delete_lock:
 	nr = trans->blocks_used;
 	btrfs_end_transaction(trans, root);
-	mutex_unlock(&root->fs_info->fs_mutex);
 	btrfs_btree_balance_dirty(root, nr);
 	btrfs_throttle(root);
 no_delete:
@@ -1402,7 +1377,6 @@ static int fixup_tree_root_location(struct btrfs_root *root,
 
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
-	mutex_lock(&root->fs_info->fs_mutex);
 
 	*sub_root = btrfs_read_fs_root(root->fs_info, location,
 					dentry->d_name.name,
@@ -1416,7 +1390,6 @@ static int fixup_tree_root_location(struct btrfs_root *root,
 	location->offset = 0;
 
 	btrfs_free_path(path);
-	mutex_unlock(&root->fs_info->fs_mutex);
 	return 0;
 }
 
@@ -1482,9 +1455,7 @@ static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
 	if (dentry->d_name.len > BTRFS_NAME_LEN)
 		return ERR_PTR(-ENAMETOOLONG);
 
-	mutex_lock(&root->fs_info->fs_mutex);
 	ret = btrfs_inode_by_name(dir, dentry, &location);
-	mutex_unlock(&root->fs_info->fs_mutex);
 
 	if (ret < 0)
 		return ERR_PTR(ret);
@@ -1559,7 +1530,6 @@ static int btrfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		filp->f_pos = 1;
 	}
 
-	mutex_lock(&root->fs_info->fs_mutex);
 	key.objectid = inode->i_ino;
 	path = btrfs_alloc_path();
 	path->reada = 2;
@@ -1668,9 +1638,7 @@ read_dir_items:
 nopos:
 	ret = 0;
 err:
-	btrfs_release_path(root, path);
 	btrfs_free_path(path);
-	mutex_unlock(&root->fs_info->fs_mutex);
 	return ret;
 }
 
@@ -1681,11 +1649,9 @@ int btrfs_write_inode(struct inode *inode, int wait)
 	int ret = 0;
 
 	if (wait) {
-		mutex_lock(&root->fs_info->fs_mutex);
 		trans = btrfs_start_transaction(root, 1);
 		btrfs_set_trans_block_group(trans, inode);
 		ret = btrfs_commit_transaction(trans, root);
-		mutex_unlock(&root->fs_info->fs_mutex);
 	}
 	return ret;
 }
@@ -1701,12 +1667,10 @@ void btrfs_dirty_inode(struct inode *inode)
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_trans_handle *trans;
 
-	mutex_lock(&root->fs_info->fs_mutex);
 	trans = btrfs_start_transaction(root, 1);
 	btrfs_set_trans_block_group(trans, inode);
 	btrfs_update_inode(trans, root, inode);
 	btrfs_end_transaction(trans, root);
-	mutex_unlock(&root->fs_info->fs_mutex);
 }
 
 static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
@@ -1874,7 +1838,6 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
 	if (!new_valid_dev(rdev))
 		return -EINVAL;
 
-	mutex_lock(&root->fs_info->fs_mutex);
 	err = btrfs_check_free_space(root, 1, 0);
 	if (err)
 		goto fail;
@@ -1912,8 +1875,6 @@ out_unlock:
 	nr = trans->blocks_used;
 	btrfs_end_transaction(trans, root);
 fail:
-	mutex_unlock(&root->fs_info->fs_mutex);
-
 	if (drop_inode) {
 		inode_dec_link_count(inode);
 		iput(inode);
@@ -1934,7 +1895,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 	unsigned long nr = 0;
 	u64 objectid;
 
-	mutex_lock(&root->fs_info->fs_mutex);
 	err = btrfs_check_free_space(root, 1, 0);
 	if (err)
 		goto fail;
@@ -1980,8 +1940,6 @@ out_unlock:
 	nr = trans->blocks_used;
 	btrfs_end_transaction(trans, root);
 fail:
-	mutex_unlock(&root->fs_info->fs_mutex);
-
 	if (drop_inode) {
 		inode_dec_link_count(inode);
 		iput(inode);
@@ -2009,7 +1967,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 #else
 	inc_nlink(inode);
 #endif
-	mutex_lock(&root->fs_info->fs_mutex);
 	err = btrfs_check_free_space(root, 1, 0);
 	if (err)
 		goto fail;
@@ -2032,8 +1989,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 	nr = trans->blocks_used;
 	btrfs_end_transaction(trans, root);
 fail:
-	mutex_unlock(&root->fs_info->fs_mutex);
-
 	if (drop_inode) {
 		inode_dec_link_count(inode);
 		iput(inode);
@@ -2053,7 +2008,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	u64 objectid = 0;
 	unsigned long nr = 1;
 
-	mutex_lock(&root->fs_info->fs_mutex);
 	err = btrfs_check_free_space(root, 1, 0);
 	if (err)
 		goto out_unlock;
@@ -2106,7 +2060,6 @@ out_fail:
 	btrfs_end_transaction(trans, root);
 
 out_unlock:
-	mutex_unlock(&root->fs_info->fs_mutex);
 	if (drop_on_err)
 		iput(inode);
 	btrfs_btree_balance_dirty(root, nr);
@@ -2199,7 +2152,6 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
 
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
-	mutex_lock(&root->fs_info->fs_mutex);
 
 again:
 	spin_lock(&em_tree->lock);
@@ -2402,7 +2354,6 @@ out:
 		if (!err)
 			err = ret;
 	}
-	mutex_unlock(&root->fs_info->fs_mutex);
 	if (err) {
 		free_extent_map(em);
 		WARN_ON(1);
@@ -2584,9 +2535,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
 	int ret;
 	u64 page_start;
 
-	mutex_lock(&root->fs_info->fs_mutex);
 	ret = btrfs_check_free_space(root, PAGE_CACHE_SIZE, 0);
-	mutex_unlock(&root->fs_info->fs_mutex);
 	if (ret)
 		goto out;
 
@@ -2631,7 +2580,6 @@ static void btrfs_truncate(struct inode *inode)
 
 	btrfs_truncate_page(inode->i_mapping, inode->i_size);
 
-	mutex_lock(&root->fs_info->fs_mutex);
 	trans = btrfs_start_transaction(root, 1);
 	btrfs_set_trans_block_group(trans, inode);
 
@@ -2643,7 +2591,6 @@ static void btrfs_truncate(struct inode *inode)
 
 	ret = btrfs_end_transaction(trans, root);
 	BUG_ON(ret);
-	mutex_unlock(&root->fs_info->fs_mutex);
 	btrfs_btree_balance_dirty(root, nr);
 	btrfs_throttle(root);
 }
@@ -2827,7 +2774,6 @@ static int btrfs_rename(struct inode * old_dir, struct dentry *old_dentry,
 	struct inode *new_inode = new_dentry->d_inode;
 	struct inode *old_inode = old_dentry->d_inode;
 	struct timespec ctime = CURRENT_TIME;
-	struct btrfs_path *path;
 	int ret;
 
 	if (S_ISDIR(old_inode->i_mode) && new_inode &&
@@ -2835,7 +2781,6 @@ static int btrfs_rename(struct inode * old_dir, struct dentry *old_dentry,
 		return -ENOTEMPTY;
 	}
 
-	mutex_lock(&root->fs_info->fs_mutex);
 	ret = btrfs_check_free_space(root, 1, 0);
 	if (ret)
 		goto out_unlock;
@@ -2843,11 +2788,6 @@ static int btrfs_rename(struct inode * old_dir, struct dentry *old_dentry,
 	trans = btrfs_start_transaction(root, 1);
 
 	btrfs_set_trans_block_group(trans, new_dir);
-	path = btrfs_alloc_path();
-	if (!path) {
-		ret = -ENOMEM;
-		goto out_fail;
-	}
 
 	old_dentry->d_inode->i_nlink++;
 	old_dir->i_ctime = old_dir->i_mtime = ctime;
@@ -2869,10 +2809,8 @@ static int btrfs_rename(struct inode * old_dir, struct dentry *old_dentry,
 		goto out_fail;
 
 out_fail:
-	btrfs_free_path(path);
 	btrfs_end_transaction(trans, root);
 out_unlock:
-	mutex_unlock(&root->fs_info->fs_mutex);
 	return ret;
 }
 
@@ -2898,7 +2836,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 	if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
 		return -ENAMETOOLONG;
 
-	mutex_lock(&root->fs_info->fs_mutex);
 	err = btrfs_check_free_space(root, 1, 0);
 	if (err)
 		goto out_fail;
@@ -2979,7 +2916,6 @@ out_unlock:
 	nr = trans->blocks_used;
 	btrfs_end_transaction(trans, root);
 out_fail:
-	mutex_unlock(&root->fs_info->fs_mutex);
 	if (drop_inode) {
 		inode_dec_link_count(inode);
 		iput(inode);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 3fbf74e93dba..6002eb64daf9 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -63,7 +63,6 @@ static noinline int create_subvol(struct btrfs_root *root, char *name,
 	u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
 	unsigned long nr = 1;
 
-	mutex_lock(&root->fs_info->fs_mutex);
 	ret = btrfs_check_free_space(root, 1, 0);
 	if (ret)
 		goto fail_commit;
@@ -164,7 +163,6 @@ fail:
 	if (err && !ret)
 		ret = err;
 fail_commit:
-	mutex_unlock(&root->fs_info->fs_mutex);
 	btrfs_btree_balance_dirty(root, nr);
 	btrfs_throttle(root);
 	return ret;
@@ -181,7 +179,6 @@ static int create_snapshot(struct btrfs_root *root, char *name, int namelen)
 	if (!root->ref_cows)
 		return -EINVAL;
 
-	mutex_lock(&root->fs_info->fs_mutex);
 	ret = btrfs_check_free_space(root, 1, 0);
 	if (ret)
 		goto fail_unlock;
@@ -208,7 +205,6 @@ static int create_snapshot(struct btrfs_root *root, char *name, int namelen)
 	err = btrfs_commit_transaction(trans, root);
 
 fail_unlock:
-	mutex_unlock(&root->fs_info->fs_mutex);
 	btrfs_btree_balance_dirty(root, nr);
 	btrfs_throttle(root);
 	return ret;
@@ -228,9 +224,7 @@ int btrfs_defrag_file(struct file *file)
 	unsigned long i;
 	int ret;
 
-	mutex_lock(&root->fs_info->fs_mutex);
 	ret = btrfs_check_free_space(root, inode->i_size, 0);
-	mutex_unlock(&root->fs_info->fs_mutex);
 	if (ret)
 		return -ENOSPC;
 
@@ -315,7 +309,8 @@ static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg)
 		goto out;
 	}
 
-	mutex_lock(&root->fs_info->fs_mutex);
+	mutex_lock(&root->fs_info->alloc_mutex);
+	mutex_lock(&root->fs_info->chunk_mutex);
 	sizestr = vol_args->name;
 	devstr = strchr(sizestr, ':');
 	if (devstr) {
@@ -385,7 +380,8 @@ static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg)
 	}
 
 out_unlock:
-	mutex_unlock(&root->fs_info->fs_mutex);
+	mutex_unlock(&root->fs_info->chunk_mutex); /* reverse lock order */
+	mutex_unlock(&root->fs_info->alloc_mutex);
 out:
 	kfree(vol_args);
 	return ret;
@@ -428,11 +424,9 @@ static noinline int btrfs_ioctl_snap_create(struct btrfs_root *root,
 	}
 
 	root_dirid = root->fs_info->sb->s_root->d_inode->i_ino,
-	mutex_lock(&root->fs_info->fs_mutex);
 	di = btrfs_lookup_dir_item(NULL, root->fs_info->tree_root,
 			    path, root_dirid,
 			    vol_args->name, namelen, 0);
-	mutex_unlock(&root->fs_info->fs_mutex);
 	btrfs_free_path(path);
 
 	if (di && !IS_ERR(di)) {
@@ -445,10 +439,12 @@ static noinline int btrfs_ioctl_snap_create(struct btrfs_root *root,
 		goto out;
 	}
 
+	mutex_lock(&root->fs_info->drop_mutex);
 	if (root == root->fs_info->tree_root)
 		ret = create_subvol(root, vol_args->name, namelen);
 	else
 		ret = create_snapshot(root, vol_args->name, namelen);
+	mutex_unlock(&root->fs_info->drop_mutex);
 out:
 	kfree(vol_args);
 	return ret;
@@ -461,10 +457,8 @@ static int btrfs_ioctl_defrag(struct file *file)
 
 	switch (inode->i_mode & S_IFMT) {
 	case S_IFDIR:
-		mutex_lock(&root->fs_info->fs_mutex);
 		btrfs_defrag_root(root, 0);
 		btrfs_defrag_root(root->fs_info->extent_root, 0);
-		mutex_unlock(&root->fs_info->fs_mutex);
 		break;
 	case S_IFREG:
 		btrfs_defrag_file(file);
@@ -588,7 +582,6 @@ long btrfs_ioctl_clone(struct file *file, unsigned long src_fd)
 		unlock_extent(&BTRFS_I(src)->io_tree, 0, (u64)-1, GFP_NOFS);
 	}
 
-	mutex_lock(&root->fs_info->fs_mutex);
 	trans = btrfs_start_transaction(root, 0);
 	path = btrfs_alloc_path();
 	if (!path) {
@@ -685,7 +678,6 @@ out:
 	unlock_extent(&BTRFS_I(src)->io_tree, 0, (u64)-1, GFP_NOFS);
 
 	btrfs_end_transaction(trans, root);
-	mutex_unlock(&root->fs_info->fs_mutex);
 
 out_unlock:
 	mutex_unlock(&src->i_mutex);
@@ -711,7 +703,6 @@ long btrfs_ioctl_trans_start(struct file *file)
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
-	mutex_lock(&root->fs_info->fs_mutex);
 	if (file->private_data) {
 		ret = -EINPROGRESS;
 		goto out;
@@ -723,7 +714,6 @@ long btrfs_ioctl_trans_start(struct file *file)
 		ret = -ENOMEM;
 	/*printk(KERN_INFO "btrfs_ioctl_trans_start on %p\n", file);*/
 out:
-	mutex_unlock(&root->fs_info->fs_mutex);
 	return ret;
 }
 
@@ -740,7 +730,6 @@ long btrfs_ioctl_trans_end(struct file *file)
 	struct btrfs_trans_handle *trans;
 	int ret = 0;
 
-	mutex_lock(&root->fs_info->fs_mutex);
 	trans = file->private_data;
 	if (!trans) {
 		ret = -EINVAL;
@@ -749,7 +738,6 @@ long btrfs_ioctl_trans_end(struct file *file)
 	btrfs_end_transaction(trans, root);
 	file->private_data = 0;
 out:
-	mutex_unlock(&root->fs_info->fs_mutex);
 	return ret;
 }
 
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 196d0e280b19..b61ded7a20c9 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -366,12 +366,10 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
 		return 0;
 	}
 	btrfs_clean_old_snapshots(root);
-	mutex_lock(&root->fs_info->fs_mutex);
 	btrfs_defrag_dirty_roots(root->fs_info);
 	trans = btrfs_start_transaction(root, 1);
 	ret = btrfs_commit_transaction(trans, root);
 	sb->s_dirt = 0;
-	mutex_unlock(&root->fs_info->fs_mutex);
 	return ret;
 }
 
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 1ed433a71493..5a1ee0665ae8 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -370,6 +370,7 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
 	struct btrfs_trans_handle *trans;
 	unsigned long nr;
 
+	smp_mb();
 	if (root->defrag_running)
 		return 0;
 	trans = btrfs_start_transaction(root, 1);
@@ -378,16 +379,15 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
 		ret = btrfs_defrag_leaves(trans, root, cacheonly);
 		nr = trans->blocks_used;
 		btrfs_end_transaction(trans, root);
-		mutex_unlock(&info->fs_mutex);
 		btrfs_btree_balance_dirty(info->tree_root, nr);
 		cond_resched();
 
-		mutex_lock(&info->fs_mutex);
 		trans = btrfs_start_transaction(root, 1);
 		if (ret != -EAGAIN)
 			break;
 	}
 	root->defrag_running = 0;
+	smp_mb();
 	radix_tree_tag_clear(&info->fs_roots_radix,
 		     (unsigned long)root->root_key.objectid,
 		     BTRFS_ROOT_DEFRAG_TAG);
@@ -435,14 +435,14 @@ static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
 	while(!list_empty(list)) {
 		struct btrfs_root *root;
 
-		mutex_lock(&tree_root->fs_info->fs_mutex);
 		dirty = list_entry(list->next, struct dirty_root, list);
 		list_del_init(&dirty->list);
 
 		num_bytes = btrfs_root_used(&dirty->root->root_item);
 		root = dirty->latest_root;
-		root->fs_info->throttles++;
+		atomic_inc(&root->fs_info->throttles);
 
+		mutex_lock(&root->fs_info->drop_mutex);
 		while(1) {
 			trans = btrfs_start_transaction(tree_root, 1);
 			ret = btrfs_drop_snapshot(trans, dirty->root);
@@ -459,14 +459,16 @@ static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
 			nr = trans->blocks_used;
 			ret = btrfs_end_transaction(trans, tree_root);
 			BUG_ON(ret);
-			mutex_unlock(&tree_root->fs_info->fs_mutex);
+
+			mutex_unlock(&root->fs_info->drop_mutex);
 			btrfs_btree_balance_dirty(tree_root, nr);
 			cond_resched();
-			mutex_lock(&tree_root->fs_info->fs_mutex);
+			mutex_lock(&root->fs_info->drop_mutex);
 		}
 		BUG_ON(ret);
-		root->fs_info->throttles--;
+		atomic_dec(&root->fs_info->throttles);
 
+		mutex_lock(&root->fs_info->alloc_mutex);
 		num_bytes -= btrfs_root_used(&dirty->root->root_item);
 		bytes_used = btrfs_root_used(&root->root_item);
 		if (num_bytes) {
@@ -474,11 +476,15 @@ static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
 			btrfs_set_root_used(&root->root_item,
 					    bytes_used - num_bytes);
 		}
+		mutex_unlock(&root->fs_info->alloc_mutex);
+
 		ret = btrfs_del_root(trans, tree_root, &dirty->root->root_key);
 		if (ret) {
 			BUG();
 			break;
 		}
+		mutex_unlock(&root->fs_info->drop_mutex);
+
 		nr = trans->blocks_used;
 		ret = btrfs_end_transaction(trans, tree_root);
 		BUG_ON(ret);
@@ -486,7 +492,6 @@ static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
 		free_extent_buffer(dirty->root->node);
 		kfree(dirty->root);
 		kfree(dirty);
-		mutex_unlock(&tree_root->fs_info->fs_mutex);
 
 		btrfs_btree_balance_dirty(tree_root, nr);
 		cond_resched();
@@ -503,7 +508,7 @@ int btrfs_write_ordered_inodes(struct btrfs_trans_handle *trans,
 	u64 objectid = 0;
 	int ret;
 
-	root->fs_info->throttles++;
+	atomic_inc(&root->fs_info->throttles);
 	while(1) {
 		ret = btrfs_find_first_ordered_inode(
 				&cur_trans->ordered_inode_tree,
@@ -512,7 +517,6 @@ int btrfs_write_ordered_inodes(struct btrfs_trans_handle *trans,
 			break;
 
 		mutex_unlock(&root->fs_info->trans_mutex);
-		mutex_unlock(&root->fs_info->fs_mutex);
 
 		if (S_ISREG(inode->i_mode)) {
 			atomic_inc(&BTRFS_I(inode)->ordered_writeback);
@@ -521,7 +525,6 @@ int btrfs_write_ordered_inodes(struct btrfs_trans_handle *trans,
 		}
 		iput(inode);
 
-		mutex_lock(&root->fs_info->fs_mutex);
 		mutex_lock(&root->fs_info->trans_mutex);
 	}
 	while(1) {
@@ -533,7 +536,6 @@ int btrfs_write_ordered_inodes(struct btrfs_trans_handle *trans,
 		if (!ret)
 			break;
 		mutex_unlock(&root->fs_info->trans_mutex);
-		mutex_unlock(&root->fs_info->fs_mutex);
 
 		if (S_ISREG(inode->i_mode)) {
 			atomic_inc(&BTRFS_I(inode)->ordered_writeback);
@@ -543,10 +545,9 @@ int btrfs_write_ordered_inodes(struct btrfs_trans_handle *trans,
 		atomic_dec(&inode->i_count);
 		iput(inode);
 
-		mutex_lock(&root->fs_info->fs_mutex);
 		mutex_lock(&root->fs_info->trans_mutex);
 	}
-	root->fs_info->throttles--;
+	atomic_dec(&root->fs_info->throttles);
 	return 0;
 }
 
@@ -661,7 +662,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 		mutex_unlock(&root->fs_info->trans_mutex);
 		btrfs_end_transaction(trans, root);
 
-		mutex_unlock(&root->fs_info->fs_mutex);
 		ret = wait_for_commit(root, cur_trans);
 		BUG_ON(ret);
 
@@ -669,7 +669,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 		put_transaction(cur_trans);
 		mutex_unlock(&root->fs_info->trans_mutex);
 
-		mutex_lock(&root->fs_info->fs_mutex);
 		return 0;
 	}
 
@@ -687,12 +686,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 					struct btrfs_transaction, list);
 		if (!prev_trans->commit_done) {
 			prev_trans->use_count++;
-			mutex_unlock(&root->fs_info->fs_mutex);
 			mutex_unlock(&root->fs_info->trans_mutex);
 
 			wait_for_commit(root, prev_trans);
 
-			mutex_lock(&root->fs_info->fs_mutex);
 			mutex_lock(&root->fs_info->trans_mutex);
 			put_transaction(prev_trans);
 		}
@@ -709,12 +706,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 		else
 			timeout = 1;
 
-		mutex_unlock(&root->fs_info->fs_mutex);
 		mutex_unlock(&root->fs_info->trans_mutex);
 
 		schedule_timeout(timeout);
 
-		mutex_lock(&root->fs_info->fs_mutex);
 		mutex_lock(&root->fs_info->trans_mutex);
 		finish_wait(&cur_trans->writer_wait, &wait);
 		ret = btrfs_write_ordered_inodes(trans, root);
@@ -755,12 +750,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	btrfs_copy_pinned(root, pinned_copy);
 
 	mutex_unlock(&root->fs_info->trans_mutex);
-	mutex_unlock(&root->fs_info->fs_mutex);
 	ret = btrfs_write_and_wait_transaction(trans, root);
 	BUG_ON(ret);
 	write_ctree_super(trans, root);
 
-	mutex_lock(&root->fs_info->fs_mutex);
 	btrfs_finish_extent_commit(trans, root, pinned_copy);
 	mutex_lock(&root->fs_info->trans_mutex);
 
@@ -781,9 +774,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	kmem_cache_free(btrfs_trans_handle_cachep, trans);
 
 	if (root->fs_info->closing) {
-		mutex_unlock(&root->fs_info->fs_mutex);
 		drop_dirty_roots(root->fs_info->tree_root, &dirty_fs_roots);
-		mutex_lock(&root->fs_info->fs_mutex);
 	}
 	return ret;
 }
@@ -823,7 +814,7 @@ void btrfs_transaction_cleaner(struct work_struct *work)
 	unsigned long delay = HZ * 30;
 	int ret;
 
-	mutex_lock(&root->fs_info->fs_mutex);
+	smp_mb();
 	if (root->fs_info->closing)
 		goto out;
 
@@ -844,7 +835,6 @@ void btrfs_transaction_cleaner(struct work_struct *work)
 	trans = btrfs_start_transaction(root, 1);
 	ret = btrfs_commit_transaction(trans, root);
 out:
-	mutex_unlock(&root->fs_info->fs_mutex);
 	btrfs_clean_old_snapshots(root);
 	btrfs_transaction_queue_work(root, delay);
 }
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index ba3968571024..869864ddcc29 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -866,7 +866,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 	u64 devid;
 	int ret = 0;
 
-	mutex_lock(&root->fs_info->fs_mutex);
+	mutex_lock(&root->fs_info->alloc_mutex);
+	mutex_lock(&root->fs_info->chunk_mutex);
 	mutex_lock(&uuid_mutex);
 
 	all_avail = root->fs_info->avail_data_alloc_bits |
@@ -984,7 +985,8 @@ error_close:
 		close_bdev_excl(bdev);
 out:
 	mutex_unlock(&uuid_mutex);
-	mutex_unlock(&root->fs_info->fs_mutex);
+	mutex_unlock(&root->fs_info->chunk_mutex);
+	mutex_unlock(&root->fs_info->alloc_mutex);
 	return ret;
 }
 
@@ -1003,7 +1005,10 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 	if (!bdev) {
 		return -EIO;
 	}
-	mutex_lock(&root->fs_info->fs_mutex);
+
+	mutex_lock(&root->fs_info->alloc_mutex);
+	mutex_lock(&root->fs_info->chunk_mutex);
+
 	trans = btrfs_start_transaction(root, 1);
 	devices = &root->fs_info->fs_devices->devices;
 	list_for_each(cur, devices) {
@@ -1057,7 +1062,9 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 	root->fs_info->fs_devices->open_devices++;
 out:
 	btrfs_end_transaction(trans, root);
-	mutex_unlock(&root->fs_info->fs_mutex);
+	mutex_unlock(&root->fs_info->chunk_mutex);
+	mutex_unlock(&root->fs_info->alloc_mutex);
+
 	return ret;
 
 out_close_bdev:
@@ -1297,9 +1304,10 @@ int btrfs_balance(struct btrfs_root *dev_root)
 	struct btrfs_key found_key;
 
 
+	BUG(); /* FIXME, needs locking */
+
 	dev_root = dev_root->fs_info->dev_root;
 
-	mutex_lock(&dev_root->fs_info->fs_mutex);
 	/* step one make some room on all the devices */
 	list_for_each(cur, devices) {
 		device = list_entry(cur, struct btrfs_device, dev_list);
@@ -1368,7 +1376,6 @@ int btrfs_balance(struct btrfs_root *dev_root)
 	ret = 0;
 error:
 	btrfs_free_path(path);
-	mutex_unlock(&dev_root->fs_info->fs_mutex);
 	return ret;
 }
 
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index f63488dc2f16..3e9eb91c3c89 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -153,7 +153,6 @@ ssize_t btrfs_xattr_get(struct inode *inode, int name_index,
 		return -ENOMEM;
 	}
 
-	mutex_lock(&root->fs_info->fs_mutex);
 	/* lookup the xattr by name */
 	di = btrfs_lookup_xattr(NULL, root, path, inode->i_ino, name,
 				strlen(name), 0);
@@ -181,7 +180,6 @@ ssize_t btrfs_xattr_get(struct inode *inode, int name_index,
 	ret = btrfs_dir_data_len(leaf, di);
 
 out:
-	mutex_unlock(&root->fs_info->fs_mutex);
 	kfree(name);
 	btrfs_free_path(path);
 	return ret;
@@ -210,7 +208,6 @@ int btrfs_xattr_set(struct inode *inode, int name_index,
 		return -ENOMEM;
 	}
 
-	mutex_lock(&root->fs_info->fs_mutex);
 	trans = btrfs_start_transaction(root, 1);
 	btrfs_set_trans_block_group(trans, inode);
 
@@ -260,7 +257,6 @@ out:
 	}
 
 	btrfs_end_transaction(trans, root);
-	mutex_unlock(&root->fs_info->fs_mutex);
 	kfree(name);
 	btrfs_free_path(path);
 
@@ -297,8 +293,6 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
 		return -ENOMEM;
 	path->reada = 2;
 
-	mutex_lock(&root->fs_info->fs_mutex);
-
 	/* search for our xattrs */
 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 	if (ret < 0)
@@ -379,15 +373,13 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
 	ret = total_size;
 
 err:
-	mutex_unlock(&root->fs_info->fs_mutex);
 	btrfs_free_path(path);
 
 	return ret;
 }
 
 /*
- * delete all the xattrs associated with the inode.  fs_mutex should be
- * held when we come into here
+ * delete all the xattrs associated with the inode.
  */
 int btrfs_delete_xattrs(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root, struct inode *inode)
-- 
cgit v1.2.3


From a74a4b97b61beede185b4b3ad359d7d378b0d312 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 25 Jun 2008 16:01:31 -0400
Subject: Btrfs: Replace the transaction work queue with kthreads

This creates one kthread for commits and one kthread for
deleting old snapshots.  All the work queues are removed.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c       |   5 ++-
 fs/btrfs/ctree.h       |  13 ++----
 fs/btrfs/disk-io.c     | 116 +++++++++++++++++++++++++++++++++++++++++++++----
 fs/btrfs/extent-tree.c |  10 ++---
 fs/btrfs/super.c       |  16 +++----
 fs/btrfs/transaction.c |  72 +-----------------------------
 fs/btrfs/transaction.h |  10 -----
 fs/btrfs/volumes.c     |  12 +++--
 8 files changed, 136 insertions(+), 118 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 5edbcc09b3cc..40f0e0cb804b 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1352,6 +1352,8 @@ again:
 						free_extent_buffer(tmp);
 					goto again;
 				} else {
+					if (tmp)
+						free_extent_buffer(tmp);
 					b = read_node_slot(root, b, slot);
 				}
 			}
@@ -3048,7 +3050,8 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 		free_extent_buffer(c);
 		path->nodes[level] = next;
 		path->slots[level] = 0;
-		path->locks[level] = 1;
+		if (!path->skip_locking)
+			path->locks[level] = 1;
 		if (!level)
 			break;
 		if (level == 1 && path->locks[1] && path->reada)
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index e9bbb53eda63..244fe86bcc55 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -23,7 +23,6 @@
 #include <linux/mm.h>
 #include <linux/highmem.h>
 #include <linux/fs.h>
-#include <linux/workqueue.h>
 #include <linux/completion.h>
 #include <linux/backing-dev.h>
 #include <asm/kmap_types.h>
@@ -519,15 +518,14 @@ struct btrfs_fs_info {
 	struct backing_dev_info bdi;
 	spinlock_t hash_lock;
 	struct mutex trans_mutex;
+	struct mutex transaction_kthread_mutex;
+	struct mutex cleaner_mutex;
 	struct mutex alloc_mutex;
 	struct mutex chunk_mutex;
 	struct mutex drop_mutex;
 	struct list_head trans_list;
 	struct list_head hashers;
 	struct list_head dead_roots;
-	struct list_head end_io_work_list;
-	struct work_struct end_io_work;
-	spinlock_t end_io_work_lock;
 	atomic_t nr_async_submits;
 
 	/*
@@ -543,13 +541,10 @@ struct btrfs_fs_info {
 	struct btrfs_workers workers;
 	struct btrfs_workers endio_workers;
 	struct btrfs_workers submit_workers;
+	struct task_struct *transaction_kthread;
+	struct task_struct *cleaner_kthread;
 	int thread_pool_size;
 
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
-	struct work_struct trans_work;
-#else
-	struct delayed_work trans_work;
-#endif
 	struct kobject super_kobj;
 	struct completion kobj_unregister;
 	int do_barriers;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 52569b57692d..31ca9f89388d 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -16,6 +16,7 @@
  * Boston, MA 021110-1307, USA.
  */
 
+#include <linux/version.h>
 #include <linux/fs.h>
 #include <linux/blkdev.h>
 #include <linux/scatterlist.h>
@@ -24,6 +25,12 @@
 #include <linux/writeback.h>
 #include <linux/buffer_head.h> // for block_sync_page
 #include <linux/workqueue.h>
+#include <linux/kthread.h>
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20)
+# include <linux/freezer.h>
+#else
+# include <linux/sched.h>
+#endif
 #include "crc32c.h"
 #include "ctree.h"
 #include "disk-io.h"
@@ -1100,6 +1107,87 @@ static void end_workqueue_fn(struct btrfs_work *work)
 #endif
 }
 
+/* Cleaner thread: runs btrfs_clean_old_snapshots() each time it wakes. */
+static int cleaner_kthread(void *arg)
+{
+	struct btrfs_root *root = arg;
+
+	do {
+		smp_mb();
+		if (root->fs_info->closing)
+			break;
+
+		vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
+		mutex_lock(&root->fs_info->cleaner_mutex);
+printk("cleaner awake\n");
+		btrfs_clean_old_snapshots(root);
+printk("cleaner done\n");
+		mutex_unlock(&root->fs_info->cleaner_mutex);
+
+		if (freezing(current)) {
+			refrigerator();
+		} else {
+			/* set state first so a stop wakeup isn't lost */
+			set_current_state(TASK_INTERRUPTIBLE);
+			if (!root->fs_info->closing && !kthread_should_stop())
+				schedule();
+			__set_current_state(TASK_RUNNING);
+		}
+	} while (!kthread_should_stop());
+	return 0;
+}
+
+/* Commit thread: commits the running transaction once it is 30s old. */
+static int transaction_kthread(void *arg)
+{
+	struct btrfs_root *root = arg;
+	struct btrfs_trans_handle *trans;
+	struct btrfs_transaction *cur;
+	unsigned long now;
+	unsigned long delay;
+
+	do {
+		smp_mb();
+		if (root->fs_info->closing)
+			break;
+
+		delay = HZ * 30;
+		vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
+		mutex_lock(&root->fs_info->transaction_kthread_mutex);
+
+		mutex_lock(&root->fs_info->trans_mutex);
+		cur = root->fs_info->running_transaction;
+		if (!cur) {
+			mutex_unlock(&root->fs_info->trans_mutex);
+			goto sleep;
+		}
+		now = get_seconds();
+		if (now < cur->start_time || now - cur->start_time < 30) {
+			mutex_unlock(&root->fs_info->trans_mutex);
+			delay = HZ * 5;
+			goto sleep;
+		}
+		mutex_unlock(&root->fs_info->trans_mutex);
+		btrfs_defrag_dirty_roots(root->fs_info);
+		trans = btrfs_start_transaction(root, 1);
+		btrfs_commit_transaction(trans, root);
+sleep:
+		wake_up_process(root->fs_info->cleaner_kthread);
+		mutex_unlock(&root->fs_info->transaction_kthread_mutex);
+
+		if (freezing(current)) {
+			refrigerator();
+		} else {
+			/* set state first so a stop wakeup isn't lost */
+			set_current_state(TASK_INTERRUPTIBLE);
+			if (!root->fs_info->closing && !kthread_should_stop())
+				schedule_timeout(delay);
+			__set_current_state(TASK_RUNNING);
+		}
+	} while (!kthread_should_stop());
+	return 0;
+}
+
 struct btrfs_root *open_ctree(struct super_block *sb,
 			      struct btrfs_fs_devices *fs_devices,
 			      char *options)
@@ -1189,11 +1277,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 			     fs_info->btree_inode->i_mapping, GFP_NOFS);
 	fs_info->do_barriers = 1;
 
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
-	INIT_WORK(&fs_info->trans_work, btrfs_transaction_cleaner, fs_info);
-#else
-	INIT_DELAYED_WORK(&fs_info->trans_work, btrfs_transaction_cleaner);
-#endif
 	BTRFS_I(fs_info->btree_inode)->root = tree_root;
 	memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
 	       sizeof(struct btrfs_key));
@@ -1204,6 +1287,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	mutex_init(&fs_info->drop_mutex);
 	mutex_init(&fs_info->alloc_mutex);
 	mutex_init(&fs_info->chunk_mutex);
+	mutex_init(&fs_info->transaction_kthread_mutex);
+	mutex_init(&fs_info->cleaner_mutex);
 
 #if 0
 	ret = add_hasher(fs_info, "crc32c");
@@ -1247,7 +1332,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	btrfs_start_workers(&fs_info->submit_workers, 1);
 	btrfs_start_workers(&fs_info->endio_workers, fs_info->thread_pool_size);
 
-
 	err = -EINVAL;
 	if (btrfs_super_num_devices(disk_super) > fs_devices->open_devices) {
 		printk("Btrfs: wanted %llu devices, but found %llu\n",
@@ -1341,9 +1425,22 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	fs_info->data_alloc_profile = (u64)-1;
 	fs_info->metadata_alloc_profile = (u64)-1;
 	fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
+	/* kthread_run() reports failure with ERR_PTR(), never NULL */
+	fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
+					       "btrfs-cleaner");
+	if (IS_ERR(fs_info->cleaner_kthread))
+		goto fail_extent_root;
+
+	fs_info->transaction_kthread = kthread_run(transaction_kthread,
+					tree_root, "btrfs-transaction");
+	if (IS_ERR(fs_info->transaction_kthread))
+		goto fail_trans_kthread;
+
 
 	return tree_root;
 
+fail_trans_kthread:
+	kthread_stop(fs_info->cleaner_kthread);
 fail_extent_root:
 	free_extent_buffer(extent_root->node);
 fail_tree_root:
@@ -1562,8 +1659,11 @@ int close_ctree(struct btrfs_root *root)
 	fs_info->closing = 1;
 	smp_mb();
 
-	btrfs_transaction_flush_work(root);
+	kthread_stop(root->fs_info->transaction_kthread);
+	kthread_stop(root->fs_info->cleaner_kthread);
+
 	btrfs_defrag_dirty_roots(root->fs_info);
+	btrfs_clean_old_snapshots(root);
 	trans = btrfs_start_transaction(root, 1);
 	ret = btrfs_commit_transaction(trans, root);
 	/* run commit again to  drop the original snapshot */
@@ -1574,8 +1674,6 @@ int close_ctree(struct btrfs_root *root)
 
 	write_ctree_super(NULL, root);
 
-	btrfs_transaction_flush_work(root);
-
 	if (fs_info->delalloc_bytes) {
 		printk("btrfs: at unmount delalloc count %Lu\n",
 		       fs_info->delalloc_bytes);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 6274f30031db..89cc4f611869 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1216,15 +1216,16 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
 	if (ret == -ENOSPC) {
 printk("space info full %Lu\n", flags);
 		space_info->full = 1;
-		goto out;
+		goto out_unlock;
 	}
 	BUG_ON(ret);
 
 	ret = btrfs_make_block_group(trans, extent_root, 0, flags,
 		     BTRFS_FIRST_CHUNK_TREE_OBJECTID, start, num_bytes);
 	BUG_ON(ret);
-out:
+out_unlock:
 	mutex_unlock(&extent_root->fs_info->chunk_mutex);
+out:
 	return 0;
 }
 
@@ -2274,7 +2275,8 @@ static int noinline walk_down_tree(struct btrfs_trans_handle *trans,
 			free_extent_buffer(next);
 			mutex_unlock(&root->fs_info->alloc_mutex);
 
-			reada_walk_down(root, cur, path->slots[*level]);
+			if (path->slots[*level] == 0)
+				reada_walk_down(root, cur, path->slots[*level]);
 
 			next = read_tree_block(root, bytenr, blocksize,
 					       ptr_gen);
@@ -2446,8 +2448,6 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
 			break;
 		if (wret < 0)
 			ret = wret;
-		ret = -EAGAIN;
-		break;
 	}
 	for (i = 0; i <= orig_level; i++) {
 		if (path->nodes[i]) {
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index b61ded7a20c9..726d6871fa13 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -340,7 +340,6 @@ static int btrfs_fill_super(struct super_block * sb,
 		goto fail_close;
 
 	sb->s_root = root_dentry;
-	btrfs_transaction_queue_work(tree_root, HZ * 30);
 
 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,25)
 	save_mount_options(sb, data);
@@ -416,9 +415,7 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
 		goto error_free_subvol_name;
 
 	bdev = fs_devices->latest_bdev;
-	btrfs_lock_volumes();
 	s = sget(fs_type, btrfs_test_super, set_anon_super, fs_devices);
-	btrfs_unlock_volumes();
 	if (IS_ERR(s))
 		goto error_s;
 
@@ -530,13 +527,15 @@ out:
 static void btrfs_write_super_lockfs(struct super_block *sb)
 {
 	struct btrfs_root *root = btrfs_sb(sb);
-	btrfs_transaction_flush_work(root);
+	mutex_lock(&root->fs_info->transaction_kthread_mutex);
+	mutex_lock(&root->fs_info->cleaner_mutex);
 }
 
 static void btrfs_unlockfs(struct super_block *sb)
 {
 	struct btrfs_root *root = btrfs_sb(sb);
-	btrfs_transaction_queue_work(root, HZ * 30);
+	mutex_unlock(&root->fs_info->cleaner_mutex);
+	mutex_unlock(&root->fs_info->transaction_kthread_mutex);
 }
 
 static struct super_operations btrfs_super_ops = {
@@ -589,10 +588,9 @@ static int __init init_btrfs_fs(void)
 	if (err)
 		return err;
 
-	btrfs_init_transaction_sys();
 	err = btrfs_init_cachep();
 	if (err)
-		goto free_transaction_sys;
+		goto free_sysfs;
 
 	err = extent_io_init();
 	if (err)
@@ -618,15 +616,13 @@ free_extent_io:
 	extent_io_exit();
 free_cachep:
 	btrfs_destroy_cachep();
-free_transaction_sys:
-	btrfs_exit_transaction_sys();
+free_sysfs:
 	btrfs_exit_sysfs();
 	return err;
 }
 
 static void __exit exit_btrfs_fs(void)
 {
-	btrfs_exit_transaction_sys();
 	btrfs_destroy_cachep();
 	extent_map_exit();
 	extent_io_exit();
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 69ed5f85a387..0c53ff775b92 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -29,8 +29,6 @@ static int total_trans = 0;
 extern struct kmem_cache *btrfs_trans_handle_cachep;
 extern struct kmem_cache *btrfs_transaction_cachep;
 
-static struct workqueue_struct *trans_wq;
-
 #define BTRFS_ROOT_TRANS_TAG 0
 #define BTRFS_ROOT_DEFRAG_TAG 1
 
@@ -807,81 +805,15 @@ int btrfs_clean_old_snapshots(struct btrfs_root *root)
 {
 	struct list_head dirty_roots;
 	INIT_LIST_HEAD(&dirty_roots);
-
+again:
 	mutex_lock(&root->fs_info->trans_mutex);
 	list_splice_init(&root->fs_info->dead_roots, &dirty_roots);
 	mutex_unlock(&root->fs_info->trans_mutex);
 
 	if (!list_empty(&dirty_roots)) {
 		drop_dirty_roots(root, &dirty_roots);
+		goto again;
 	}
 	return 0;
 }
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
-void btrfs_transaction_cleaner(void *p)
-#else
-void btrfs_transaction_cleaner(struct work_struct *work)
-#endif
-{
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
-	struct btrfs_fs_info *fs_info = p;
-#else
-	struct btrfs_fs_info *fs_info = container_of(work,
-						     struct btrfs_fs_info,
-						     trans_work.work);
-
-#endif
-	struct btrfs_root *root = fs_info->tree_root;
-	struct btrfs_transaction *cur;
-	struct btrfs_trans_handle *trans;
-	unsigned long now;
-	unsigned long delay = HZ * 30;
-	int ret;
-
-	smp_mb();
-	if (root->fs_info->closing)
-		goto out;
-
-	mutex_lock(&root->fs_info->trans_mutex);
-	cur = root->fs_info->running_transaction;
-	if (!cur) {
-		mutex_unlock(&root->fs_info->trans_mutex);
-		goto out;
-	}
-	now = get_seconds();
-	if (now < cur->start_time || now - cur->start_time < 30) {
-		mutex_unlock(&root->fs_info->trans_mutex);
-		delay = HZ * 5;
-		goto out;
-	}
-	mutex_unlock(&root->fs_info->trans_mutex);
-	btrfs_defrag_dirty_roots(root->fs_info);
-	trans = btrfs_start_transaction(root, 1);
-	ret = btrfs_commit_transaction(trans, root);
-out:
-	btrfs_clean_old_snapshots(root);
-	btrfs_transaction_queue_work(root, delay);
-}
-
-void btrfs_transaction_queue_work(struct btrfs_root *root, int delay)
-{
-	if (!root->fs_info->closing)
-		queue_delayed_work(trans_wq, &root->fs_info->trans_work, delay);
-}
-
-void btrfs_transaction_flush_work(struct btrfs_root *root)
-{
-	cancel_delayed_work(&root->fs_info->trans_work);
-	flush_workqueue(trans_wq);
-}
-
-void __init btrfs_init_transaction_sys(void)
-{
-	trans_wq = create_workqueue("btrfs-transaction");
-}
-
-void btrfs_exit_transaction_sys(void)
-{
-	destroy_workqueue(trans_wq);
-}
 
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 52559b51b181..e1e5a06b65f4 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -82,16 +82,6 @@ int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
 int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
 			    struct btrfs_root *root);
 
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
-void btrfs_transaction_cleaner(void *p);
-#else
-void btrfs_transaction_cleaner(struct work_struct *work);
-#endif
-
-void btrfs_transaction_flush_work(struct btrfs_root *root);
-void btrfs_transaction_queue_work(struct btrfs_root *root, int delay);
-void btrfs_init_transaction_sys(void);
-void btrfs_exit_transaction_sys(void);
 int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest,
 			struct list_head *dead_list);
 int btrfs_defrag_dirty_roots(struct btrfs_fs_info *info);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 869864ddcc29..4e7cee27aab5 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -271,13 +271,17 @@ again:
 	list_for_each(cur, head) {
 		device = list_entry(cur, struct btrfs_device, dev_list);
 		if (!device->in_fs_metadata) {
-			if (device->bdev) {
-				close_bdev_excl(device->bdev);
-				fs_devices->open_devices--;
-			}
+			struct block_device *bdev;
 			list_del(&device->dev_list);
 			list_del(&device->dev_alloc_list);
 			fs_devices->num_devices--;
+			if (device->bdev) {
+				bdev = device->bdev;
+				fs_devices->open_devices--;
+				mutex_unlock(&uuid_mutex);
+				close_bdev_excl(bdev);
+				mutex_lock(&uuid_mutex);
+			}
 			kfree(device->name);
 			kfree(device);
 			goto again;
-- 
cgit v1.2.3


From 7d9eb12c8739e7dc80c78c6b3596f912ecd8f941 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 8 Jul 2008 14:19:17 -0400
Subject: Btrfs: Add locking around volume management (device
 add/remove/balance)

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c       |  4 ---
 fs/btrfs/ctree.h       |  1 +
 fs/btrfs/disk-io.c     |  1 +
 fs/btrfs/extent-tree.c | 73 +++++++++++++++++++++++++++++++++++++-------------
 fs/btrfs/ioctl.c       |  6 ++---
 fs/btrfs/volumes.c     | 58 +++++++++++++++++++++++++++++----------
 6 files changed, 103 insertions(+), 40 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index c6759fc1004a..bbf9bf374066 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1251,10 +1251,6 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
 	WARN_ON(p->nodes[0] != NULL);
 	WARN_ON(cow && root == root->fs_info->extent_root &&
 		!mutex_is_locked(&root->fs_info->alloc_mutex));
-	WARN_ON(root == root->fs_info->chunk_root &&
-		!mutex_is_locked(&root->fs_info->chunk_mutex));
-	WARN_ON(root == root->fs_info->dev_root &&
-		!mutex_is_locked(&root->fs_info->chunk_mutex));
 	if (ins_len < 0)
 		lowest_unlock = 2;
 again:
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index a28796482b4a..f3783dbd9b60 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -523,6 +523,7 @@ struct btrfs_fs_info {
 	struct mutex alloc_mutex;
 	struct mutex chunk_mutex;
 	struct mutex drop_mutex;
+	struct mutex volume_mutex;
 	struct list_head trans_list;
 	struct list_head hashers;
 	struct list_head dead_roots;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 4cdc0b6a2672..8f4c40033e92 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1287,6 +1287,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	mutex_init(&fs_info->chunk_mutex);
 	mutex_init(&fs_info->transaction_kthread_mutex);
 	mutex_init(&fs_info->cleaner_mutex);
+	mutex_init(&fs_info->volume_mutex);
 
 #if 0
 	ret = add_hasher(fs_info, "crc32c");
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 5e0857ffbc35..8ebfa6be0790 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -245,6 +245,7 @@ static int noinline find_search_start(struct btrfs_root *root,
 	u64 search_start = *start_ret;
 	int wrapped = 0;
 
+	WARN_ON(!mutex_is_locked(&root->fs_info->alloc_mutex));
 	total_fs_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
 	free_space_cache = &root->fs_info->free_space_cache;
 
@@ -1242,6 +1243,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 	u64 start;
 	u64 end;
 
+	WARN_ON(!mutex_is_locked(&root->fs_info->alloc_mutex));
 	while(total) {
 		cache = btrfs_lookup_block_group(info, bytenr);
 		if (!cache) {
@@ -1297,6 +1299,7 @@ static int update_pinned_extents(struct btrfs_root *root,
 	struct btrfs_block_group_cache *cache;
 	struct btrfs_fs_info *fs_info = root->fs_info;
 
+	WARN_ON(!mutex_is_locked(&root->fs_info->alloc_mutex));
 	if (pin) {
 		set_extent_dirty(&fs_info->pinned_extents,
 				bytenr, bytenr + num - 1, GFP_NOFS);
@@ -1391,6 +1394,7 @@ static int finish_current_insert(struct btrfs_trans_handle *trans,
 	int level;
 	int err = 0;
 
+	WARN_ON(!mutex_is_locked(&extent_root->fs_info->alloc_mutex));
 	btrfs_set_stack_extent_refs(&extent_item, 1);
 	btrfs_set_key_type(&ins, BTRFS_EXTENT_ITEM_KEY);
 	path = btrfs_alloc_path();
@@ -1437,6 +1441,7 @@ static int pin_down_bytes(struct btrfs_root *root, u64 bytenr, u32 num_bytes,
 {
 	int err = 0;
 
+	WARN_ON(!mutex_is_locked(&root->fs_info->alloc_mutex));
 	if (!pending) {
 		struct extent_buffer *buf;
 		buf = btrfs_find_tree_block(root, bytenr, num_bytes);
@@ -1490,6 +1495,7 @@ static int __free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
 	struct btrfs_extent_item *ei;
 	u32 refs;
 
+	WARN_ON(!mutex_is_locked(&root->fs_info->alloc_mutex));
 	key.objectid = bytenr;
 	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
 	key.offset = num_bytes;
@@ -1619,6 +1625,7 @@ static int del_pending_extents(struct btrfs_trans_handle *trans, struct
 	struct extent_io_tree *pending_del;
 	struct extent_io_tree *pinned_extents;
 
+	WARN_ON(!mutex_is_locked(&extent_root->fs_info->alloc_mutex));
 	pending_del = &extent_root->fs_info->pending_del;
 	pinned_extents = &extent_root->fs_info->pinned_extents;
 
@@ -2428,6 +2435,10 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
 		btrfs_node_key(node, &found_key, path->slots[level]);
 		WARN_ON(memcmp(&found_key, &root_item->drop_progress,
 			       sizeof(found_key)));
+		/*
+		 * unlock our path, this is safe because only this
+		 * function is allowed to delete this snapshot
+		 */
 		for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
 			if (path->nodes[i] && path->locks[i]) {
 				path->locks[i] = 0;
@@ -2611,7 +2622,6 @@ static int find_root_for_ref(struct btrfs_root *root,
 	u64 root_search_start = BTRFS_FS_TREE_OBJECTID;
 	u64 found_bytenr;
 	int ret;
-	int i;
 
 	root_location.offset = (u64)-1;
 	root_location.type = BTRFS_ROOT_ITEM_KEY;
@@ -2635,12 +2645,6 @@ static int find_root_for_ref(struct btrfs_root *root,
 				found_bytenr = path->nodes[level]->start;
 		}
 
-		for (i = level; i < BTRFS_MAX_LEVEL; i++) {
-			if (!path->nodes[i])
-				break;
-			free_extent_buffer(path->nodes[i]);
-			path->nodes[i] = NULL;
-		}
 		btrfs_release_path(cur_root, path);
 
 		if (found_bytenr == bytenr) {
@@ -2689,6 +2693,8 @@ static int noinline relocate_one_reference(struct btrfs_root *extent_root,
 	int ret;
 	int level;
 
+	WARN_ON(!mutex_is_locked(&extent_root->fs_info->alloc_mutex));
+
 	ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
 			     struct btrfs_extent_ref);
 	ref_root = btrfs_ref_root(path->nodes[0], ref);
@@ -2707,6 +2713,7 @@ static int noinline relocate_one_reference(struct btrfs_root *extent_root,
 	found_root = btrfs_read_fs_root_no_name(extent_root->fs_info,
 						&root_location);
 	BUG_ON(!found_root);
+	mutex_unlock(&extent_root->fs_info->alloc_mutex);
 
 	if (ref_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
 		found_key.objectid = ref_objectid;
@@ -2748,9 +2755,9 @@ static int noinline relocate_one_reference(struct btrfs_root *extent_root,
 		/* this can happen if the reference is not against
 		 * the latest version of the tree root
 		 */
-		if (is_bad_inode(inode)) {
+		if (is_bad_inode(inode))
 			goto out;
-		}
+
 		*last_file_objectid = inode->i_ino;
 		*last_file_root = found_root->root_key.objectid;
 		*last_file_offset = ref_offset;
@@ -2760,7 +2767,7 @@ static int noinline relocate_one_reference(struct btrfs_root *extent_root,
 	} else {
 		struct btrfs_trans_handle *trans;
 		struct extent_buffer *eb;
-		int i;
+		int needs_lock = 0;
 
 		eb = read_tree_block(found_root, extent_key->objectid,
 				     extent_key->offset, 0);
@@ -2782,26 +2789,40 @@ static int noinline relocate_one_reference(struct btrfs_root *extent_root,
 		if (ret)
 			goto out;
 
+		/*
+		 * right here almost anything could happen to our key,
+		 * but that's ok.  The cow below will either relocate it
+		 * or someone else will have relocated it.  Either way,
+		 * it is in a different spot than it was before and
+		 * we're happy.
+		 */
+
 		trans = btrfs_start_transaction(found_root, 1);
 
+		if (found_root == extent_root->fs_info->extent_root ||
+		    found_root == extent_root->fs_info->chunk_root ||
+		    found_root == extent_root->fs_info->dev_root) {
+			needs_lock = 1;
+			mutex_lock(&extent_root->fs_info->alloc_mutex);
+		}
+
 		path->lowest_level = level;
 		path->reada = 2;
 		ret = btrfs_search_slot(trans, found_root, &found_key, path,
 					0, 1);
 		path->lowest_level = 0;
-		for (i = level; i < BTRFS_MAX_LEVEL; i++) {
-			if (!path->nodes[i])
-				break;
-			free_extent_buffer(path->nodes[i]);
-			path->nodes[i] = NULL;
-		}
 		btrfs_release_path(found_root, path);
+
 		if (found_root == found_root->fs_info->extent_root)
 			btrfs_extent_post_op(trans, found_root);
+		if (needs_lock)
+			mutex_unlock(&extent_root->fs_info->alloc_mutex);
+
 		btrfs_end_transaction(trans, found_root);
-	}
 
+	}
 out:
+	mutex_lock(&extent_root->fs_info->alloc_mutex);
 	return 0;
 }
 
@@ -2943,7 +2964,10 @@ int __alloc_chunk_for_shrink(struct btrfs_root *root,
 
 	if (btrfs_block_group_used(&shrink_block_group->item) > 0) {
 
+		mutex_unlock(&root->fs_info->alloc_mutex);
 		trans = btrfs_start_transaction(root, 1);
+		mutex_lock(&root->fs_info->alloc_mutex);
+
 		new_alloc_flags = update_block_group_flags(root,
 						   shrink_block_group->flags);
 		if (new_alloc_flags != shrink_block_group->flags) {
@@ -2954,7 +2978,10 @@ int __alloc_chunk_for_shrink(struct btrfs_root *root,
 		}
 		do_chunk_alloc(trans, root->fs_info->extent_root,
 			       calc + 2 * 1024 * 1024, new_alloc_flags, force);
+
+		mutex_unlock(&root->fs_info->alloc_mutex);
 		btrfs_end_transaction(trans, root);
+		mutex_lock(&root->fs_info->alloc_mutex);
 	}
 	return 0;
 }
@@ -3031,9 +3058,9 @@ again:
 		if (ret < 0)
 			goto out;
 
+next:
 		leaf = path->nodes[0];
 		nritems = btrfs_header_nritems(leaf);
-next:
 		if (path->slots[0] >= nritems) {
 			ret = btrfs_next_leaf(root, path);
 			if (ret < 0)
@@ -3083,6 +3110,7 @@ next:
 		printk("btrfs relocate found %llu last extent was %llu\n",
 		       (unsigned long long)total_found,
 		       (unsigned long long)found_key.objectid);
+		mutex_unlock(&root->fs_info->alloc_mutex);
 		trans = btrfs_start_transaction(tree_root, 1);
 		btrfs_commit_transaction(trans, tree_root);
 
@@ -3090,6 +3118,7 @@ next:
 
 		trans = btrfs_start_transaction(tree_root, 1);
 		btrfs_commit_transaction(trans, tree_root);
+		mutex_lock(&root->fs_info->alloc_mutex);
 		goto again;
 	}
 
@@ -3097,7 +3126,10 @@ next:
 	 * we've freed all the extents, now remove the block
 	 * group item from the tree
 	 */
+	mutex_unlock(&root->fs_info->alloc_mutex);
+
 	trans = btrfs_start_transaction(root, 1);
+	mutex_lock(&root->fs_info->alloc_mutex);
 	memcpy(&key, &shrink_block_group->key, sizeof(key));
 
 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
@@ -3119,8 +3151,12 @@ next:
 	kfree(shrink_block_group);
 
 	btrfs_del_item(trans, root, path);
+	btrfs_release_path(root, path);
+	mutex_unlock(&root->fs_info->alloc_mutex);
 	btrfs_commit_transaction(trans, root);
 
+	mutex_lock(&root->fs_info->alloc_mutex);
+
 	/* the code to unpin extents might set a few bits in the free
 	 * space cache for this range again
 	 */
@@ -3263,6 +3299,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 	struct btrfs_block_group_cache *cache;
 	struct extent_io_tree *block_group_cache;
 
+	WARN_ON(!mutex_is_locked(&root->fs_info->alloc_mutex));
 	extent_root = root->fs_info->extent_root;
 	block_group_cache = &root->fs_info->block_group_cache;
 
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 026039a2ac58..83f17a5cbd6a 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -307,8 +307,7 @@ static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg)
 		goto out;
 	}
 
-	mutex_lock(&root->fs_info->alloc_mutex);
-	mutex_lock(&root->fs_info->chunk_mutex);
+	mutex_lock(&root->fs_info->volume_mutex);
 	sizestr = vol_args->name;
 	devstr = strchr(sizestr, ':');
 	if (devstr) {
@@ -378,8 +377,7 @@ static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg)
 	}
 
 out_unlock:
-	mutex_lock(&root->fs_info->alloc_mutex);
-	mutex_lock(&root->fs_info->chunk_mutex);
+	mutex_unlock(&root->fs_info->volume_mutex);
 out:
 	kfree(vol_args);
 	return ret;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 4e7cee27aab5..5e6ee7a6f738 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -56,6 +56,18 @@ void btrfs_unlock_volumes(void)
 	mutex_unlock(&uuid_mutex);
 }
 
+static void lock_chunks(struct btrfs_root *root)
+{
+	mutex_lock(&root->fs_info->alloc_mutex);
+	mutex_lock(&root->fs_info->chunk_mutex);
+}
+
+static void unlock_chunks(struct btrfs_root *root)
+{
+	mutex_unlock(&root->fs_info->alloc_mutex);
+	mutex_unlock(&root->fs_info->chunk_mutex);
+}
+
 int btrfs_cleanup_fs_uuids(void)
 {
 	struct btrfs_fs_devices *fs_devices;
@@ -822,6 +834,7 @@ static int btrfs_rm_dev_item(struct btrfs_root *root,
 	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
 	key.type = BTRFS_DEV_ITEM_KEY;
 	key.offset = device->devid;
+	lock_chunks(root);
 
 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
 	if (ret < 0)
@@ -856,6 +869,7 @@ static int btrfs_rm_dev_item(struct btrfs_root *root,
 				    total_bytes - 1);
 out:
 	btrfs_free_path(path);
+	unlock_chunks(root);
 	btrfs_commit_transaction(trans, root);
 	return ret;
 }
@@ -870,9 +884,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 	u64 devid;
 	int ret = 0;
 
-	mutex_lock(&root->fs_info->alloc_mutex);
-	mutex_lock(&root->fs_info->chunk_mutex);
 	mutex_lock(&uuid_mutex);
+	mutex_lock(&root->fs_info->volume_mutex);
 
 	all_avail = root->fs_info->avail_data_alloc_bits |
 		root->fs_info->avail_system_alloc_bits |
@@ -988,9 +1001,8 @@ error_close:
 	if (bdev)
 		close_bdev_excl(bdev);
 out:
+	mutex_unlock(&root->fs_info->volume_mutex);
 	mutex_unlock(&uuid_mutex);
-	mutex_unlock(&root->fs_info->chunk_mutex);
-	mutex_unlock(&root->fs_info->alloc_mutex);
 	return ret;
 }
 
@@ -1010,10 +1022,10 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 		return -EIO;
 	}
 
-	mutex_lock(&root->fs_info->alloc_mutex);
-	mutex_lock(&root->fs_info->chunk_mutex);
+	mutex_lock(&root->fs_info->volume_mutex);
 
 	trans = btrfs_start_transaction(root, 1);
+	lock_chunks(root);
 	devices = &root->fs_info->fs_devices->devices;
 	list_for_each(cur, devices) {
 		device = list_entry(cur, struct btrfs_device, dev_list);
@@ -1065,9 +1077,9 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 	root->fs_info->fs_devices->num_devices++;
 	root->fs_info->fs_devices->open_devices++;
 out:
+	unlock_chunks(root);
 	btrfs_end_transaction(trans, root);
-	mutex_unlock(&root->fs_info->chunk_mutex);
-	mutex_unlock(&root->fs_info->alloc_mutex);
+	mutex_unlock(&root->fs_info->volume_mutex);
 
 	return ret;
 
@@ -1122,7 +1134,7 @@ out:
 	return ret;
 }
 
-int btrfs_grow_device(struct btrfs_trans_handle *trans,
+static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
 		      struct btrfs_device *device, u64 new_size)
 {
 	struct btrfs_super_block *super_copy =
@@ -1134,6 +1146,16 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans,
 	return btrfs_update_device(trans, device);
 }
 
+int btrfs_grow_device(struct btrfs_trans_handle *trans,
+		      struct btrfs_device *device, u64 new_size)
+{
+	int ret;
+	lock_chunks(device->dev_root);
+	ret = __btrfs_grow_device(trans, device, new_size);
+	unlock_chunks(device->dev_root);
+	return ret;
+}
+
 static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
 			    struct btrfs_root *root,
 			    u64 chunk_tree, u64 chunk_objectid,
@@ -1234,6 +1256,8 @@ int btrfs_relocate_chunk(struct btrfs_root *root,
 	trans = btrfs_start_transaction(root, 1);
 	BUG_ON(!trans);
 
+	lock_chunks(root);
+
 	/*
 	 * step two, delete the device extents and the
 	 * chunk tree entries
@@ -1278,6 +1302,7 @@ int btrfs_relocate_chunk(struct btrfs_root *root,
 	/* once for us */
 	free_extent_map(em);
 
+	unlock_chunks(root);
 	btrfs_end_transaction(trans, root);
 	return 0;
 }
@@ -1308,8 +1333,7 @@ int btrfs_balance(struct btrfs_root *dev_root)
 	struct btrfs_key found_key;
 
 
-	BUG(); /* FIXME, needs locking */
-
+	mutex_lock(&dev_root->fs_info->volume_mutex);
 	dev_root = dev_root->fs_info->dev_root;
 
 	/* step one make some room on all the devices */
@@ -1355,13 +1379,14 @@ int btrfs_balance(struct btrfs_root *dev_root)
 
 		ret = btrfs_previous_item(chunk_root, path, 0,
 					  BTRFS_CHUNK_ITEM_KEY);
-		if (ret) {
+		if (ret)
 			break;
-		}
+
 		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
 				      path->slots[0]);
 		if (found_key.objectid != key.objectid)
 			break;
+
 		chunk = btrfs_item_ptr(path->nodes[0],
 				       path->slots[0],
 				       struct btrfs_chunk);
@@ -1370,16 +1395,17 @@ int btrfs_balance(struct btrfs_root *dev_root)
 		if (key.offset == 0)
 			break;
 
+		btrfs_release_path(chunk_root, path);
 		ret = btrfs_relocate_chunk(chunk_root,
 					   chunk_root->root_key.objectid,
 					   found_key.objectid,
 					   found_key.offset);
 		BUG_ON(ret);
-		btrfs_release_path(chunk_root, path);
 	}
 	ret = 0;
 error:
 	btrfs_free_path(path);
+	mutex_unlock(&dev_root->fs_info->volume_mutex);
 	return ret;
 }
 
@@ -1419,14 +1445,18 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
 
 	path->reada = 2;
 
+	lock_chunks(root);
+
 	device->total_bytes = new_size;
 	ret = btrfs_update_device(trans, device);
 	if (ret) {
+		unlock_chunks(root);
 		btrfs_end_transaction(trans, root);
 		goto done;
 	}
 	WARN_ON(diff > old_total);
 	btrfs_set_super_total_bytes(super_copy, old_total - diff);
+	unlock_chunks(root);
 	btrfs_end_transaction(trans, root);
 
 	key.objectid = device->devid;
-- 
cgit v1.2.3


From bcc63abbf3e9bf948a1b0129b3e6120ec7d7f698 Mon Sep 17 00:00:00 2001
From: Yan <zheng.yan@oracle.com>
Date: Wed, 30 Jul 2008 16:29:20 -0400
Subject: Btrfs: implement memory reclaim for leaf reference cache

The memory reclaiming issue happens when a snapshot exists. In that
case, some cache entries may not be used while an old snapshot is being
dropped, so they will remain in the cache until umount.

The patch adds a field to struct btrfs_leaf_ref to record the creation time.
It also links all dead roots of a given snapshot together in order of creation
time. After an old snapshot has been completely dropped, we check the dead
root list and remove all cache entries created before the oldest dead root in
the list.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c       |  1 -
 fs/btrfs/ctree.h       |  3 ++-
 fs/btrfs/dir-item.c    |  1 -
 fs/btrfs/disk-io.c     |  5 +++--
 fs/btrfs/extent-tree.c | 18 +++++++++---------
 fs/btrfs/extent_io.c   |  1 -
 fs/btrfs/file-item.c   |  1 -
 fs/btrfs/file.c        |  1 -
 fs/btrfs/inode.c       | 22 +++++++++++-----------
 fs/btrfs/locking.c     |  1 -
 fs/btrfs/print-tree.c  |  1 -
 fs/btrfs/ref-cache.c   | 48 +++++++++++++++++++++++++-----------------------
 fs/btrfs/ref-cache.h   | 11 ++++++-----
 fs/btrfs/transaction.c | 40 ++++++++++++++++++++++++++++++----------
 fs/btrfs/volumes.c     |  1 -
 15 files changed, 86 insertions(+), 69 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 245eb00435dd..c4792062dd53 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -3275,4 +3275,3 @@ int btrfs_previous_item(struct btrfs_root *root,
 	}
 	return 1;
 }
-
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 83422088c629..be16cd49ef69 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -666,7 +666,8 @@ struct btrfs_root {
 	/* the dirty list is only used by non-reference counted roots */
 	struct list_head dirty_list;
 
-	spinlock_t orphan_lock;
+	spinlock_t list_lock;
+	struct list_head dead_list;
 	struct list_head orphan_list;
 };
 
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index eb4dd3d75cf9..125094617fe8 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -340,4 +340,3 @@ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
 	}
 	return 0;
 }
-
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index ec1ba8ddb35f..e826730d750f 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -735,8 +735,9 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 
 	INIT_LIST_HEAD(&root->dirty_list);
 	INIT_LIST_HEAD(&root->orphan_list);
+	INIT_LIST_HEAD(&root->dead_list);
 	spin_lock_init(&root->node_lock);
-	spin_lock_init(&root->orphan_lock);
+	spin_lock_init(&root->list_lock);
 	mutex_init(&root->objectid_mutex);
 
 	btrfs_leaf_ref_tree_init(&root->ref_tree_struct);
@@ -1717,7 +1718,7 @@ int close_ctree(struct btrfs_root *root)
 		printk("btrfs: at umount reference cache size %Lu\n",
 			fs_info->total_ref_cache_size);
 	}
-	
+
 	if (fs_info->extent_root->node)
 		free_extent_buffer(fs_info->extent_root->node);
 
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index fe1ddbd2bfd6..37ca8df30c30 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -867,8 +867,8 @@ static int get_reference_status(struct btrfs_root *root, u64 bytenr,
 		/*
 		 * For (parent_gen > 0 && parent_gen > ref_gen):
 		 *
-		 * we reach here through the oldest root, therefore 
-		 * all other reference from same snapshot should have 
+		 * we reach here through the oldest root, therefore
+		 * all other reference from same snapshot should have
 		 * a larger generation.
 		 */
 		if ((root_objectid != btrfs_ref_root(leaf, ref_item)) ||
@@ -954,7 +954,7 @@ int btrfs_cross_ref_exists(struct btrfs_root *root,
 			if (!eb)
 				continue;
 			extent_start = eb->start;
-		} else 
+		} else
 			extent_start = bytenr;
 
 		ret = get_reference_status(root, extent_start, ref_generation,
@@ -1048,7 +1048,7 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		struct btrfs_leaf_ref *ref;
 		struct btrfs_extent_info *info;
 
-		ref = btrfs_alloc_leaf_ref(nr_file_extents);
+		ref = btrfs_alloc_leaf_ref(root, nr_file_extents);
 		if (!ref) {
 			WARN_ON(1);
 			goto out;
@@ -1059,7 +1059,7 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		ref->generation = btrfs_header_generation(buf);
 		ref->nritems = nr_file_extents;
 		info = ref->extents;
-		
+
 		for (i = 0; nr_file_extents > 0 && i < nritems; i++) {
 			u64 disk_bytenr;
 			btrfs_item_key_to_cpu(buf, &key, i);
@@ -1085,7 +1085,7 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		BUG_ON(!root->ref_tree);
 		ret = btrfs_add_leaf_ref(root, ref);
 		WARN_ON(ret);
-		btrfs_free_leaf_ref(ref);
+		btrfs_free_leaf_ref(root, ref);
 	}
 out:
 	return 0;
@@ -2316,7 +2316,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 }
 
 static int noinline drop_leaf_ref_no_cache(struct btrfs_trans_handle *trans,
-				  	   struct btrfs_root *root,
+					   struct btrfs_root *root,
 					   struct extent_buffer *leaf)
 {
 	u64 leaf_owner;
@@ -2367,7 +2367,7 @@ static int noinline drop_leaf_ref_no_cache(struct btrfs_trans_handle *trans,
 }
 
 static int noinline drop_leaf_ref(struct btrfs_trans_handle *trans,
-				  	 struct btrfs_root *root,
+					 struct btrfs_root *root,
 					 struct btrfs_leaf_ref *ref)
 {
 	int i;
@@ -2521,7 +2521,7 @@ static int noinline walk_down_tree(struct btrfs_trans_handle *trans,
 				ret = drop_leaf_ref(trans, root, ref);
 				BUG_ON(ret);
 				btrfs_remove_leaf_ref(root, ref);
-				btrfs_free_leaf_ref(ref);
+				btrfs_free_leaf_ref(root, ref);
 				*level = 0;
 				break;
 			}
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 964ec1622d66..5368e3b6eb96 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3497,4 +3497,3 @@ out:
 	return ret;
 }
 EXPORT_SYMBOL(try_release_extent_buffer);
-
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index afe42d00b5a6..2311061f070e 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -422,4 +422,3 @@ int btrfs_csum_truncate(struct btrfs_trans_handle *trans,
 	BUG_ON(ret);
 	return ret;
 }
-
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index ded5281f8463..412ab4a26382 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1095,4 +1095,3 @@ struct file_operations btrfs_file_operations = {
 	.compat_ioctl	= btrfs_ioctl,
 #endif
 };
-
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 3aa82cec6bf7..7af8be076ee5 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -835,17 +835,17 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	int ret = 0;
 
-	spin_lock(&root->orphan_lock);
+	spin_lock(&root->list_lock);
 
 	/* already on the orphan list, we're good */
 	if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
-		spin_unlock(&root->orphan_lock);
+		spin_unlock(&root->list_lock);
 		return 0;
 	}
 
 	list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
 
-	spin_unlock(&root->orphan_lock);
+	spin_unlock(&root->list_lock);
 
 	/*
 	 * insert an orphan item to track this unlinked/truncated file
@@ -864,20 +864,20 @@ int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode)
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	int ret = 0;
 
-	spin_lock(&root->orphan_lock);
+	spin_lock(&root->list_lock);
 
 	if (list_empty(&BTRFS_I(inode)->i_orphan)) {
-		spin_unlock(&root->orphan_lock);
+		spin_unlock(&root->list_lock);
 		return 0;
 	}
 
 	list_del_init(&BTRFS_I(inode)->i_orphan);
 	if (!trans) {
-		spin_unlock(&root->orphan_lock);
+		spin_unlock(&root->list_lock);
 		return 0;
 	}
 
-	spin_unlock(&root->orphan_lock);
+	spin_unlock(&root->list_lock);
 
 	ret = btrfs_del_orphan_item(trans, root, inode->i_ino);
 
@@ -973,9 +973,9 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
 		 * add this inode to the orphan list so btrfs_orphan_del does
 		 * the proper thing when we hit it
 		 */
-		spin_lock(&root->orphan_lock);
+		spin_lock(&root->list_lock);
 		list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
-		spin_unlock(&root->orphan_lock);
+		spin_unlock(&root->list_lock);
 
 		/*
 		 * if this is a bad inode, means we actually succeeded in
@@ -3269,13 +3269,13 @@ void btrfs_destroy_inode(struct inode *inode)
 	    BTRFS_I(inode)->i_default_acl != BTRFS_ACL_NOT_CACHED)
 		posix_acl_release(BTRFS_I(inode)->i_default_acl);
 
-	spin_lock(&BTRFS_I(inode)->root->orphan_lock);
+	spin_lock(&BTRFS_I(inode)->root->list_lock);
 	if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
 		printk(KERN_ERR "BTRFS: inode %lu: inode still on the orphan"
 		       " list\n", inode->i_ino);
 		dump_stack();
 	}
-	spin_unlock(&BTRFS_I(inode)->root->orphan_lock);
+	spin_unlock(&BTRFS_I(inode)->root->list_lock);
 
 	while(1) {
 		ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index d617c29787fa..d43e14c7471a 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -56,4 +56,3 @@ int btrfs_tree_locked(struct extent_buffer *eb)
 {
 	return mutex_is_locked(&eb->mutex);
 }
-
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 14d863720302..f1374d597a17 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -198,4 +198,3 @@ void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c)
 		free_extent_buffer(next);
 	}
 }
-
diff --git a/fs/btrfs/ref-cache.c b/fs/btrfs/ref-cache.c
index ec9587784a3d..272b9890c982 100644
--- a/fs/btrfs/ref-cache.c
+++ b/fs/btrfs/ref-cache.c
@@ -21,12 +21,18 @@
 #include "ref-cache.h"
 #include "transaction.h"
 
-struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(int nr_extents)
+struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(struct btrfs_root *root,
+					    int nr_extents)
 {
 	struct btrfs_leaf_ref *ref;
+	size_t size = btrfs_leaf_ref_size(nr_extents);
 
-	ref = kmalloc(btrfs_leaf_ref_size(nr_extents), GFP_NOFS);
+	ref = kmalloc(size, GFP_NOFS);
 	if (ref) {
+		spin_lock(&root->fs_info->ref_cache_lock);
+		root->fs_info->total_ref_cache_size += size;
+		spin_unlock(&root->fs_info->ref_cache_lock);
+
 		memset(ref, 0, sizeof(*ref));
 		atomic_set(&ref->usage, 1);
 		INIT_LIST_HEAD(&ref->list);
@@ -34,14 +40,20 @@ struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(int nr_extents)
 	return ref;
 }
 
-void btrfs_free_leaf_ref(struct btrfs_leaf_ref *ref)
+void btrfs_free_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref)
 {
 	if (!ref)
 		return;
 	WARN_ON(atomic_read(&ref->usage) == 0);
 	if (atomic_dec_and_test(&ref->usage)) {
+		size_t size = btrfs_leaf_ref_size(ref->nritems);
+
 		BUG_ON(ref->in_tree);
 		kfree(ref);
+
+		spin_lock(&root->fs_info->ref_cache_lock);
+		root->fs_info->total_ref_cache_size -= size;
+		spin_unlock(&root->fs_info->ref_cache_lock);
 	}
 }
 
@@ -64,7 +76,7 @@ static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr,
 		else
 			return parent;
 	}
-	
+
 	entry = rb_entry(node, struct btrfs_leaf_ref, rb_node);
 	entry->in_tree = 1;
 	rb_link_node(node, parent, p);
@@ -91,9 +103,8 @@ static struct rb_node *tree_search(struct rb_root *root, u64 bytenr)
 	return NULL;
 }
 
-int btrfs_remove_leaf_refs(struct btrfs_root *root)
+int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 max_root_gen)
 {
-	struct rb_node *rb;
 	struct btrfs_leaf_ref *ref = NULL;
 	struct btrfs_leaf_ref_tree *tree = root->ref_tree;
 
@@ -101,17 +112,18 @@ int btrfs_remove_leaf_refs(struct btrfs_root *root)
 		return 0;
 
 	spin_lock(&tree->lock);
-	while(!btrfs_leaf_ref_tree_empty(tree)) {
-		rb = rb_first(&tree->root);
-		ref = rb_entry(rb, struct btrfs_leaf_ref, rb_node);
+	while(!list_empty(&tree->list)) {
+		ref = list_entry(tree->list.next, struct btrfs_leaf_ref, list);
+		BUG_ON(!ref->in_tree);
+		if (ref->root_gen > max_root_gen)
+			break;
+
 		rb_erase(&ref->rb_node, &tree->root);
 		ref->in_tree = 0;
 		list_del_init(&ref->list);
 
 		spin_unlock(&tree->lock);
-
-		btrfs_free_leaf_ref(ref);
-
+		btrfs_free_leaf_ref(root, ref);
 		cond_resched();
 		spin_lock(&tree->lock);
 	}
@@ -143,7 +155,6 @@ int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref)
 {
 	int ret = 0;
 	struct rb_node *rb;
-	size_t size = btrfs_leaf_ref_size(ref->nritems);
 	struct btrfs_leaf_ref_tree *tree = root->ref_tree;
 
 	spin_lock(&tree->lock);
@@ -151,9 +162,6 @@ int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref)
 	if (rb) {
 		ret = -EEXIST;
 	} else {
-		spin_lock(&root->fs_info->ref_cache_lock);
-		root->fs_info->total_ref_cache_size += size;
-		spin_unlock(&root->fs_info->ref_cache_lock);
 		atomic_inc(&ref->usage);
 		list_add_tail(&ref->list, &tree->list);
 	}
@@ -163,15 +171,10 @@ int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref)
 
 int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref)
 {
-	size_t size = btrfs_leaf_ref_size(ref->nritems);
 	struct btrfs_leaf_ref_tree *tree = root->ref_tree;
 
 	BUG_ON(!ref->in_tree);
 	spin_lock(&tree->lock);
-	
-	spin_lock(&root->fs_info->ref_cache_lock);
-	root->fs_info->total_ref_cache_size -= size;
-	spin_unlock(&root->fs_info->ref_cache_lock);
 
 	rb_erase(&ref->rb_node, &tree->root);
 	ref->in_tree = 0;
@@ -179,7 +182,6 @@ int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref)
 
 	spin_unlock(&tree->lock);
 
-	btrfs_free_leaf_ref(ref);
+	btrfs_free_leaf_ref(root, ref);
 	return 0;
 }
-
diff --git a/fs/btrfs/ref-cache.h b/fs/btrfs/ref-cache.h
index 823c049f72f1..c361b321c0c3 100644
--- a/fs/btrfs/ref-cache.h
+++ b/fs/btrfs/ref-cache.h
@@ -30,6 +30,7 @@ struct btrfs_leaf_ref {
 	int in_tree;
 	atomic_t usage;
 
+	u64 root_gen;
 	u64 bytenr;
 	u64 owner;
 	u64 generation;
@@ -41,14 +42,13 @@ struct btrfs_leaf_ref {
 
 static inline size_t btrfs_leaf_ref_size(int nr_extents)
 {
-	return sizeof(struct btrfs_leaf_ref) + 
+	return sizeof(struct btrfs_leaf_ref) +
 	       sizeof(struct btrfs_extent_info) * nr_extents;
 }
 
 static inline void btrfs_leaf_ref_tree_init(struct btrfs_leaf_ref_tree *tree)
 {
 	tree->root.rb_node = NULL;
-	tree->last = NULL;
 	INIT_LIST_HEAD(&tree->list);
 	spin_lock_init(&tree->lock);
 }
@@ -59,12 +59,13 @@ static inline int btrfs_leaf_ref_tree_empty(struct btrfs_leaf_ref_tree *tree)
 }
 
 void btrfs_leaf_ref_tree_init(struct btrfs_leaf_ref_tree *tree);
-struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(int nr_extents);
-void btrfs_free_leaf_ref(struct btrfs_leaf_ref *ref);
+struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(struct btrfs_root *root,
+					    int nr_extents);
+void btrfs_free_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref);
 struct btrfs_leaf_ref *btrfs_lookup_leaf_ref(struct btrfs_root *root,
 					     u64 bytenr);
 int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref);
-int btrfs_remove_leaf_refs(struct btrfs_root *root);
+int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 max_root_gen);
 int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref);
 
 #endif
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 216f31571620..52c5524896a3 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -98,20 +98,24 @@ static noinline int record_root_in_trans(struct btrfs_root *root)
 			BUG_ON(!dirty);
 			dirty->root = kmalloc(sizeof(*dirty->root), GFP_NOFS);
 			BUG_ON(!dirty->root);
-
 			dirty->latest_root = root;
 			INIT_LIST_HEAD(&dirty->list);
 
 			root->commit_root = btrfs_root_node(root);
-			root->dirty_root = dirty;
 
 			memcpy(dirty->root, root, sizeof(*root));
-			dirty->root->ref_tree = &root->ref_tree_struct;
-
 			spin_lock_init(&dirty->root->node_lock);
+			spin_lock_init(&dirty->root->list_lock);
 			mutex_init(&dirty->root->objectid_mutex);
+			INIT_LIST_HEAD(&dirty->root->dead_list);
 			dirty->root->node = root->commit_root;
 			dirty->root->commit_root = NULL;
+
+			spin_lock(&root->list_lock);
+			list_add(&dirty->root->dead_list, &root->dead_list);
+			spin_unlock(&root->list_lock);
+
+			root->dirty_root = dirty;
 		} else {
 			WARN_ON(1);
 		}
@@ -356,8 +360,6 @@ int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
 		list_del_init(next);
 		root = list_entry(next, struct btrfs_root, dirty_list);
 		update_cowonly_root(trans, root);
-		if (root->fs_info->closing)
-			btrfs_remove_leaf_refs(root);
 	}
 	return 0;
 }
@@ -410,7 +412,11 @@ static noinline int add_dirty_roots(struct btrfs_trans_handle *trans,
 
 				free_extent_buffer(root->commit_root);
 				root->commit_root = NULL;
-				
+
+				spin_lock(&root->list_lock);
+				list_del_init(&dirty->root->dead_list);
+				spin_unlock(&root->list_lock);
+
 				kfree(dirty->root);
 				kfree(dirty);
 
@@ -497,6 +503,7 @@ static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
 	unsigned long nr;
 	u64 num_bytes;
 	u64 bytes_used;
+	u64 max_useless;
 	int ret = 0;
 	int err;
 
@@ -554,10 +561,25 @@ static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
 		}
 		mutex_unlock(&root->fs_info->drop_mutex);
 
+		spin_lock(&root->list_lock);
+		list_del_init(&dirty->root->dead_list);
+		if (!list_empty(&root->dead_list)) {
+			struct btrfs_root *oldest;
+			oldest = list_entry(root->dead_list.prev,
+					    struct btrfs_root, dead_list);
+			max_useless = oldest->root_key.offset - 1;
+		} else {
+			max_useless = root->root_key.offset - 1;
+		}
+		spin_unlock(&root->list_lock);
+
 		nr = trans->blocks_used;
 		ret = btrfs_end_transaction(trans, tree_root);
 		BUG_ON(ret);
 
+		ret = btrfs_remove_leaf_refs(root, max_useless);
+		BUG_ON(ret);
+
 		free_extent_buffer(dirty->root->node);
 		kfree(dirty->root);
 		kfree(dirty);
@@ -785,10 +807,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	put_transaction(cur_trans);
 	put_transaction(cur_trans);
 
+	list_splice_init(&dirty_fs_roots, &root->fs_info->dead_roots);
 	if (root->fs_info->closing)
 		list_splice_init(&root->fs_info->dead_roots, &dirty_fs_roots);
-	else
-		list_splice_init(&dirty_fs_roots, &root->fs_info->dead_roots);
 
 	mutex_unlock(&root->fs_info->trans_mutex);
 	kmem_cache_free(btrfs_trans_handle_cachep, trans);
@@ -814,4 +835,3 @@ again:
 	}
 	return 0;
 }
-
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 5e6ee7a6f738..18db4cbe2794 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2527,4 +2527,3 @@ again:
 error:
 	return ret;
 }
-
-- 
cgit v1.2.3


From 492bb6deee3416ad792dcd8584ebd95c463af1a6 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 31 Jul 2008 16:29:02 -0400
Subject: Btrfs: Hold a reference on bios during submit_bio, add some extra bio
 checks

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 18db4cbe2794..0de51e37e789 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -180,7 +180,11 @@ loop:
 		pending = pending->bi_next;
 		cur->bi_next = NULL;
 		atomic_dec(&device->dev_root->fs_info->nr_async_submits);
+
+		BUG_ON(atomic_read(&cur->bi_cnt) == 0);
+		bio_get(cur);
 		submit_bio(cur->bi_rw, cur);
+		bio_put(cur);
 		num_run++;
 
 		/*
@@ -188,10 +192,11 @@ loop:
 		 * is now congested.  Back off and let other work structs
 		 * run instead
 		 */
-		if (pending && num_run && bdi_write_congested(bdi)) {
+		if (pending && bdi_write_congested(bdi)) {
 			struct bio *old_head;
 
 			spin_lock(&device->io_lock);
+
 			old_head = device->pending_bios;
 			device->pending_bios = pending;
 			if (device->pending_bio_tail)
@@ -2125,7 +2130,9 @@ int schedule_bio(struct btrfs_root *root, struct btrfs_device *device,
 
 	/* don't bother with additional async steps for reads, right now */
 	if (!(rw & (1 << BIO_RW))) {
+		bio_get(bio);
 		submit_bio(rw, bio);
+		bio_put(bio);
 		return 0;
 	}
 
@@ -2136,6 +2143,7 @@ int schedule_bio(struct btrfs_root *root, struct btrfs_device *device,
 	 * on a queue for later
 	 */
 	atomic_inc(&root->fs_info->nr_async_submits);
+	WARN_ON(bio->bi_next);
 	bio->bi_next = NULL;
 	bio->bi_rw |= rw;
 
-- 
cgit v1.2.3


From 7d2b4daa67379960477568abda62b8ba9ee3a8aa Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 5 Aug 2008 10:13:57 -0400
Subject: Btrfs: Fix the multi-bio code to save the original bio for completion

The multi-bio code is responsible for duplicating blocks in raid1 and
single spindle duplication.  It has counters to make sure all of
the locations for a given extent are properly written before io completion
is returned to the higher layers.

But it didn't always complete the same bio it was given; sometimes a
clone was completed instead.  This led to problems with the async
work queues because they saved a pointer to the bio in a struct off
bi_private.

The fix is to remember the original bio and only complete that one.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 11 ++++++++++-
 fs/btrfs/volumes.h |  1 +
 2 files changed, 11 insertions(+), 1 deletion(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 0de51e37e789..09311b3066df 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2070,6 +2070,7 @@ static int end_bio_multi_stripe(struct bio *bio,
 #endif
 {
 	struct btrfs_multi_bio *multi = bio->bi_private;
+	int is_orig_bio = 0;
 
 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
 	if (bio->bi_size)
@@ -2078,7 +2079,14 @@ static int end_bio_multi_stripe(struct bio *bio,
 	if (err)
 		atomic_inc(&multi->error);
 
+	if (bio == multi->orig_bio)
+		is_orig_bio = 1;
+
 	if (atomic_dec_and_test(&multi->stripes_pending)) {
+		if (!is_orig_bio) {
+			bio_put(bio);
+			bio = multi->orig_bio;
+		}
 		bio->bi_private = multi->private;
 		bio->bi_end_io = multi->end_io;
 		/* only send an error to the higher layers if it is
@@ -2101,7 +2109,7 @@ static int end_bio_multi_stripe(struct bio *bio,
 #else
 		bio_endio(bio, err);
 #endif
-	} else {
+	} else if (!is_orig_bio) {
 		bio_put(bio);
 	}
 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
@@ -2196,6 +2204,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 	}
 	multi->end_io = first_bio->bi_end_io;
 	multi->private = first_bio->bi_private;
+	multi->orig_bio = first_bio;
 	atomic_set(&multi->stripes_pending, multi->num_stripes);
 
 	while(dev_nr < total_devs) {
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 48a44f7a9385..c50e50580b51 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -95,6 +95,7 @@ struct btrfs_bio_stripe {
 struct btrfs_multi_bio {
 	atomic_t stripes_pending;
 	bio_end_io_t *end_io;
+	struct bio *orig_bio;
 	void *private;
 	atomic_t error;
 	int max_errors;
-- 
cgit v1.2.3


From 0986fe9eac24fd186927c3b87af51d62f8ab92cd Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 15 Aug 2008 15:34:15 -0400
Subject: Btrfs: Count async bios separately from async checksum work items

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h   |  1 +
 fs/btrfs/disk-io.c | 25 ++++++++++++++++++++++---
 fs/btrfs/volumes.c |  6 +++---
 3 files changed, 26 insertions(+), 6 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index c88f1e16ce2d..bb4a8d2200d0 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -544,6 +544,7 @@ struct btrfs_fs_info {
 	struct list_head hashers;
 	struct list_head dead_roots;
 	atomic_t nr_async_submits;
+	atomic_t nr_async_bios;
 
 	/*
 	 * this is used by the balancing code to wait for all the pending
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 99bd9f9b9eed..9902d29abd06 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -429,6 +429,21 @@ int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
 	return 0;
 }
 
+static int congested_async(struct btrfs_fs_info *info, int iodone)
+{
+	int limit = 256 * info->fs_devices->open_devices;
+
+	if (iodone)
+		limit = (limit * 3) / 2;
+	if (atomic_read(&info->nr_async_submits) > limit)
+		return 1;
+
+	limit = 8192 * info->fs_devices->open_devices;
+	if (iodone)
+		limit = (limit * 3) / 2;
+	return atomic_read(&info->nr_async_bios) > limit;
+}
+
 static void run_one_async_submit(struct btrfs_work *work)
 {
 	struct btrfs_fs_info *fs_info;
@@ -437,6 +452,11 @@ static void run_one_async_submit(struct btrfs_work *work)
 	async = container_of(work, struct  async_submit_bio, work);
 	fs_info = BTRFS_I(async->inode)->root->fs_info;
 	atomic_dec(&fs_info->nr_async_submits);
+
+	if ((async->bio->bi_rw & (1 << BIO_RW)) &&
+	    !congested_async(fs_info, 1)) {
+		clear_bdi_congested(&fs_info->bdi, WRITE);
+	}
 	async->submit_bio_hook(async->inode, async->rw, async->bio,
 			       async->mirror_num);
 	kfree(async);
@@ -938,15 +958,13 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits)
 {
 	struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data;
 	int ret = 0;
-	int limit = 256 * info->fs_devices->open_devices;
 	struct list_head *cur;
 	struct btrfs_device *device;
 	struct backing_dev_info *bdi;
 
 	if ((bdi_bits & (1 << BDI_write_congested)) &&
-	    atomic_read(&info->nr_async_submits) > limit) {
+	    congested_async(info, 0))
 		return 1;
-	}
 
 	list_for_each(cur, &info->fs_devices->devices) {
 		device = list_entry(cur, struct btrfs_device, dev_list);
@@ -1250,6 +1268,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	INIT_LIST_HEAD(&fs_info->space_info);
 	btrfs_mapping_init(&fs_info->mapping_tree);
 	atomic_set(&fs_info->nr_async_submits, 0);
+	atomic_set(&fs_info->nr_async_bios, 0);
 	atomic_set(&fs_info->throttles, 0);
 	atomic_set(&fs_info->throttle_gen, 0);
 	fs_info->sb = sb;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 09311b3066df..23a5b0aba00a 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -179,7 +179,7 @@ loop:
 		cur = pending;
 		pending = pending->bi_next;
 		cur->bi_next = NULL;
-		atomic_dec(&device->dev_root->fs_info->nr_async_submits);
+		atomic_dec(&device->dev_root->fs_info->nr_async_bios);
 
 		BUG_ON(atomic_read(&cur->bi_cnt) == 0);
 		bio_get(cur);
@@ -2145,12 +2145,12 @@ int schedule_bio(struct btrfs_root *root, struct btrfs_device *device,
 	}
 
 	/*
-	 * nr_async_sumbits allows us to reliably return congestion to the
+	 * nr_async_bios allows us to reliably return congestion to the
 	 * higher layers.  Otherwise, the async bio makes it appear we have
 	 * made progress against dirty pages when we've really just put it
 	 * on a queue for later
 	 */
-	atomic_inc(&root->fs_info->nr_async_submits);
+	atomic_inc(&root->fs_info->nr_async_bios);
 	WARN_ON(bio->bi_next);
 	bio->bi_next = NULL;
 	bio->bi_rw |= rw;
-- 
cgit v1.2.3


From b64a2851ba25b3410a87d3d1b751155612105c8e Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 20 Aug 2008 13:39:41 -0400
Subject: Btrfs: Wait for async bio submissions to make some progress at queue
 time

Before, the btrfs bdi congestion function was used to test for too many
async bios.  This keeps that check to throttle pdflush, but also
adds a check while queuing bios.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c     | 16 +++++++++-------
 fs/btrfs/disk-io.h     |  1 +
 fs/btrfs/transaction.c |  2 --
 fs/btrfs/volumes.c     | 18 +++++++++++++++++-
 4 files changed, 27 insertions(+), 10 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 92e14dd9bddb..bbba14b629d2 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -429,7 +429,7 @@ int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
 	return 0;
 }
 
-static unsigned long async_submit_limit(struct btrfs_fs_info *info)
+unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info)
 {
 	unsigned long limit = min_t(unsigned long,
 				    info->workers.max_workers,
@@ -439,7 +439,8 @@ static unsigned long async_submit_limit(struct btrfs_fs_info *info)
 
 int btrfs_congested_async(struct btrfs_fs_info *info, int iodone)
 {
-	return atomic_read(&info->nr_async_bios) > async_submit_limit(info);
+	return atomic_read(&info->nr_async_bios) >
+		btrfs_async_submit_limit(info);
 }
 
 static void run_one_async_submit(struct btrfs_work *work)
@@ -451,12 +452,13 @@ static void run_one_async_submit(struct btrfs_work *work)
 	async = container_of(work, struct  async_submit_bio, work);
 	fs_info = BTRFS_I(async->inode)->root->fs_info;
 
-	limit = async_submit_limit(fs_info);
+	limit = btrfs_async_submit_limit(fs_info);
 	limit = limit * 2 / 3;
 
 	atomic_dec(&fs_info->nr_async_submits);
 
-	if (atomic_read(&fs_info->nr_async_submits) < limit)
+	if (atomic_read(&fs_info->nr_async_submits) < limit &&
+	    waitqueue_active(&fs_info->async_submit_wait))
 		wake_up(&fs_info->async_submit_wait);
 
 	async->submit_bio_hook(async->inode, async->rw, async->bio,
@@ -469,7 +471,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 			extent_submit_bio_hook_t *submit_bio_hook)
 {
 	struct async_submit_bio *async;
-	int limit = async_submit_limit(fs_info);
+	int limit = btrfs_async_submit_limit(fs_info);
 
 	async = kmalloc(sizeof(*async), GFP_NOFS);
 	if (!async)
@@ -1863,10 +1865,10 @@ void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
 	struct extent_io_tree *tree;
 	u64 num_dirty;
 	u64 start = 0;
-	unsigned long thresh = 12 * 1024 * 1024;
+	unsigned long thresh = 96 * 1024 * 1024;
 	tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
 
-	if (current_is_pdflush())
+	if (current_is_pdflush() || current->flags & PF_MEMALLOC)
 		return;
 
 	num_dirty = count_range_bits(tree, &start, (u64)-1,
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index e904a69347a4..2562a273ae18 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -73,4 +73,5 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 			int rw, struct bio *bio, int mirror_num,
 			extent_submit_bio_hook_t *submit_bio_hook);
 int btrfs_congested_async(struct btrfs_fs_info *info, int iodone);
+unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info);
 #endif
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 6bcb0876f9bb..eff3ad72991b 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -322,8 +322,6 @@ int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
 		if (ret)
 			break;
 		while(start <= end) {
-			if (btrfs_congested_async(root->fs_info, 0))
-				congestion_wait(WRITE, HZ/10);
 			cond_resched();
 
 			index = start >> PAGE_CACHE_SHIFT;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 23a5b0aba00a..2652660e6079 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -138,12 +138,18 @@ int run_scheduled_bios(struct btrfs_device *device)
 {
 	struct bio *pending;
 	struct backing_dev_info *bdi;
+	struct btrfs_fs_info *fs_info;
 	struct bio *tail;
 	struct bio *cur;
 	int again = 0;
 	unsigned long num_run = 0;
+	unsigned long limit;
 
 	bdi = device->bdev->bd_inode->i_mapping->backing_dev_info;
+	fs_info = device->dev_root->fs_info;
+	limit = btrfs_async_submit_limit(fs_info);
+	limit = limit * 2 / 3;
+
 loop:
 	spin_lock(&device->io_lock);
 
@@ -179,7 +185,11 @@ loop:
 		cur = pending;
 		pending = pending->bi_next;
 		cur->bi_next = NULL;
-		atomic_dec(&device->dev_root->fs_info->nr_async_bios);
+		atomic_dec(&fs_info->nr_async_bios);
+
+		if (atomic_read(&fs_info->nr_async_bios) < limit &&
+		    waitqueue_active(&fs_info->async_submit_wait))
+			wake_up(&fs_info->async_submit_wait);
 
 		BUG_ON(atomic_read(&cur->bi_cnt) == 0);
 		bio_get(cur);
@@ -2135,6 +2145,7 @@ int schedule_bio(struct btrfs_root *root, struct btrfs_device *device,
 		 int rw, struct bio *bio)
 {
 	int should_queue = 1;
+	unsigned long limit;
 
 	/* don't bother with additional async steps for reads, right now */
 	if (!(rw & (1 << BIO_RW))) {
@@ -2171,6 +2182,11 @@ int schedule_bio(struct btrfs_root *root, struct btrfs_device *device,
 	if (should_queue)
 		btrfs_queue_worker(&root->fs_info->submit_workers,
 				   &device->work);
+
+	limit = btrfs_async_submit_limit(root->fs_info);
+	wait_event_timeout(root->fs_info->async_submit_wait,
+			   (atomic_read(&root->fs_info->nr_async_bios) < limit),
+			   HZ/10);
 	return 0;
 }
 
-- 
cgit v1.2.3


From 9473f16c75606fe6b2e5000525fe9766114505f3 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 28 Aug 2008 06:15:24 -0400
Subject: Btrfs: Throttle for async bio submits higher up the chain

The current code waits for the count of async bio submits to get below
a given threshold if it is too high right after adding the latest bio
to the work queue.  This isn't optimal because the caller may still have
sequential, adjacent bios pending that it is waiting to send down the pipe.

This changeset requires the caller to wait on the async bio count,
and changes the async checksumming submits to wait for async bios any
time they self throttle.

The end result is much higher sequential throughput.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c | 8 +++++++-
 fs/btrfs/volumes.c | 6 ------
 2 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index bbba14b629d2..6a218f792e59 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -487,9 +487,15 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 	atomic_inc(&fs_info->nr_async_submits);
 	btrfs_queue_worker(&fs_info->workers, &async->work);
 
-	wait_event_timeout(fs_info->async_submit_wait,
+	if (atomic_read(&fs_info->nr_async_submits) > limit) {
+		wait_event_timeout(fs_info->async_submit_wait,
 			   (atomic_read(&fs_info->nr_async_submits) < limit),
 			   HZ/10);
+
+		wait_event_timeout(fs_info->async_submit_wait,
+			   (atomic_read(&fs_info->nr_async_bios) < limit),
+			   HZ/10);
+	}
 	return 0;
 }
 
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 2652660e6079..5b1b60839d21 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2145,7 +2145,6 @@ int schedule_bio(struct btrfs_root *root, struct btrfs_device *device,
 		 int rw, struct bio *bio)
 {
 	int should_queue = 1;
-	unsigned long limit;
 
 	/* don't bother with additional async steps for reads, right now */
 	if (!(rw & (1 << BIO_RW))) {
@@ -2182,11 +2181,6 @@ int schedule_bio(struct btrfs_root *root, struct btrfs_device *device,
 	if (should_queue)
 		btrfs_queue_worker(&root->fs_info->submit_workers,
 				   &device->work);
-
-	limit = btrfs_async_submit_limit(root->fs_info);
-	wait_event_timeout(root->fs_info->async_submit_wait,
-			   (atomic_read(&root->fs_info->nr_async_bios) < limit),
-			   HZ/10);
 	return 0;
 }
 
-- 
cgit v1.2.3


From a1b32a5932cfac7c38b442582285f3da2a09dfd8 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 5 Sep 2008 16:09:51 -0400
Subject: Btrfs: Add debugging checks to track down corrupted metadata

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c   |  7 ++++++-
 fs/btrfs/extent_io.c | 21 ++++++++++++++-------
 fs/btrfs/file.c      | 15 ++++++++-------
 fs/btrfs/volumes.c   | 40 +++++++++++++++++++++-------------------
 4 files changed, 49 insertions(+), 34 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 6a218f792e59..8e7a938bfbc7 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -250,7 +250,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
 		if (!ret &&
 		    !verify_parent_transid(io_tree, eb, parent_transid))
 			return ret;
-
+printk("read extent buffer pages failed with ret %d mirror no %d\n", ret, mirror_num);
 		num_copies = btrfs_num_copies(&root->fs_info->mapping_tree,
 					      eb->start, eb->len);
 		if (num_copies == 1)
@@ -348,6 +348,9 @@ int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 
 	found_start = btrfs_header_bytenr(eb);
 	if (found_start != start) {
+		printk("bad tree block start %llu %llu\n",
+		       (unsigned long long)found_start,
+		       (unsigned long long)eb->start);
 		ret = -EIO;
 		goto err;
 	}
@@ -709,6 +712,8 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
 
 	if (ret == 0) {
 		buf->flags |= EXTENT_UPTODATE;
+	} else {
+		WARN_ON(1);
 	}
 	return buf;
 
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 83ba0c328722..7ca89c45d401 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1811,6 +1811,7 @@ printk("2bad mapping end %Lu cur %Lu\n", end, cur);
 		}
 		/* the get_extent function already copied into the page */
 		if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1)) {
+			check_page_uptodate(tree, page);
 			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
 			cur = cur + iosize;
 			page_offset += iosize;
@@ -2785,21 +2786,20 @@ int set_extent_buffer_dirty(struct extent_io_tree *tree,
 		 * properly set.  releasepage may drop page->private
 		 * on us if the page isn't already dirty.
 		 */
+		lock_page(page);
 		if (i == 0) {
-			lock_page(page);
 			set_page_extent_head(page, eb->len);
 		} else if (PagePrivate(page) &&
 			   page->private != EXTENT_PAGE_PRIVATE) {
-			lock_page(page);
 			set_page_extent_mapped(page);
-			unlock_page(page);
 		}
 		__set_page_dirty_nobuffers(extent_buffer_page(eb, i));
-		if (i == 0)
-			unlock_page(page);
+		set_extent_dirty(tree, page_offset(page),
+				 page_offset(page) + PAGE_CACHE_SIZE -1,
+				 GFP_NOFS);
+		unlock_page(page);
 	}
-	return set_extent_dirty(tree, eb->start,
-				eb->start + eb->len - 1, GFP_NOFS);
+	return 0;
 }
 EXPORT_SYMBOL(set_extent_buffer_dirty);
 
@@ -2952,6 +2952,9 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
 	if (all_uptodate) {
 		if (start_i == 0)
 			eb->flags |= EXTENT_UPTODATE;
+		if (ret) {
+			printk("all up to date but ret is %d\n", ret);
+		}
 		goto unlock_exit;
 	}
 
@@ -2968,6 +2971,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
 						      mirror_num);
 			if (err) {
 				ret = err;
+				printk("err %d from __extent_read_full_page\n", ret);
 			}
 		} else {
 			unlock_page(page);
@@ -2978,12 +2982,15 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
 		submit_one_bio(READ, bio, mirror_num);
 
 	if (ret || !wait) {
+		if (ret)
+			printk("ret %d wait %d returning\n", ret, wait);
 		return ret;
 	}
 	for (i = start_i; i < num_pages; i++) {
 		page = extent_buffer_page(eb, i);
 		wait_on_page_locked(page);
 		if (!PageUptodate(page)) {
+			printk("page not uptodate after wait_on_page_locked\n");
 			ret = -EIO;
 		}
 	}
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index eb8e4556fa71..e9e86fbaa243 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -39,9 +39,10 @@
 #include "compat.h"
 
 
-static int btrfs_copy_from_user(loff_t pos, int num_pages, int write_bytes,
-				struct page **prepared_pages,
-				const char __user * buf)
+static int noinline btrfs_copy_from_user(loff_t pos, int num_pages,
+					 int write_bytes,
+					 struct page **prepared_pages,
+					 const char __user * buf)
 {
 	long page_fault = 0;
 	int i;
@@ -69,7 +70,7 @@ static int btrfs_copy_from_user(loff_t pos, int num_pages, int write_bytes,
 	return page_fault ? -EFAULT : 0;
 }
 
-static void btrfs_drop_pages(struct page **pages, size_t num_pages)
+static void noinline btrfs_drop_pages(struct page **pages, size_t num_pages)
 {
 	size_t i;
 	for (i = 0; i < num_pages; i++) {
@@ -359,7 +360,7 @@ out_unlock:
 	return err;
 }
 
-int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end)
+int noinline btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end)
 {
 	struct extent_map *em;
 	struct extent_map *split = NULL;
@@ -515,7 +516,7 @@ out:
  * it is either truncated or split.  Anything entirely inside the range
  * is deleted from the tree.
  */
-int btrfs_drop_extents(struct btrfs_trans_handle *trans,
+int noinline btrfs_drop_extents(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root, struct inode *inode,
 		       u64 start, u64 end, u64 inline_limit, u64 *hint_byte)
 {
@@ -785,7 +786,7 @@ out:
 /*
  * this gets pages into the page cache and locks them down
  */
-static int prepare_pages(struct btrfs_root *root, struct file *file,
+static int noinline prepare_pages(struct btrfs_root *root, struct file *file,
 			 struct page **pages, size_t num_pages,
 			 loff_t pos, unsigned long first_index,
 			 unsigned long last_index, size_t write_bytes)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 5b1b60839d21..37a8ea23e81d 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -94,8 +94,8 @@ int btrfs_cleanup_fs_uuids(void)
 	return 0;
 }
 
-static struct btrfs_device *__find_device(struct list_head *head, u64 devid,
-					  u8 *uuid)
+static noinline struct btrfs_device *__find_device(struct list_head *head,
+						   u64 devid, u8 *uuid)
 {
 	struct btrfs_device *dev;
 	struct list_head *cur;
@@ -110,7 +110,7 @@ static struct btrfs_device *__find_device(struct list_head *head, u64 devid,
 	return NULL;
 }
 
-static struct btrfs_fs_devices *find_fsid(u8 *fsid)
+static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
 {
 	struct list_head *cur;
 	struct btrfs_fs_devices *fs_devices;
@@ -134,7 +134,7 @@ static struct btrfs_fs_devices *find_fsid(u8 *fsid)
  * the list if the block device is congested.  This way, multiple devices
  * can make progress from a single worker thread.
  */
-int run_scheduled_bios(struct btrfs_device *device)
+static int noinline run_scheduled_bios(struct btrfs_device *device)
 {
 	struct bio *pending;
 	struct backing_dev_info *bdi;
@@ -233,7 +233,7 @@ void pending_bios_fn(struct btrfs_work *work)
 	run_scheduled_bios(device);
 }
 
-static int device_list_add(const char *path,
+static noinline int device_list_add(const char *path,
 			   struct btrfs_super_block *disk_super,
 			   u64 devid, struct btrfs_fs_devices **fs_devices_ret)
 {
@@ -480,10 +480,10 @@ error:
  * called very infrequently and that a given device has a small number
  * of extents
  */
-static int find_free_dev_extent(struct btrfs_trans_handle *trans,
-				struct btrfs_device *device,
-				struct btrfs_path *path,
-				u64 num_bytes, u64 *start)
+static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans,
+					 struct btrfs_device *device,
+					 struct btrfs_path *path,
+					 u64 num_bytes, u64 *start)
 {
 	struct btrfs_key key;
 	struct btrfs_root *root = device->dev_root;
@@ -645,7 +645,7 @@ int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
-int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
+int noinline btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
 			   struct btrfs_device *device,
 			   u64 chunk_tree, u64 chunk_objectid,
 			   u64 chunk_offset,
@@ -693,7 +693,8 @@ err:
 	return ret;
 }
 
-static int find_next_chunk(struct btrfs_root *root, u64 objectid, u64 *offset)
+static noinline int find_next_chunk(struct btrfs_root *root,
+				    u64 objectid, u64 *offset)
 {
 	struct btrfs_path *path;
 	int ret;
@@ -735,8 +736,8 @@ error:
 	return ret;
 }
 
-static int find_next_devid(struct btrfs_root *root, struct btrfs_path *path,
-			   u64 *objectid)
+static noinline int find_next_devid(struct btrfs_root *root,
+				    struct btrfs_path *path, u64 *objectid)
 {
 	int ret;
 	struct btrfs_key key;
@@ -1103,8 +1104,8 @@ out_close_bdev:
 	goto out;
 }
 
-int btrfs_update_device(struct btrfs_trans_handle *trans,
-			struct btrfs_device *device)
+int noinline btrfs_update_device(struct btrfs_trans_handle *trans,
+				 struct btrfs_device *device)
 {
 	int ret;
 	struct btrfs_path *path;
@@ -1544,8 +1545,8 @@ int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-static u64 chunk_bytes_by_type(u64 type, u64 calc_size, int num_stripes,
-			       int sub_stripes)
+static u64 noinline chunk_bytes_by_type(u64 type, u64 calc_size,
+					int num_stripes, int sub_stripes)
 {
 	if (type & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_DUP))
 		return calc_size;
@@ -2141,8 +2142,9 @@ struct async_sched {
  * This will add one bio to the pending list for a device and make sure
  * the work struct is scheduled.
  */
-int schedule_bio(struct btrfs_root *root, struct btrfs_device *device,
-		 int rw, struct bio *bio)
+static int noinline schedule_bio(struct btrfs_root *root,
+				 struct btrfs_device *device,
+				 int rw, struct bio *bio)
 {
 	int should_queue = 1;
 
-- 
cgit v1.2.3


From 325cd4bafeb6cfb44addd6e807a9b74170d1be31 Mon Sep 17 00:00:00 2001
From: Zheng Yan <zheng.yan@oracle.com>
Date: Fri, 5 Sep 2008 16:43:54 -0400
Subject: Btrfs: properly set blocksize when adding new device.

---

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 37a8ea23e81d..1546fa6f4f7d 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1079,6 +1079,8 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 	if (ret)
 		goto out_close_bdev;
 
+	set_blocksize(device->bdev, 4096);
+
 	total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
 	btrfs_set_super_total_bytes(&root->fs_info->super_copy,
 				    total_bytes + device->total_bytes);
-- 
cgit v1.2.3


From 0f9dd46cda36b8de3b9f48bc42bd09d20b9c3b52 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@redhat.com>
Date: Tue, 23 Sep 2008 13:14:11 -0400
Subject: Btrfs: free space accounting redo

1) replace the per fs_info extent_io_tree that tracked free space with two
rb-trees per block group to track free space areas via offset and size.  The
reason to do this is because most allocations come with a hint byte where to
start, so we can usually find a chunk of free space at that hint byte to satisfy
the allocation and get good space packing.  If we cannot find free space at or
after the given offset we fall back on looking for a chunk of the given size as
close to that given offset as possible.  When we fall back on the size search we
also try to find a slot as close to the size we want as possible, to avoid
breaking small chunks off of huge areas if possible.

2) removed the extent_io_tree that tracked the block group cache from fs_info and
replaced it with an rb-tree that tracks the block group cache via offset.  Also
added a per space_info list that tracks the block group cache for the particular
space so we can look up related block groups easily.

3) cleaned up the allocation code to make it a little easier to read and a
little less complicated.  Basically there are 3 steps, first look from our
provided hint.  If we couldn't find from that given hint, start back at our
original search start and look for space from there.  If that fails try to
allocate space if we can and start looking again.  If not we're screwed and need
to start over again.

4) small fixes.  there were some issues in volumes.c where we wouldn't allocate
the rest of the disk.  fixed cow_file_range to actually pass the alloc_hint,
which has helped a good bit in making the fs_mark test I run have semi-normal
results as we run out of space.  Generally with data allocations we don't track
where we last allocated from, so every time we did a data allocation we'd search
through every block group that we have looking for free space.  While searching a
block group with no free space isn't terribly time consuming, it was causing a
slight degradation as we got more data block groups.  The alloc_hint has fixed
this slight degradation and made things semi-normal.

There is still one nagging problem I'm working on where we will get ENOSPC when
there is definitely plenty of space.  This only happens with metadata
allocations, and only when we are almost full.  So you generally hit the 85%
mark first, but sometimes you'll hit the BUG before you hit the 85% wall.  I'm
still tracking it down, but until then this seems to be pretty stable and make a
significant performance gain.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/Makefile           |   2 +-
 fs/btrfs/ctree.c            |   3 +-
 fs/btrfs/ctree.h            |  46 ++-
 fs/btrfs/disk-io.c          |   7 +-
 fs/btrfs/extent-tree.c      | 869 +++++++++++++++++++++++---------------------
 fs/btrfs/extent_io.c        |   4 +
 fs/btrfs/free-space-cache.c | 415 +++++++++++++++++++++
 fs/btrfs/inode.c            |   3 +-
 fs/btrfs/volumes.c          |  11 +-
 9 files changed, 925 insertions(+), 435 deletions(-)
 create mode 100644 fs/btrfs/free-space-cache.c

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index b7addbfd8c22..eb36ae981bdc 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -7,7 +7,7 @@ btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   transaction.o bit-radix.o inode.o file.o tree-defrag.o \
 	   extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
 	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
-	   ref-cache.o export.o tree-log.o acl.o
+	   ref-cache.o export.o tree-log.o acl.o free-space-cache.o
 else
 
 # Normal Makefile
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 18e84472abb5..6f467901246f 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -2725,9 +2725,8 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
 
 	total_size = total_data + (nr * sizeof(struct btrfs_item));
 	ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
-	if (ret == 0) {
+	if (ret == 0)
 		return -EEXIST;
-	}
 	if (ret < 0)
 		goto out;
 
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index eb65fd808883..730aae3bc181 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -483,7 +483,6 @@ struct btrfs_csum_item {
 #define BTRFS_BLOCK_GROUP_DUP	   (1 << 5)
 #define BTRFS_BLOCK_GROUP_RAID10   (1 << 6)
 
-
 struct btrfs_block_group_item {
 	__le64 used;
 	__le64 chunk_objectid;
@@ -498,17 +497,40 @@ struct btrfs_space_info {
 	int full;
 	int force_alloc;
 	struct list_head list;
+
+	/* for block groups in our same type */
+	struct list_head block_groups;
+	spinlock_t lock;
+};
+
+struct btrfs_free_space {
+	struct rb_node bytes_index;
+	struct rb_node offset_index;
+	u64 offset;
+	u64 bytes;
 };
 
 struct btrfs_block_group_cache {
 	struct btrfs_key key;
 	struct btrfs_block_group_item item;
-	struct btrfs_space_info *space_info;
 	spinlock_t lock;
 	u64 pinned;
 	u64 flags;
 	int cached;
 	int ro;
+	int dirty;
+
+	struct btrfs_space_info *space_info;
+
+	/* free space cache stuff */
+	struct rb_root free_space_bytes;
+	struct rb_root free_space_offset;
+
+	/* block group cache stuff */
+	struct rb_node cache_node;
+
+	/* for block groups in the same raid type */
+	struct list_head list;
 };
 
 struct btrfs_device;
@@ -525,8 +547,10 @@ struct btrfs_fs_info {
 	struct btrfs_root *log_root_tree;
 	struct radix_tree_root fs_roots_radix;
 
-	struct extent_io_tree free_space_cache;
-	struct extent_io_tree block_group_cache;
+	/* block group cache stuff */
+	spinlock_t block_group_cache_lock;
+	struct rb_root block_group_cache_tree;
+
 	struct extent_io_tree pinned_extents;
 	struct extent_io_tree pending_del;
 	struct extent_io_tree extent_ins;
@@ -1814,4 +1838,18 @@ int btrfs_sync_fs(struct super_block *sb, int wait);
 int btrfs_check_acl(struct inode *inode, int mask);
 int btrfs_init_acl(struct inode *inode, struct inode *dir);
 int btrfs_acl_chmod(struct inode *inode);
+
+/* free-space-cache.c */
+int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
+			 u64 bytenr, u64 size);
+int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
+			    u64 bytenr, u64 size);
+void btrfs_remove_free_space_cache(struct btrfs_block_group_cache
+				   *block_group);
+struct btrfs_free_space *btrfs_find_free_space(struct btrfs_block_group_cache
+					       *block_group, u64 offset,
+					       u64 bytes);
+void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
+			   u64 bytes);
+u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group);
 #endif
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index f6f7821d43a5..535bd0fe1a71 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1410,10 +1410,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 
 	BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops;
 
-	extent_io_tree_init(&fs_info->free_space_cache,
-			     fs_info->btree_inode->i_mapping, GFP_NOFS);
-	extent_io_tree_init(&fs_info->block_group_cache,
-			     fs_info->btree_inode->i_mapping, GFP_NOFS);
+	spin_lock_init(&fs_info->block_group_cache_lock);
+	fs_info->block_group_cache_tree.rb_node = NULL;
+
 	extent_io_tree_init(&fs_info->pinned_extents,
 			     fs_info->btree_inode->i_mapping, GFP_NOFS);
 	extent_io_tree_init(&fs_info->pending_del,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 1c10ffc837c8..813566acc5d3 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -29,12 +29,6 @@
 #include "locking.h"
 #include "ref-cache.h"
 
-#define BLOCK_GROUP_DATA     EXTENT_WRITEBACK
-#define BLOCK_GROUP_METADATA EXTENT_UPTODATE
-#define BLOCK_GROUP_SYSTEM   EXTENT_NEW
-
-#define BLOCK_GROUP_DIRTY EXTENT_DIRTY
-
 static int finish_current_insert(struct btrfs_trans_handle *trans, struct
 				 btrfs_root *extent_root);
 static int del_pending_extents(struct btrfs_trans_handle *trans, struct
@@ -62,6 +56,127 @@ void maybe_unlock_mutex(struct btrfs_root *root)
 	}
 }
 
+static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
+{
+	return (cache->flags & bits) == bits;
+}
+
+/*
+ * this adds the block group to the fs_info rb tree for the block group
+ * cache
+ */
+int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
+				struct btrfs_block_group_cache *block_group)
+{
+	struct rb_node **p;
+	struct rb_node *parent = NULL;
+	struct btrfs_block_group_cache *cache;
+
+	spin_lock(&info->block_group_cache_lock);
+	p = &info->block_group_cache_tree.rb_node;
+
+	while (*p) {
+		parent = *p;
+		cache = rb_entry(parent, struct btrfs_block_group_cache,
+				 cache_node);
+		if (block_group->key.objectid < cache->key.objectid) {
+			p = &(*p)->rb_left;
+		} else if (block_group->key.objectid > cache->key.objectid) {
+			p = &(*p)->rb_right;
+		} else {
+			spin_unlock(&info->block_group_cache_lock);
+			return -EEXIST;
+		}
+	}
+
+	rb_link_node(&block_group->cache_node, parent, p);
+	rb_insert_color(&block_group->cache_node,
+			&info->block_group_cache_tree);
+	spin_unlock(&info->block_group_cache_lock);
+
+	return 0;
+}
+
+/*
+ * This will return the block group at or after bytenr if contains is 0, else
+ * it will return the block group that contains the bytenr
+ */
+static struct btrfs_block_group_cache *
+block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
+			      int contains)
+{
+	struct btrfs_block_group_cache *cache, *ret = NULL;
+	struct rb_node *n;
+	u64 end, start;
+
+	spin_lock(&info->block_group_cache_lock);
+	n = info->block_group_cache_tree.rb_node;
+
+	while (n) {
+		cache = rb_entry(n, struct btrfs_block_group_cache,
+				 cache_node);
+		end = cache->key.objectid + cache->key.offset - 1;
+		start = cache->key.objectid;
+
+		if (bytenr < start) {
+			if (!contains && (!ret || start < ret->key.objectid))
+				ret = cache;
+			n = n->rb_left;
+		} else if (bytenr > start) {
+			if (contains && bytenr <= end) {
+				ret = cache;
+				break;
+			}
+			n = n->rb_right;
+		} else {
+			ret = cache;
+			break;
+		}
+	}
+	spin_unlock(&info->block_group_cache_lock);
+
+	return ret;
+}
+
+/*
+ * this is only called by cache_block_group, since we could have freed extents
+ * we need to check the pinned_extents for any extents that can't be used yet
+ * since their free space will be released as soon as the transaction commits.
+ */
+static int add_new_free_space(struct btrfs_block_group_cache *block_group,
+			      struct btrfs_fs_info *info, u64 start, u64 end)
+{
+	u64 extent_start, extent_end, size;
+	int ret;
+
+	while (start < end) {
+		ret = find_first_extent_bit(&info->pinned_extents, start,
+					    &extent_start, &extent_end,
+					    EXTENT_DIRTY);
+		if (ret)
+			break;
+
+		if (extent_start == start) {
+			start = extent_end + 1;
+		} else if (extent_start > start && extent_start < end) {
+			size = extent_start - start;
+			ret = btrfs_add_free_space(block_group, start, size);
+			BUG_ON(ret);
+			start = extent_end + 1;
+		} else {
+			break;
+		}
+	}
+
+	if (start < end) {
+		size = end - start;
+		ret = btrfs_add_free_space(block_group, start, size);
+		BUG_ON(ret);
+	}
+
+	return 0;
+}
+
 static int cache_block_group(struct btrfs_root *root,
 			     struct btrfs_block_group_cache *block_group)
 {
@@ -69,10 +184,8 @@ static int cache_block_group(struct btrfs_root *root,
 	int ret = 0;
 	struct btrfs_key key;
 	struct extent_buffer *leaf;
-	struct extent_io_tree *free_space_cache;
 	int slot;
 	u64 last = 0;
-	u64 hole_size;
 	u64 first_free;
 	int found = 0;
 
@@ -80,7 +193,6 @@ static int cache_block_group(struct btrfs_root *root,
 		return 0;
 
 	root = root->fs_info->extent_root;
-	free_space_cache = &root->fs_info->free_space_cache;
 
 	if (block_group->cached)
 		return 0;
@@ -96,7 +208,8 @@ static int cache_block_group(struct btrfs_root *root,
 	 * skip the locking here
 	 */
 	path->skip_locking = 1;
-	first_free = block_group->key.objectid;
+	first_free = max_t(u64, block_group->key.objectid,
+			   BTRFS_SUPER_INFO_OFFSET + BTRFS_SUPER_INFO_SIZE);
 	key.objectid = block_group->key.objectid;
 	key.offset = 0;
 	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
@@ -119,32 +232,28 @@ static int cache_block_group(struct btrfs_root *root,
 			ret = btrfs_next_leaf(root, path);
 			if (ret < 0)
 				goto err;
-			if (ret == 0) {
+			if (ret == 0)
 				continue;
-			} else {
+			else
 				break;
-			}
 		}
 		btrfs_item_key_to_cpu(leaf, &key, slot);
-		if (key.objectid < block_group->key.objectid) {
+		if (key.objectid < block_group->key.objectid)
 			goto next;
-		}
+
 		if (key.objectid >= block_group->key.objectid +
-		    block_group->key.offset) {
+		    block_group->key.offset)
 			break;
-		}
 
 		if (btrfs_key_type(&key) == BTRFS_EXTENT_ITEM_KEY) {
 			if (!found) {
 				last = first_free;
 				found = 1;
 			}
-			if (key.objectid > last) {
-				hole_size = key.objectid - last;
-				set_extent_dirty(free_space_cache, last,
-						 last + hole_size - 1,
-						 GFP_NOFS);
-			}
+
+			add_new_free_space(block_group, root->fs_info, last,
+					   key.objectid);
+
 			last = key.objectid + key.offset;
 		}
 next:
@@ -153,13 +262,11 @@ next:
 
 	if (!found)
 		last = first_free;
-	if (block_group->key.objectid +
-	    block_group->key.offset > last) {
-		hole_size = block_group->key.objectid +
-			block_group->key.offset - last;
-		set_extent_dirty(free_space_cache, last,
-				 last + hole_size - 1, GFP_NOFS);
-	}
+
+	add_new_free_space(block_group, root->fs_info, last,
+			   block_group->key.objectid +
+			   block_group->key.offset);
+
 	block_group->cached = 1;
 	ret = 0;
 err:
@@ -167,166 +274,79 @@ err:
 	return ret;
 }
 
+/*
+ * return the block group that starts at or after bytenr
+ */
 struct btrfs_block_group_cache *btrfs_lookup_first_block_group(struct
 						       btrfs_fs_info *info,
 							 u64 bytenr)
 {
-	struct extent_io_tree *block_group_cache;
-	struct btrfs_block_group_cache *block_group = NULL;
-	u64 ptr;
-	u64 start;
-	u64 end;
-	int ret;
+	struct btrfs_block_group_cache *cache;
 
-	bytenr = max_t(u64, bytenr,
-		       BTRFS_SUPER_INFO_OFFSET + BTRFS_SUPER_INFO_SIZE);
-	block_group_cache = &info->block_group_cache;
-	ret = find_first_extent_bit(block_group_cache,
-				    bytenr, &start, &end,
-				    BLOCK_GROUP_DATA | BLOCK_GROUP_METADATA |
-				    BLOCK_GROUP_SYSTEM);
-	if (ret) {
-		return NULL;
-	}
-	ret = get_state_private(block_group_cache, start, &ptr);
-	if (ret)
-		return NULL;
+	cache = block_group_cache_tree_search(info, bytenr, 0);
 
-	block_group = (struct btrfs_block_group_cache *)(unsigned long)ptr;
-	return block_group;
+	return cache;
 }
 
+/*
+ * return the block group that contains teh given bytenr
+ */
 struct btrfs_block_group_cache *btrfs_lookup_block_group(struct
 							 btrfs_fs_info *info,
 							 u64 bytenr)
 {
-	struct extent_io_tree *block_group_cache;
-	struct btrfs_block_group_cache *block_group = NULL;
-	u64 ptr;
-	u64 start;
-	u64 end;
-	int ret;
+	struct btrfs_block_group_cache *cache;
 
-	bytenr = max_t(u64, bytenr,
-		       BTRFS_SUPER_INFO_OFFSET + BTRFS_SUPER_INFO_SIZE);
-	block_group_cache = &info->block_group_cache;
-	ret = find_first_extent_bit(block_group_cache,
-				    bytenr, &start, &end,
-				    BLOCK_GROUP_DATA | BLOCK_GROUP_METADATA |
-				    BLOCK_GROUP_SYSTEM);
-	if (ret) {
-		return NULL;
-	}
-	ret = get_state_private(block_group_cache, start, &ptr);
-	if (ret)
-		return NULL;
+	cache = block_group_cache_tree_search(info, bytenr, 1);
 
-	block_group = (struct btrfs_block_group_cache *)(unsigned long)ptr;
-	if (block_group->key.objectid <= bytenr && bytenr <
-	    block_group->key.objectid + block_group->key.offset)
-		return block_group;
-	return NULL;
+	return cache;
 }
 
-static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
-{
-	return (cache->flags & bits) == bits;
-}
-
-static int noinline find_search_start(struct btrfs_root *root,
-			      struct btrfs_block_group_cache **cache_ret,
-			      u64 *start_ret, u64 num, int data)
+static int noinline find_free_space(struct btrfs_root *root,
+				    struct btrfs_block_group_cache **cache_ret,
+				    u64 *start_ret, u64 num, int data)
 {
 	int ret;
 	struct btrfs_block_group_cache *cache = *cache_ret;
-	struct extent_io_tree *free_space_cache;
-	struct extent_state *state;
+	struct btrfs_free_space *info = NULL;
 	u64 last;
-	u64 start = 0;
-	u64 cache_miss = 0;
 	u64 total_fs_bytes;
 	u64 search_start = *start_ret;
-	int wrapped = 0;
 
 	WARN_ON(!mutex_is_locked(&root->fs_info->alloc_mutex));
 	total_fs_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
-	free_space_cache = &root->fs_info->free_space_cache;
 
 	if (!cache)
 		goto out;
 
+	last = max(search_start, cache->key.objectid);
+
 again:
 	ret = cache_block_group(root, cache);
-	if (ret) {
+	if (ret)
 		goto out;
-	}
 
-	last = max(search_start, cache->key.objectid);
-	if (!block_group_bits(cache, data) || cache->ro)
+	if (cache->ro || !block_group_bits(cache, data))
 		goto new_group;
 
-	spin_lock_irq(&free_space_cache->lock);
-	state = find_first_extent_bit_state(free_space_cache, last, EXTENT_DIRTY);
-	while(1) {
-		if (!state) {
-			if (!cache_miss)
-				cache_miss = last;
-			spin_unlock_irq(&free_space_cache->lock);
-			goto new_group;
-		}
-
-		start = max(last, state->start);
-		last = state->end + 1;
-		if (last - start < num) {
-			do {
-				state = extent_state_next(state);
-			} while(state && !(state->state & EXTENT_DIRTY));
-			continue;
-		}
-		spin_unlock_irq(&free_space_cache->lock);
-		if (cache->ro) {
-			goto new_group;
-		}
-		if (start + num > cache->key.objectid + cache->key.offset)
-			goto new_group;
-		if (!block_group_bits(cache, data)) {
-			printk("block group bits don't match %Lu %d\n", cache->flags, data);
-		}
-		*start_ret = start;
+	info = btrfs_find_free_space(cache, last, num);
+	if (info) {
+		*start_ret = info->offset;
 		return 0;
 	}
-out:
-	cache = btrfs_lookup_block_group(root->fs_info, search_start);
-	if (!cache) {
-		printk("Unable to find block group for %Lu\n", search_start);
-		WARN_ON(1);
-	}
-	return -ENOSPC;
 
 new_group:
 	last = cache->key.objectid + cache->key.offset;
-wrapped:
+
 	cache = btrfs_lookup_first_block_group(root->fs_info, last);
-	if (!cache || cache->key.objectid >= total_fs_bytes) {
-no_cache:
-		if (!wrapped) {
-			wrapped = 1;
-			last = search_start;
-			goto wrapped;
-		}
+	if (!cache || cache->key.objectid >= total_fs_bytes)
 		goto out;
-	}
-	if (cache_miss && !cache->cached) {
-		cache_block_group(root, cache);
-		last = cache_miss;
-		cache = btrfs_lookup_first_block_group(root->fs_info, last);
-	}
-	cache_miss = 0;
-	cache = btrfs_find_block_group(root, cache, last, data, 0);
-	if (!cache)
-		goto no_cache;
+
 	*cache_ret = cache;
 	goto again;
+
+out:
+	return -ENOSPC;
 }
 
 static u64 div_factor(u64 num, int factor)
@@ -338,16 +358,19 @@ static u64 div_factor(u64 num, int factor)
 	return num;
 }
 
-static int block_group_state_bits(u64 flags)
+static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
+						  u64 flags)
 {
-	int bits = 0;
-	if (flags & BTRFS_BLOCK_GROUP_DATA)
-		bits |= BLOCK_GROUP_DATA;
-	if (flags & BTRFS_BLOCK_GROUP_METADATA)
-		bits |= BLOCK_GROUP_METADATA;
-	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
-		bits |= BLOCK_GROUP_SYSTEM;
-	return bits;
+	struct list_head *head = &info->space_info;
+	struct list_head *cur;
+	struct btrfs_space_info *found;
+	list_for_each(cur, head) {
+		found = list_entry(cur, struct btrfs_space_info, list);
+		if (found->flags == flags)
+			return found;
+	}
+	return NULL;
+
 }
 
 static struct btrfs_block_group_cache *
@@ -356,28 +379,19 @@ __btrfs_find_block_group(struct btrfs_root *root,
 			 u64 search_start, int data, int owner)
 {
 	struct btrfs_block_group_cache *cache;
-	struct extent_io_tree *block_group_cache;
 	struct btrfs_block_group_cache *found_group = NULL;
 	struct btrfs_fs_info *info = root->fs_info;
+	struct btrfs_space_info *sinfo;
 	u64 used;
 	u64 last = 0;
-	u64 start;
-	u64 end;
 	u64 free_check;
-	u64 ptr;
-	int bit;
-	int ret;
 	int full_search = 0;
 	int factor = 10;
 	int wrapped = 0;
 
-	block_group_cache = &info->block_group_cache;
-
 	if (data & BTRFS_BLOCK_GROUP_METADATA)
 		factor = 9;
 
-	bit = block_group_state_bits(data);
-
 	if (search_start) {
 		struct btrfs_block_group_cache *shint;
 		shint = btrfs_lookup_first_block_group(info, search_start);
@@ -408,20 +422,30 @@ __btrfs_find_block_group(struct btrfs_root *root,
 		else
 			last = search_start;
 	}
+	sinfo = __find_space_info(root->fs_info, data);
+	if (!sinfo)
+		goto found;
 again:
 	while(1) {
-		ret = find_first_extent_bit(block_group_cache, last,
-					    &start, &end, bit);
-		if (ret)
-			break;
+		struct list_head *l;
 
-		ret = get_state_private(block_group_cache, start, &ptr);
-		if (ret) {
-			last = end + 1;
-			continue;
+		cache = NULL;
+
+		spin_lock(&sinfo->lock);
+		list_for_each(l, &sinfo->block_groups) {
+			struct btrfs_block_group_cache *entry;
+			entry = list_entry(l, struct btrfs_block_group_cache,
+					   list);
+			if ((entry->key.objectid >= last) &&
+			    (!cache || (entry->key.objectid <
+					cache->key.objectid)))
+				cache = entry;
 		}
+		spin_unlock(&sinfo->lock);
+
+		if (!cache)
+			break;
 
-		cache = (struct btrfs_block_group_cache *)(unsigned long)ptr;
 		spin_lock(&cache->lock);
 		last = cache->key.objectid + cache->key.offset;
 		used = btrfs_block_group_used(&cache->item);
@@ -462,6 +486,7 @@ struct btrfs_block_group_cache *btrfs_find_block_group(struct btrfs_root *root,
 	ret = __btrfs_find_block_group(root, hint, search_start, data, owner);
 	return ret;
 }
+
 static u64 hash_extent_ref(u64 root_objectid, u64 ref_generation,
 			   u64 owner, u64 owner_offset)
 {
@@ -1175,34 +1200,37 @@ fail:
 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 				   struct btrfs_root *root)
 {
-	struct extent_io_tree *block_group_cache;
-	struct btrfs_block_group_cache *cache;
-	int ret;
+	struct btrfs_block_group_cache *cache, *entry;
+	struct rb_node *n;
 	int err = 0;
 	int werr = 0;
 	struct btrfs_path *path;
 	u64 last = 0;
-	u64 start;
-	u64 end;
-	u64 ptr;
 
-	block_group_cache = &root->fs_info->block_group_cache;
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
 
 	mutex_lock(&root->fs_info->alloc_mutex);
 	while(1) {
-		ret = find_first_extent_bit(block_group_cache, last,
-					    &start, &end, BLOCK_GROUP_DIRTY);
-		if (ret)
-			break;
+		cache = NULL;
+		spin_lock(&root->fs_info->block_group_cache_lock);
+		for (n = rb_first(&root->fs_info->block_group_cache_tree);
+		     n; n = rb_next(n)) {
+			entry = rb_entry(n, struct btrfs_block_group_cache,
+					 cache_node);
+			if (entry->dirty) {
+				cache = entry;
+				break;
+			}
+		}
+		spin_unlock(&root->fs_info->block_group_cache_lock);
 
-		last = end + 1;
-		ret = get_state_private(block_group_cache, start, &ptr);
-		if (ret)
+		if (!cache)
 			break;
-		cache = (struct btrfs_block_group_cache *)(unsigned long)ptr;
+
+		last += cache->key.offset;
+
 		err = write_one_cache_group(trans, root,
 					    path, cache);
 		/*
@@ -1214,29 +1242,14 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 			werr = err;
 			continue;
 		}
-		clear_extent_bits(block_group_cache, start, end,
-				  BLOCK_GROUP_DIRTY, GFP_NOFS);
+
+		cache->dirty = 0;
 	}
 	btrfs_free_path(path);
 	mutex_unlock(&root->fs_info->alloc_mutex);
 	return werr;
 }
 
-static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
-						  u64 flags)
-{
-	struct list_head *head = &info->space_info;
-	struct list_head *cur;
-	struct btrfs_space_info *found;
-	list_for_each(cur, head) {
-		found = list_entry(cur, struct btrfs_space_info, list);
-		if (found->flags == flags)
-			return found;
-	}
-	return NULL;
-
-}
-
 static int update_space_info(struct btrfs_fs_info *info, u64 flags,
 			     u64 total_bytes, u64 bytes_used,
 			     struct btrfs_space_info **space_info)
@@ -1256,6 +1269,8 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
 		return -ENOMEM;
 
 	list_add(&found->list, &info->space_info);
+	INIT_LIST_HEAD(&found->block_groups);
+	spin_lock_init(&found->lock);
 	found->flags = flags;
 	found->total_bytes = total_bytes;
 	found->bytes_used = bytes_used;
@@ -1318,7 +1333,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
 	u64 thresh;
 	u64 start;
 	u64 num_bytes;
-	int ret;
+	int ret = 0;
 
 	flags = reduce_alloc_profile(extent_root, flags);
 
@@ -1355,10 +1370,11 @@ printk("space info full %Lu\n", flags);
 	ret = btrfs_make_block_group(trans, extent_root, 0, flags,
 		     BTRFS_FIRST_CHUNK_TREE_OBJECTID, start, num_bytes);
 	BUG_ON(ret);
+
 out_unlock:
 	mutex_unlock(&extent_root->fs_info->chunk_mutex);
 out:
-	return 0;
+	return ret;
 }
 
 static int update_block_group(struct btrfs_trans_handle *trans,
@@ -1371,8 +1387,6 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 	u64 total = num_bytes;
 	u64 old_val;
 	u64 byte_in_group;
-	u64 start;
-	u64 end;
 
 	WARN_ON(!mutex_is_locked(&root->fs_info->alloc_mutex));
 	while(total) {
@@ -1382,12 +1396,9 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 		}
 		byte_in_group = bytenr - cache->key.objectid;
 		WARN_ON(byte_in_group > cache->key.offset);
-		start = cache->key.objectid;
-		end = start + cache->key.offset - 1;
-		set_extent_bits(&info->block_group_cache, start, end,
-				BLOCK_GROUP_DIRTY, GFP_NOFS);
 
 		spin_lock(&cache->lock);
+		cache->dirty = 1;
 		old_val = btrfs_block_group_used(&cache->item);
 		num_bytes = min(total, cache->key.offset - byte_in_group);
 		if (alloc) {
@@ -1401,9 +1412,11 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 			btrfs_set_block_group_used(&cache->item, old_val);
 			spin_unlock(&cache->lock);
 			if (mark_free) {
-				set_extent_dirty(&info->free_space_cache,
-						 bytenr, bytenr + num_bytes - 1,
-						 GFP_NOFS);
+				int ret;
+				ret = btrfs_add_free_space(cache, bytenr,
+							   num_bytes);
+				if (ret)
+					return -1;
 			}
 		}
 		total -= num_bytes;
@@ -1414,16 +1427,13 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 
 static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
 {
-	u64 start;
-	u64 end;
-	int ret;
-	ret = find_first_extent_bit(&root->fs_info->block_group_cache,
-				    search_start, &start, &end,
-				    BLOCK_GROUP_DATA | BLOCK_GROUP_METADATA |
-				    BLOCK_GROUP_SYSTEM);
-	if (ret)
+	struct btrfs_block_group_cache *cache;
+
+	cache = btrfs_lookup_first_block_group(root->fs_info, search_start);
+	if (!cache)
 		return 0;
-	return start;
+
+	return cache->key.objectid;
 }
 
 
@@ -1501,8 +1511,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 	u64 start;
 	u64 end;
 	int ret;
-	struct extent_io_tree *free_space_cache;
-	free_space_cache = &root->fs_info->free_space_cache;
+	struct btrfs_block_group_cache *cache;
 
 	mutex_lock(&root->fs_info->alloc_mutex);
 	while(1) {
@@ -1512,7 +1521,9 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 			break;
 		btrfs_update_pinned_extents(root, start, end + 1 - start, 0);
 		clear_extent_dirty(unpin, start, end, GFP_NOFS);
-		set_extent_dirty(free_space_cache, start, end, GFP_NOFS);
+		cache = btrfs_lookup_block_group(root->fs_info, start);
+		if (cache->cached)
+			btrfs_add_free_space(cache, start, end - start + 1);
 		if (need_resched()) {
 			mutex_unlock(&root->fs_info->alloc_mutex);
 			cond_resched();
@@ -1875,9 +1886,12 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 	/* if metadata always pin */
 	if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
 		if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
+			struct btrfs_block_group_cache *cache;
+
 			/* btrfs_free_reserved_extent */
-			set_extent_dirty(&root->fs_info->free_space_cache,
-				 bytenr, bytenr + num_bytes - 1, GFP_NOFS);
+			cache = btrfs_lookup_block_group(root->fs_info, bytenr);
+			BUG_ON(!cache);
+			btrfs_add_free_space(cache, bytenr, num_bytes);
 			return 0;
 		}
 		pin = 1;
@@ -1942,8 +1956,6 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans,
 	u64 total_needed = num_bytes;
 	u64 *last_ptr = NULL;
 	struct btrfs_block_group_cache *block_group;
-	int full_scan = 0;
-	int wrapped = 0;
 	int chunk_alloc_done = 0;
 	int empty_cluster = 2 * 1024 * 1024;
 	int allowed_chunk_alloc = 0;
@@ -1959,9 +1971,9 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans,
 		empty_cluster = 256 * 1024;
 	}
 
-	if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) {
+	if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD))
 		last_ptr = &root->fs_info->last_data_alloc;
-	}
+
 	if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
 		last_ptr = &root->fs_info->last_log_alloc;
 		if (!last_ptr == 0 && root->fs_info->last_alloc) {
@@ -1972,9 +1984,8 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans,
 	if (last_ptr) {
 		if (*last_ptr)
 			hint_byte = *last_ptr;
-		else {
+		else
 			empty_size += empty_cluster;
-		}
 	}
 
 	search_start = max(search_start, first_logical_byte(root, 0));
@@ -1983,145 +1994,172 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans,
 	if (search_end == (u64)-1)
 		search_end = btrfs_super_total_bytes(&info->super_copy);
 
-	if (hint_byte) {
-		block_group = btrfs_lookup_first_block_group(info, hint_byte);
-		if (!block_group)
-			hint_byte = search_start;
-		block_group = btrfs_find_block_group(root, block_group,
-						     hint_byte, data, 1);
-		if (last_ptr && *last_ptr == 0 && block_group)
-			hint_byte = block_group->key.objectid;
-	} else {
-		block_group = btrfs_find_block_group(root,
-						     trans->block_group,
-						     search_start, data, 1);
-	}
 	search_start = max(search_start, hint_byte);
-
 	total_needed += empty_size;
 
-check_failed:
-	if (!block_group) {
-		block_group = btrfs_lookup_first_block_group(info,
-							     search_start);
-		if (!block_group)
-			block_group = btrfs_lookup_first_block_group(info,
-						       orig_search_start);
-	}
-	if (full_scan && !chunk_alloc_done) {
-		if (allowed_chunk_alloc) {
-			do_chunk_alloc(trans, root,
-				     num_bytes + 2 * 1024 * 1024, data, 1);
-			allowed_chunk_alloc = 0;
-		} else if (block_group && block_group_bits(block_group, data)) {
-			block_group->space_info->force_alloc = 1;
+new_group:
+	block_group = btrfs_lookup_block_group(info, search_start);
+
+	/*
+	 * Ok this looks a little tricky, but it's really simple.  First, if we
+	 * didn't find a block group, obviously we want to start over.
+	 * Secondly, if the block group we found does not match the type we
+	 * need, and we have a last_ptr and it's not 0, chances are the last
+	 * allocation we made was at the end of the block group, so let's go
+	 * ahead and skip looking through the rest of the block groups and
+	 * start at the beginning.  This helps with metadata allocations,
+	 * since you are likely to have a bunch of data block groups to search
+	 * through first before you realize that you need to start over, so go
+	 * ahead and start over and save the time.
+	 */
+	if (!block_group || (!block_group_bits(block_group, data) &&
+			     last_ptr && *last_ptr)) {
+		if (search_start != orig_search_start) {
+			if (last_ptr && *last_ptr)
+				*last_ptr = 0;
+			search_start = orig_search_start;
+			goto new_group;
+		} else if (!chunk_alloc_done && allowed_chunk_alloc) {
+			ret = do_chunk_alloc(trans, root,
+					     num_bytes + 2 * 1024 * 1024,
+					     data, 1);
+			if (ret < 0) {
+				struct btrfs_space_info *info;
+
+				info = __find_space_info(root->fs_info, data);
+				goto error;
+			}
+			BUG_ON(ret);
+			chunk_alloc_done = 1;
+			search_start = orig_search_start;
+			goto new_group;
+		} else {
+			ret = -ENOSPC;
+			goto error;
 		}
-		chunk_alloc_done = 1;
-	}
-	ret = find_search_start(root, &block_group, &search_start,
-				total_needed, data);
-	if (ret == -ENOSPC && last_ptr && *last_ptr) {
-		*last_ptr = 0;
-		block_group = btrfs_lookup_first_block_group(info,
-							     orig_search_start);
-		search_start = orig_search_start;
-		ret = find_search_start(root, &block_group, &search_start,
-					total_needed, data);
 	}
-	if (ret == -ENOSPC)
-		goto enospc;
-	if (ret)
-		goto error;
 
-	if (last_ptr && *last_ptr && search_start != *last_ptr) {
-		*last_ptr = 0;
-		if (!empty_size) {
-			empty_size += empty_cluster;
-			total_needed += empty_size;
+	/*
+	 * this is going to search through all of the existing block groups it
+	 * can find, so if we don't find something we need to see if we can
+	 * allocate what we need.
+	 */
+	ret = find_free_space(root, &block_group, &search_start,
+			      total_needed, data);
+	if (ret == -ENOSPC) {
+		/*
+		 * instead of allocating, start at the original search start
+		 * and see if there is something to be found, if not then we
+		 * allocate
+		 */
+		if (search_start != orig_search_start) {
+			if (last_ptr && *last_ptr) {
+				*last_ptr = 0;
+				total_needed += empty_cluster;
+			}
+			search_start = orig_search_start;
+			goto new_group;
 		}
-		block_group = btrfs_lookup_first_block_group(info,
-						       orig_search_start);
-		search_start = orig_search_start;
-		ret = find_search_start(root, &block_group,
-					&search_start, total_needed, data);
-		if (ret == -ENOSPC)
-			goto enospc;
-		if (ret)
+
+		/*
+		 * we've already allocated, we're pretty screwed
+		 */
+		if (chunk_alloc_done) {
 			goto error;
+		} else if (!allowed_chunk_alloc && block_group &&
+			   block_group_bits(block_group, data)) {
+			block_group->space_info->force_alloc = 1;
+			goto error;
+		} else if (!allowed_chunk_alloc) {
+			goto error;
+		}
+
+		ret = do_chunk_alloc(trans, root, num_bytes + 2 * 1024 * 1024,
+				     data, 1);
+		if (ret < 0)
+			goto error;
+
+		BUG_ON(ret);
+		chunk_alloc_done = 1;
+		if (block_group)
+			search_start = block_group->key.objectid +
+				block_group->key.offset;
+		else
+			search_start = orig_search_start;
+		goto new_group;
 	}
 
+	if (ret)
+		goto error;
+
 	search_start = stripe_align(root, search_start);
 	ins->objectid = search_start;
 	ins->offset = num_bytes;
 
-	if (ins->objectid + num_bytes >= search_end)
-		goto enospc;
+	if (ins->objectid + num_bytes >= search_end) {
+		search_start = orig_search_start;
+		if (chunk_alloc_done) {
+			ret = -ENOSPC;
+			goto error;
+		}
+		goto new_group;
+	}
 
 	if (ins->objectid + num_bytes >
 	    block_group->key.objectid + block_group->key.offset) {
+		if (search_start == orig_search_start && chunk_alloc_done) {
+			ret = -ENOSPC;
+			goto error;
+		}
 		search_start = block_group->key.objectid +
 			block_group->key.offset;
 		goto new_group;
 	}
 
-	if (test_range_bit(&info->extent_ins, ins->objectid,
-			   ins->objectid + num_bytes -1, EXTENT_LOCKED, 0)) {
-		search_start = ins->objectid + num_bytes;
-		goto new_group;
-	}
-
-	if (test_range_bit(&info->pinned_extents, ins->objectid,
-			   ins->objectid + num_bytes -1, EXTENT_DIRTY, 0)) {
-		search_start = ins->objectid + num_bytes;
-		goto new_group;
-	}
-
 	if (exclude_nr > 0 && (ins->objectid + num_bytes > exclude_start &&
 	    ins->objectid < exclude_start + exclude_nr)) {
 		search_start = exclude_start + exclude_nr;
 		goto new_group;
 	}
 
-	if (!(data & BTRFS_BLOCK_GROUP_DATA)) {
-		block_group = btrfs_lookup_block_group(info, ins->objectid);
-		if (block_group)
-			trans->block_group = block_group;
-	}
+	if (!(data & BTRFS_BLOCK_GROUP_DATA))
+		trans->block_group = block_group;
+
 	ins->offset = num_bytes;
 	if (last_ptr) {
 		*last_ptr = ins->objectid + ins->offset;
 		if (*last_ptr ==
-		    btrfs_super_total_bytes(&root->fs_info->super_copy)) {
+		    btrfs_super_total_bytes(&root->fs_info->super_copy))
 			*last_ptr = 0;
-		}
-	}
-	return 0;
-
-new_group:
-	if (search_start + num_bytes >= search_end) {
-enospc:
-		search_start = orig_search_start;
-		if (full_scan) {
-			ret = -ENOSPC;
-			goto error;
-		}
-		if (wrapped) {
-			if (!full_scan)
-				total_needed -= empty_size;
-			full_scan = 1;
-		} else
-			wrapped = 1;
 	}
-	block_group = btrfs_lookup_first_block_group(info, search_start);
-	cond_resched();
-	block_group = btrfs_find_block_group(root, block_group,
-					     search_start, data, 0);
-	goto check_failed;
 
+	ret = 0;
 error:
 	return ret;
 }
 
+static void dump_space_info(struct btrfs_space_info *info, u64 bytes)
+{
+	struct btrfs_block_group_cache *cache;
+	struct list_head *l;
+
+	printk(KERN_INFO "space_info has %Lu free, is %sfull\n",
+	       info->total_bytes - info->bytes_used - info->bytes_pinned,
+	       (info->full) ? "" : "not ");
+
+	spin_lock(&info->lock);
+	list_for_each(l, &info->block_groups) {
+		cache = list_entry(l, struct btrfs_block_group_cache, list);
+		spin_lock(&cache->lock);
+		printk(KERN_INFO "block group %Lu has %Lu bytes, %Lu used "
+		       "%Lu pinned\n",
+		       cache->key.objectid, cache->key.offset,
+		       btrfs_block_group_used(&cache->item), cache->pinned);
+		btrfs_dump_free_space(cache, bytes);
+		spin_unlock(&cache->lock);
+	}
+	spin_unlock(&info->lock);
+}
 static int __btrfs_reserve_extent(struct btrfs_trans_handle *trans,
 				  struct btrfs_root *root,
 				  u64 num_bytes, u64 min_alloc_size,
@@ -2133,6 +2171,7 @@ static int __btrfs_reserve_extent(struct btrfs_trans_handle *trans,
 	u64 search_start = 0;
 	u64 alloc_profile;
 	struct btrfs_fs_info *info = root->fs_info;
+	struct btrfs_block_group_cache *cache;
 
 	if (data) {
 		alloc_profile = info->avail_data_alloc_bits &
@@ -2160,11 +2199,9 @@ again:
 				     BTRFS_BLOCK_GROUP_METADATA |
 				     (info->metadata_alloc_profile &
 				      info->avail_metadata_alloc_bits), 0);
-			BUG_ON(ret);
 		}
 		ret = do_chunk_alloc(trans, root->fs_info->extent_root,
 				     num_bytes + 2 * 1024 * 1024, data, 0);
-		BUG_ON(ret);
 	}
 
 	WARN_ON(num_bytes < root->sectorsize);
@@ -2175,26 +2212,44 @@ again:
 
 	if (ret == -ENOSPC && num_bytes > min_alloc_size) {
 		num_bytes = num_bytes >> 1;
+		num_bytes = num_bytes & ~(root->sectorsize - 1);
 		num_bytes = max(num_bytes, min_alloc_size);
 		do_chunk_alloc(trans, root->fs_info->extent_root,
 			       num_bytes, data, 1);
 		goto again;
 	}
 	if (ret) {
-		printk("allocation failed flags %Lu\n", data);
+		struct btrfs_space_info *sinfo;
+
+		sinfo = __find_space_info(root->fs_info, data);
+		printk("allocation failed flags %Lu, wanted %Lu\n",
+		       data, num_bytes);
+		dump_space_info(sinfo, num_bytes);
 		BUG();
 	}
-	clear_extent_dirty(&root->fs_info->free_space_cache,
-			   ins->objectid, ins->objectid + ins->offset - 1,
-			   GFP_NOFS);
-	return 0;
+	cache = btrfs_lookup_block_group(root->fs_info, ins->objectid);
+	if (!cache) {
+		printk(KERN_ERR "Unable to find block group for %Lu\n", ins->objectid);
+		return -ENOSPC;
+	}
+
+	ret = btrfs_remove_free_space(cache, ins->objectid, ins->offset);
+
+	return ret;
 }
 
 int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
 {
+	struct btrfs_block_group_cache *cache;
+
 	maybe_lock_mutex(root);
-	set_extent_dirty(&root->fs_info->free_space_cache,
-			 start, start + len - 1, GFP_NOFS);
+	cache = btrfs_lookup_block_group(root->fs_info, start);
+	if (!cache) {
+		printk(KERN_ERR "Unable to find block group for %Lu\n", start);
+		maybe_unlock_mutex(root);
+		return -ENOSPC;
+	}
+	btrfs_add_free_space(cache, start, len);
 	maybe_unlock_mutex(root);
 	return 0;
 }
@@ -2264,8 +2319,8 @@ static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
 
 	ret = btrfs_insert_empty_items(trans, extent_root, path, keys,
 				       sizes, 2);
-
 	BUG_ON(ret);
+
 	extent_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
 				     struct btrfs_extent_item);
 	btrfs_set_extent_refs(path->nodes[0], extent_item, 1);
@@ -2336,9 +2391,9 @@ int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
 	block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
 	cache_block_group(root, block_group);
 
-	clear_extent_dirty(&root->fs_info->free_space_cache,
-			   ins->objectid, ins->objectid + ins->offset - 1,
-			   GFP_NOFS);
+	ret = btrfs_remove_free_space(block_group, ins->objectid, ins->offset);
+	BUG_ON(ret);
+
 	ret = __btrfs_alloc_reserved_extent(trans, root, root_objectid,
 					    ref_generation, owner,
 					    owner_offset, ins);
@@ -2843,31 +2898,24 @@ out:
 
 int btrfs_free_block_groups(struct btrfs_fs_info *info)
 {
-	u64 start;
-	u64 end;
-	u64 ptr;
-	int ret;
+	struct btrfs_block_group_cache *block_group;
+	struct rb_node *n;
 
 	mutex_lock(&info->alloc_mutex);
-	while(1) {
-		ret = find_first_extent_bit(&info->block_group_cache, 0,
-					    &start, &end, (unsigned int)-1);
-		if (ret)
-			break;
-		ret = get_state_private(&info->block_group_cache, start, &ptr);
-		if (!ret)
-			kfree((void *)(unsigned long)ptr);
-		clear_extent_bits(&info->block_group_cache, start,
-				  end, (unsigned int)-1, GFP_NOFS);
-	}
-	while(1) {
-		ret = find_first_extent_bit(&info->free_space_cache, 0,
-					    &start, &end, EXTENT_DIRTY);
-		if (ret)
-			break;
-		clear_extent_dirty(&info->free_space_cache, start,
-				   end, GFP_NOFS);
-	}
+	spin_lock(&info->block_group_cache_lock);
+	while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
+		block_group = rb_entry(n, struct btrfs_block_group_cache,
+				       cache_node);
+
+		btrfs_remove_free_space_cache(block_group);
+		rb_erase(&block_group->cache_node,
+			 &info->block_group_cache_tree);
+		spin_lock(&block_group->space_info->lock);
+		list_del(&block_group->list);
+		spin_unlock(&block_group->space_info->lock);
+		kfree(block_group);
+	}
+	spin_unlock(&info->block_group_cache_lock);
 	mutex_unlock(&info->alloc_mutex);
 	return 0;
 }
@@ -3386,7 +3434,6 @@ int btrfs_shrink_extent_tree(struct btrfs_root *root, u64 shrink_start)
 	u64 total_found;
 	u64 shrink_last_byte;
 	struct btrfs_block_group_cache *shrink_block_group;
-	struct btrfs_fs_info *info = root->fs_info;
 	struct btrfs_key key;
 	struct btrfs_key found_key;
 	struct extent_buffer *leaf;
@@ -3542,15 +3589,17 @@ next:
 		goto out;
 	}
 
-	clear_extent_bits(&info->block_group_cache, key.objectid,
-			  key.objectid + key.offset - 1,
-			  (unsigned int)-1, GFP_NOFS);
-
-
-	clear_extent_bits(&info->free_space_cache,
-			   key.objectid, key.objectid + key.offset - 1,
-			   (unsigned int)-1, GFP_NOFS);
+	spin_lock(&root->fs_info->block_group_cache_lock);
+	rb_erase(&shrink_block_group->cache_node,
+		 &root->fs_info->block_group_cache_tree);
+	spin_unlock(&root->fs_info->block_group_cache_lock);
 
+	ret = btrfs_remove_free_space(shrink_block_group, key.objectid,
+				      key.offset);
+	if (ret) {
+		btrfs_end_transaction(trans, root);
+		goto out;
+	}
 	/*
 	memset(shrink_block_group, 0, sizeof(*shrink_block_group));
 	kfree(shrink_block_group);
@@ -3566,9 +3615,9 @@ next:
 	/* the code to unpin extents might set a few bits in the free
 	 * space cache for this range again
 	 */
-	clear_extent_bits(&info->free_space_cache,
-			   key.objectid, key.objectid + key.offset - 1,
-			   (unsigned int)-1, GFP_NOFS);
+	/* XXX? */
+	ret = btrfs_remove_free_space(shrink_block_group, key.objectid,
+				      key.offset);
 out:
 	btrfs_free_path(path);
 	mutex_unlock(&root->fs_info->alloc_mutex);
@@ -3616,16 +3665,13 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 {
 	struct btrfs_path *path;
 	int ret;
-	int bit;
 	struct btrfs_block_group_cache *cache;
 	struct btrfs_fs_info *info = root->fs_info;
 	struct btrfs_space_info *space_info;
-	struct extent_io_tree *block_group_cache;
 	struct btrfs_key key;
 	struct btrfs_key found_key;
 	struct extent_buffer *leaf;
 
-	block_group_cache = &info->block_group_cache;
 	root = info->extent_root;
 	key.objectid = 0;
 	key.offset = 0;
@@ -3653,6 +3699,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 		}
 
 		spin_lock_init(&cache->lock);
+		INIT_LIST_HEAD(&cache->list);
 		read_extent_buffer(leaf, &cache->item,
 				   btrfs_item_ptr_offset(leaf, path->slots[0]),
 				   sizeof(cache->item));
@@ -3661,31 +3708,19 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 		key.objectid = found_key.objectid + found_key.offset;
 		btrfs_release_path(root, path);
 		cache->flags = btrfs_block_group_flags(&cache->item);
-		bit = 0;
-		if (cache->flags & BTRFS_BLOCK_GROUP_DATA) {
-			bit = BLOCK_GROUP_DATA;
-		} else if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
-			bit = BLOCK_GROUP_SYSTEM;
-		} else if (cache->flags & BTRFS_BLOCK_GROUP_METADATA) {
-			bit = BLOCK_GROUP_METADATA;
-		}
-		set_avail_alloc_bits(info, cache->flags);
 
 		ret = update_space_info(info, cache->flags, found_key.offset,
 					btrfs_block_group_used(&cache->item),
 					&space_info);
 		BUG_ON(ret);
 		cache->space_info = space_info;
+		spin_lock(&space_info->lock);
+		list_add(&cache->list, &space_info->block_groups);
+		spin_unlock(&space_info->lock);
+
+		ret = btrfs_add_block_group_cache(root->fs_info, cache);
+		BUG_ON(ret);
 
-		/* use EXTENT_LOCKED to prevent merging */
-		set_extent_bits(block_group_cache, found_key.objectid,
-				found_key.objectid + found_key.offset - 1,
-				EXTENT_LOCKED, GFP_NOFS);
-		set_state_private(block_group_cache, found_key.objectid,
-				  (unsigned long)cache);
-		set_extent_bits(block_group_cache, found_key.objectid,
-				found_key.objectid + found_key.offset - 1,
-				bit | EXTENT_LOCKED, GFP_NOFS);
 		if (key.objectid >=
 		    btrfs_super_total_bytes(&info->super_copy))
 			break;
@@ -3703,22 +3738,22 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 			   u64 size)
 {
 	int ret;
-	int bit = 0;
 	struct btrfs_root *extent_root;
 	struct btrfs_block_group_cache *cache;
-	struct extent_io_tree *block_group_cache;
 
 	WARN_ON(!mutex_is_locked(&root->fs_info->alloc_mutex));
 	extent_root = root->fs_info->extent_root;
-	block_group_cache = &root->fs_info->block_group_cache;
 
 	root->fs_info->last_trans_new_blockgroup = trans->transid;
 
 	cache = kzalloc(sizeof(*cache), GFP_NOFS);
-	BUG_ON(!cache);
+	if (!cache)
+		return -ENOMEM;
+
 	cache->key.objectid = chunk_offset;
 	cache->key.offset = size;
 	spin_lock_init(&cache->lock);
+	INIT_LIST_HEAD(&cache->list);
 	btrfs_set_key_type(&cache->key, BTRFS_BLOCK_GROUP_ITEM_KEY);
 
 	btrfs_set_block_group_used(&cache->item, bytes_used);
@@ -3729,16 +3764,12 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 	ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
 				&cache->space_info);
 	BUG_ON(ret);
+	spin_lock(&cache->space_info->lock);
+	list_add(&cache->list, &cache->space_info->block_groups);
+	spin_unlock(&cache->space_info->lock);
 
-	bit = block_group_state_bits(type);
-	set_extent_bits(block_group_cache, chunk_offset,
-			chunk_offset + size - 1,
-			EXTENT_LOCKED, GFP_NOFS);
-	set_state_private(block_group_cache, chunk_offset,
-			  (unsigned long)cache);
-	set_extent_bits(block_group_cache, chunk_offset,
-			chunk_offset + size - 1,
-			bit | EXTENT_LOCKED, GFP_NOFS);
+	ret = btrfs_add_block_group_cache(root->fs_info, cache);
+	BUG_ON(ret);
 
 	ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item,
 				sizeof(cache->item));
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 319a0c7a4a58..8624f3e88036 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2634,6 +2634,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
 	if (eb) {
 		atomic_inc(&eb->refs);
 		spin_unlock(&tree->buffer_lock);
+		mark_page_accessed(eb->first_page);
 		return eb;
 	}
 	spin_unlock(&tree->buffer_lock);
@@ -2713,6 +2714,9 @@ struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
 		atomic_inc(&eb->refs);
 	spin_unlock(&tree->buffer_lock);
 
+	if (eb)
+		mark_page_accessed(eb->first_page);
+
 	return eb;
 }
 EXPORT_SYMBOL(find_extent_buffer);
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
new file mode 100644
index 000000000000..01c26e8ae555
--- /dev/null
+++ b/fs/btrfs/free-space-cache.c
@@ -0,0 +1,415 @@
+/*
+ * Copyright (C) 2008 Red Hat.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/sched.h>
+#include "ctree.h"
+
+static int tree_insert_offset(struct rb_root *root, u64 offset,
+			      struct rb_node *node)
+{
+	struct rb_node **p = &root->rb_node;
+	struct rb_node *parent = NULL;
+	struct btrfs_free_space *info;
+
+	while (*p) {
+		parent = *p;
+		info = rb_entry(parent, struct btrfs_free_space, offset_index);
+
+		if (offset < info->offset)
+			p = &(*p)->rb_left;
+		else if (offset > info->offset)
+			p = &(*p)->rb_right;
+		else
+			return -EEXIST;
+	}
+
+	rb_link_node(node, parent, p);
+	rb_insert_color(node, root);
+
+	return 0;
+}
+
+static int tree_insert_bytes(struct rb_root *root, u64 bytes,
+			     struct rb_node *node)
+{
+	struct rb_node **p = &root->rb_node;
+	struct rb_node *parent = NULL;
+	struct btrfs_free_space *info;
+
+	while (*p) {
+		parent = *p;
+		info = rb_entry(parent, struct btrfs_free_space, bytes_index);
+
+		if (bytes < info->bytes)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
+	}
+
+	rb_link_node(node, parent, p);
+	rb_insert_color(node, root);
+
+	return 0;
+}
+
+/*
+ * searches the tree for the given offset.  If contains is set we will return
+ * the free space that contains the given offset.  If contains is not set we
+ * will return the free space that starts at or after the given offset and is
+ * at least bytes long.
+ */
+static struct btrfs_free_space *tree_search_offset(struct rb_root *root,
+						   u64 offset, u64 bytes,
+						   int contains)
+{
+	struct rb_node *n = root->rb_node;
+	struct btrfs_free_space *entry, *ret = NULL;
+
+	while (n) {
+		entry = rb_entry(n, struct btrfs_free_space, offset_index);
+
+		if (offset < entry->offset) {
+			if (!contains &&
+			    (!ret || entry->offset < ret->offset) &&
+			    (bytes <= entry->bytes))
+				ret = entry;
+			n = n->rb_left;
+		} else if (offset > entry->offset) {
+			if (contains &&
+			    (entry->offset + entry->bytes - 1) >= offset) {
+				ret = entry;
+				break;
+			}
+			n = n->rb_right;
+		} else {
+			if (bytes > entry->bytes) {
+				n = n->rb_right;
+				continue;
+			}
+			ret = entry;
+			break;
+		}
+	}
+
+	return ret;
+}
+
+/*
+ * return a chunk at least bytes size, as close to offset as we can get.
+ */
+static struct btrfs_free_space *tree_search_bytes(struct rb_root *root,
+						  u64 offset, u64 bytes)
+{
+	struct rb_node *n = root->rb_node;
+	struct btrfs_free_space *entry, *ret = NULL;
+
+	while (n) {
+		entry = rb_entry(n, struct btrfs_free_space, bytes_index);
+
+		if (bytes < entry->bytes) {
+			/*
+			 * We prefer to get a hole size as close to the size we
+			 * are asking for so we don't take small slivers out of
+			 * huge holes, but we also want to get as close to the
+			 * offset as possible so we don't have a whole lot of
+			 * fragmentation.
+			 */
+			if (offset <= entry->offset) {
+				if (!ret)
+					ret = entry;
+				else if (entry->bytes < ret->bytes)
+					ret = entry;
+				else if (entry->offset < ret->offset)
+					ret = entry;
+			}
+			n = n->rb_left;
+		} else if (bytes > entry->bytes) {
+			n = n->rb_right;
+		} else {
+			/*
+			 * Ok we may have multiple chunks of the wanted size,
+			 * so we don't want to take the first one we find, we
+			 * want to take the one closest to our given offset, so
+			 * keep searching just in case there's a better match.
+			 */
+			n = n->rb_right;
+			if (offset > entry->offset)
+				continue;
+			else if (!ret || entry->offset < ret->offset)
+				ret = entry;
+		}
+	}
+
+	return ret;
+}
+
+static void unlink_free_space(struct btrfs_block_group_cache *block_group,
+			      struct btrfs_free_space *info)
+{
+	rb_erase(&info->offset_index, &block_group->free_space_offset);
+	rb_erase(&info->bytes_index, &block_group->free_space_bytes);
+}
+
+static int link_free_space(struct btrfs_block_group_cache *block_group,
+			   struct btrfs_free_space *info)
+{
+	int ret = 0;
+
+
+	ret = tree_insert_offset(&block_group->free_space_offset, info->offset,
+				 &info->offset_index);
+	if (ret)
+		return ret;
+
+	ret = tree_insert_bytes(&block_group->free_space_bytes, info->bytes,
+				&info->bytes_index);
+	if (ret)
+		return ret;
+
+	return ret;
+}
+
+int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
+			 u64 offset, u64 bytes)
+{
+	struct btrfs_free_space *right_info;
+	struct btrfs_free_space *left_info;
+	struct btrfs_free_space *info = NULL;
+	struct btrfs_free_space *alloc_info;
+	int ret = 0;
+
+	alloc_info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS);
+	if (!alloc_info)
+		return -ENOMEM;
+
+	/*
+	 * first we want to see if there is free space adjacent to the range we
+	 * are adding; if there is, remove that struct and add a new one to
+	 * cover the entire range
+	 */
+	spin_lock(&block_group->lock);
+
+	right_info = tree_search_offset(&block_group->free_space_offset,
+					offset+bytes, 0, 1);
+	left_info = tree_search_offset(&block_group->free_space_offset,
+				       offset-1, 0, 1);
+
+	if (right_info && right_info->offset == offset+bytes) {
+		unlink_free_space(block_group, right_info);
+		info = right_info;
+		info->offset = offset;
+		info->bytes += bytes;
+	} else if (right_info && right_info->offset != offset+bytes) {
+		printk(KERN_ERR "adding space in the middle of an existing "
+		       "free space area. existing: offset=%Lu, bytes=%Lu. "
+		       "new: offset=%Lu, bytes=%Lu\n", right_info->offset,
+		       right_info->bytes, offset, bytes);
+		BUG();
+	}
+
+	if (left_info) {
+		unlink_free_space(block_group, left_info);
+
+		if (unlikely((left_info->offset + left_info->bytes) !=
+			     offset)) {
+			printk(KERN_ERR "free space to the left of new free "
+			       "space isn't quite right. existing: offset=%Lu,"
+			       " bytes=%Lu. new: offset=%Lu, bytes=%Lu\n",
+			       left_info->offset, left_info->bytes, offset,
+			       bytes);
+			BUG();
+		}
+
+		if (info) {
+			info->offset = left_info->offset;
+			info->bytes += left_info->bytes;
+			kfree(left_info);
+		} else {
+			info = left_info;
+			info->bytes += bytes;
+		}
+	}
+
+	if (info) {
+		ret = link_free_space(block_group, info);
+		if (!ret)
+			info = NULL;
+		goto out;
+	}
+
+	info = alloc_info;
+	alloc_info = NULL;
+	info->offset = offset;
+	info->bytes = bytes;
+
+	ret = link_free_space(block_group, info);
+	if (ret)
+		kfree(info);
+out:
+	spin_unlock(&block_group->lock);
+	if (ret) {
+		printk(KERN_ERR "btrfs: unable to add free space :%d\n", ret);
+		if (ret == -EEXIST)
+			BUG();
+	}
+
+	if (alloc_info)
+		kfree(alloc_info);
+
+	return ret;
+}
+
+int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
+			    u64 offset, u64 bytes)
+{
+	struct btrfs_free_space *info;
+	int ret = 0;
+
+	spin_lock(&block_group->lock);
+	info = tree_search_offset(&block_group->free_space_offset, offset, 0,
+				  1);
+
+	if (info && info->offset == offset) {
+		if (info->bytes < bytes) {
+			printk(KERN_ERR "Found free space at %Lu, size %Lu,"
+			       "trying to use %Lu\n",
+			       info->offset, info->bytes, bytes);
+			WARN_ON(1);
+			ret = -EINVAL;
+			goto out;
+		}
+
+		unlink_free_space(block_group, info);
+
+		if (info->bytes == bytes) {
+			kfree(info);
+			goto out;
+		}
+
+		info->offset += bytes;
+		info->bytes -= bytes;
+
+		ret = link_free_space(block_group, info);
+		BUG_ON(ret);
+	} else {
+		WARN_ON(1);
+	}
+out:
+	spin_unlock(&block_group->lock);
+	return ret;
+}
+
+void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
+			   u64 bytes)
+{
+	struct btrfs_free_space *info;
+	struct rb_node *n;
+	int count = 0;
+
+	for (n = rb_first(&block_group->free_space_offset); n; n = rb_next(n)) {
+		info = rb_entry(n, struct btrfs_free_space, offset_index);
+		if (info->bytes >= bytes)
+			count++;
+		//printk(KERN_INFO "offset=%Lu, bytes=%Lu\n", info->offset,
+		//       info->bytes);
+	}
+	printk(KERN_INFO "%d blocks of free space at or bigger than bytes is"
+	       "\n", count);
+}
+
+u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group)
+{
+	struct btrfs_free_space *info;
+	struct rb_node *n;
+	u64 ret = 0;
+
+	for (n = rb_first(&block_group->free_space_offset); n;
+	     n = rb_next(n)) {
+		info = rb_entry(n, struct btrfs_free_space, offset_index);
+		ret += info->bytes;
+	}
+
+	return ret;
+}
+
+void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
+{
+	struct btrfs_free_space *info;
+	struct rb_node *node;
+
+	spin_lock(&block_group->lock);
+	while ((node = rb_last(&block_group->free_space_bytes)) != NULL) {
+		info = rb_entry(node, struct btrfs_free_space, bytes_index);
+		unlink_free_space(block_group, info);
+		kfree(info);
+		if (need_resched()) {
+			spin_unlock(&block_group->lock);
+			cond_resched();
+			spin_lock(&block_group->lock);
+		}
+	}
+	spin_unlock(&block_group->lock);
+}
+
+struct btrfs_free_space *btrfs_find_free_space_offset(struct
+						      btrfs_block_group_cache
+						      *block_group, u64 offset,
+						      u64 bytes)
+{
+	struct btrfs_free_space *ret;
+
+	spin_lock(&block_group->lock);
+	ret = tree_search_offset(&block_group->free_space_offset, offset,
+				 bytes, 0);
+	spin_unlock(&block_group->lock);
+
+	return ret;
+}
+
+struct btrfs_free_space *btrfs_find_free_space_bytes(struct
+						     btrfs_block_group_cache
+						     *block_group, u64 offset,
+						     u64 bytes)
+{
+	struct btrfs_free_space *ret;
+
+	spin_lock(&block_group->lock);
+
+	ret = tree_search_bytes(&block_group->free_space_bytes, offset, bytes);
+	spin_unlock(&block_group->lock);
+
+	return ret;
+}
+
+struct btrfs_free_space *btrfs_find_free_space(struct btrfs_block_group_cache
+					       *block_group, u64 offset,
+					       u64 bytes)
+{
+	struct btrfs_free_space *ret;
+
+	spin_lock(&block_group->lock);
+	ret = tree_search_offset(&block_group->free_space_offset, offset,
+				 bytes, 0);
+	if (!ret)
+		ret = tree_search_bytes(&block_group->free_space_bytes,
+					offset, bytes);
+
+	spin_unlock(&block_group->lock);
+
+	return ret;
+}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 12c1c0530f3d..65b4f864b0df 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -141,7 +141,7 @@ static int cow_file_range(struct inode *inode, u64 start, u64 end)
 	while(num_bytes > 0) {
 		cur_alloc_size = min(num_bytes, root->fs_info->max_extent);
 		ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
-					   root->sectorsize, 0, 0,
+					   root->sectorsize, 0, alloc_hint,
 					   (u64)-1, &ins, 1);
 		if (ret) {
 			WARN_ON(1);
@@ -558,7 +558,6 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 					  trans->transid, inode->i_ino,
 					  ordered_extent->file_offset, &ins);
 	BUG_ON(ret);
-
 	mutex_lock(&BTRFS_I(inode)->extent_mutex);
 
 	ret = btrfs_drop_extents(trans, root, inode,
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 1546fa6f4f7d..b9e5c2d82dde 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -64,8 +64,8 @@ static void lock_chunks(struct btrfs_root *root)
 
 static void unlock_chunks(struct btrfs_root *root)
 {
-	mutex_unlock(&root->fs_info->alloc_mutex);
 	mutex_unlock(&root->fs_info->chunk_mutex);
+	mutex_unlock(&root->fs_info->alloc_mutex);
 }
 
 int btrfs_cleanup_fs_uuids(void)
@@ -1668,8 +1668,13 @@ again:
 	else
 		min_free = calc_size;
 
-	/* we add 1MB because we never use the first 1MB of the device */
-	min_free += 1024 * 1024;
+	/*
+	 * we add 1MB because we never use the first 1MB of the device, unless
+	 * we've looped, then we are likely allocating the maximum amount of
+	 * space left already
+	 */
+	if (!looped)
+		min_free += 1024 * 1024;
 
 	/* build a private list of devices we will allocate from */
 	while(index < num_stripes) {
-- 
cgit v1.2.3


From 2b1f55b0f0d0d1a66470ef4ea2696cd5dd741a12 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 24 Sep 2008 11:48:04 -0400
Subject: Remove Btrfs compat code for older kernels

Btrfs had compatibility code for kernels back to 2.6.18.  This code has
been removed, and will be maintained in a separate backport
git tree from now on.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/async-thread.c |   6 ---
 fs/btrfs/compat.h       |  40 +-------------
 fs/btrfs/crc32c.h       |   6 ---
 fs/btrfs/ctree.h        |   7 +--
 fs/btrfs/disk-io.c      |  28 ----------
 fs/btrfs/export.c       |   6 ---
 fs/btrfs/extent_io.c    |  66 ++---------------------
 fs/btrfs/file.c         |  24 +--------
 fs/btrfs/inode.c        | 135 +-----------------------------------------------
 fs/btrfs/super.c        |   7 ---
 fs/btrfs/sysfs.c        |  33 ------------
 fs/btrfs/volumes.c      |  20 -------
 12 files changed, 10 insertions(+), 368 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 2ee301740195..4e780b279de6 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -20,13 +20,7 @@
 #include <linux/kthread.h>
 #include <linux/list.h>
 #include <linux/spinlock.h>
-
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20)
 # include <linux/freezer.h>
-#else
-# include <linux/sched.h>
-#endif
-
 #include "async-thread.h"
 
 /*
diff --git a/fs/btrfs/compat.h b/fs/btrfs/compat.h
index b0ed1887d9b1..cd6598b169df 100644
--- a/fs/btrfs/compat.h
+++ b/fs/btrfs/compat.h
@@ -1,9 +1,8 @@
 #ifndef _COMPAT_H_
 #define _COMPAT_H_
 
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,26)
-#define trylock_page(page) (!TestSetPageLocked(page))
-#endif
+#define btrfs_drop_nlink(inode) drop_nlink(inode)
+#define btrfs_inc_nlink(inode)	inc_nlink(inode)
 
 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,27)
 static inline struct dentry *d_obtain_alias(struct inode *inode)
@@ -22,39 +21,4 @@ static inline struct dentry *d_obtain_alias(struct inode *inode)
 }
 #endif
 
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
-static inline void btrfs_drop_nlink(struct inode *inode)
-{
-	inode->i_nlink--;
-}
-
-static inline void btrfs_inc_nlink(struct inode *inode)
-{
-	inode->i_nlink++;
-}
-#else
-# define btrfs_drop_nlink(inode) drop_nlink(inode)
-# define btrfs_inc_nlink(inode)	inc_nlink(inode)
-#endif
-
-/*
- * Even if AppArmor isn't enabled, it still has different prototypes.
- * Add more distro/version pairs here to declare which has AppArmor applied.
- */
-#if defined(CONFIG_SUSE_KERNEL)
-# if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,22)
-# define REMOVE_SUID_PATH 1
-# endif
-#endif
-
-/*
- * catch any other distros that have patched in apparmor.  This isn't
- * 100% reliable because it won't catch people that hand compile their
- * own distro kernels without apparmor compiled in.  But, it is better
- * than nothing.
- */
-#ifdef CONFIG_SECURITY_APPARMOR
-# define REMOVE_SUID_PATH 1
-#endif
-
 #endif /* _COMPAT_H_ */
diff --git a/fs/btrfs/crc32c.h b/fs/btrfs/crc32c.h
index bf6c12e85730..4f0fefed132a 100644
--- a/fs/btrfs/crc32c.h
+++ b/fs/btrfs/crc32c.h
@@ -96,13 +96,7 @@ static inline u32 __btrfs_crc32c(u32 crc, unsigned char const *address,
  * We must workaround older implementations of crc32c_le()
  * found on older kernel versions.
  */
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,23)
-#define btrfs_crc32c(seed, data, length) \
-	__cpu_to_le32( __btrfs_crc32c( __le32_to_cpu(seed), \
-		                      (unsigned char const *)data, length) )
-#else
 #define btrfs_crc32c(seed, data, length) \
 	__btrfs_crc32c(seed, (unsigned char const *)data, length)
 #endif
-#endif
 
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 138c157bbc45..3b3c1ca50c5d 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1472,12 +1472,9 @@ static inline u32 btrfs_level_size(struct btrfs_root *root, int level) {
 	((unsigned long)(btrfs_leaf_data(leaf) + \
 	btrfs_item_offset_nr(leaf, slot)))
 
-static inline struct dentry *fdentry(struct file *file) {
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
-	return file->f_dentry;
-#else
+static inline struct dentry *fdentry(struct file *file)
+{
 	return file->f_path.dentry;
-#endif
 }
 
 /* extent-tree.c */
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index d35ca6a3f513..dffb8dabd533 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -26,11 +26,7 @@
 #include <linux/buffer_head.h> // for block_sync_page
 #include <linux/workqueue.h>
 #include <linux/kthread.h>
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,20)
 # include <linux/freezer.h>
-#else
-# include <linux/sched.h>
-#endif
 #include "crc32c.h"
 #include "ctree.h"
 #include "disk-io.h"
@@ -373,21 +369,11 @@ out:
 	return ret;
 }
 
-#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
 static void end_workqueue_bio(struct bio *bio, int err)
-#else
-static int end_workqueue_bio(struct bio *bio,
-				   unsigned int bytes_done, int err)
-#endif
 {
 	struct end_io_wq *end_io_wq = bio->bi_private;
 	struct btrfs_fs_info *fs_info;
 
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
-	if (bio->bi_size)
-		return 1;
-#endif
-
 	fs_info = end_io_wq->info;
 	end_io_wq->error = err;
 	end_io_wq->work.func = end_workqueue_fn;
@@ -397,10 +383,6 @@ static int end_workqueue_bio(struct bio *bio,
 				   &end_io_wq->work);
 	else
 		btrfs_queue_worker(&fs_info->endio_workers, &end_io_wq->work);
-
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
-	return 0;
-#endif
 }
 
 int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
@@ -1161,9 +1143,7 @@ void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
 
 static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
 {
-#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
 	bdi_init(bdi);
-#endif
 	bdi->ra_pages	= default_backing_dev_info.ra_pages;
 	bdi->state		= 0;
 	bdi->capabilities	= default_backing_dev_info.capabilities;
@@ -1242,11 +1222,7 @@ static void end_workqueue_fn(struct btrfs_work *work)
 	bio->bi_private = end_io_wq->private;
 	bio->bi_end_io = end_io_wq->end_io;
 	kfree(end_io_wq);
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
-	bio_endio(bio, bio->bi_size, error);
-#else
 	bio_endio(bio, error);
-#endif
 }
 
 static int cleaner_kthread(void *arg)
@@ -1673,9 +1649,7 @@ fail:
 
 	kfree(extent_root);
 	kfree(tree_root);
-#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
 	bdi_destroy(&fs_info->bdi);
-#endif
 	kfree(fs_info);
 	return ERR_PTR(err);
 }
@@ -1936,9 +1910,7 @@ int close_ctree(struct btrfs_root *root)
 	btrfs_close_devices(fs_info->fs_devices);
 	btrfs_mapping_tree_free(&fs_info->mapping_tree);
 
-#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
 	bdi_destroy(&fs_info->bdi);
-#endif
 
 	kfree(fs_info->extent_root);
 	kfree(fs_info->tree_root);
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 2b357a6d2407..48b82cd7583c 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -7,12 +7,6 @@
 #include "export.h"
 #include "compat.h"
 
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28)
-#define FILEID_BTRFS_WITHOUT_PARENT		0x4d
-#define FILEID_BTRFS_WITH_PARENT 		0x4e
-#define FILEID_BTRFS_WITH_PARENT_ROOT 		0x4f
-#endif
-
 #define BTRFS_FID_SIZE_NON_CONNECTABLE		(offsetof(struct btrfs_fid, parent_objectid)/4)
 #define BTRFS_FID_SIZE_CONNECTABLE		(offsetof(struct btrfs_fid, parent_root_objectid)/4)
 #define BTRFS_FID_SIZE_CONNECTABLE_ROOT		(sizeof(struct btrfs_fid)/4)
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 58ad25838a41..e3a25be5c663 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1397,12 +1397,7 @@ static int check_page_writeback(struct extent_io_tree *tree,
  * Scheduling is not allowed, so the extent state tree is expected
  * to have one and only one object corresponding to this IO.
  */
-#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
 static void end_bio_extent_writepage(struct bio *bio, int err)
-#else
-static int end_bio_extent_writepage(struct bio *bio,
-				   unsigned int bytes_done, int err)
-#endif
 {
 	int uptodate = err == 0;
 	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
@@ -1412,10 +1407,6 @@ static int end_bio_extent_writepage(struct bio *bio,
 	int whole_page;
 	int ret;
 
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
-	if (bio->bi_size)
-		return 1;
-#endif
 	do {
 		struct page *page = bvec->bv_page;
 		tree = &BTRFS_I(page->mapping->host)->io_tree;
@@ -1461,10 +1452,8 @@ static int end_bio_extent_writepage(struct bio *bio,
 		else
 			check_page_writeback(tree, page);
 	} while (bvec >= bio->bi_io_vec);
+
 	bio_put(bio);
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
-	return 0;
-#endif
 }
 
 /*
@@ -1478,12 +1467,7 @@ static int end_bio_extent_writepage(struct bio *bio,
  * Scheduling is not allowed, so the extent state tree is expected
  * to have one and only one object corresponding to this IO.
  */
-#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
 static void end_bio_extent_readpage(struct bio *bio, int err)
-#else
-static int end_bio_extent_readpage(struct bio *bio,
-				   unsigned int bytes_done, int err)
-#endif
 {
 	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
@@ -1493,11 +1477,6 @@ static int end_bio_extent_readpage(struct bio *bio,
 	int whole_page;
 	int ret;
 
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
-	if (bio->bi_size)
-		return 1;
-#endif
-
 	do {
 		struct page *page = bvec->bv_page;
 		tree = &BTRFS_I(page->mapping->host)->io_tree;
@@ -1556,9 +1535,6 @@ static int end_bio_extent_readpage(struct bio *bio,
 	} while (bvec >= bio->bi_io_vec);
 
 	bio_put(bio);
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
-	return 0;
-#endif
 }
 
 /*
@@ -1566,12 +1542,7 @@ static int end_bio_extent_readpage(struct bio *bio,
  * the structs in the extent tree when done, and set the uptodate bits
  * as appropriate.
  */
-#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
 static void end_bio_extent_preparewrite(struct bio *bio, int err)
-#else
-static int end_bio_extent_preparewrite(struct bio *bio,
-				       unsigned int bytes_done, int err)
-#endif
 {
 	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
@@ -1579,11 +1550,6 @@ static int end_bio_extent_preparewrite(struct bio *bio,
 	u64 start;
 	u64 end;
 
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
-	if (bio->bi_size)
-		return 1;
-#endif
-
 	do {
 		struct page *page = bvec->bv_page;
 		tree = &BTRFS_I(page->mapping->host)->io_tree;
@@ -1607,9 +1573,6 @@ static int end_bio_extent_preparewrite(struct bio *bio,
 	} while (bvec >= bio->bi_io_vec);
 
 	bio_put(bio);
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
-	return 0;
-#endif
 }
 
 static struct bio *
@@ -2079,12 +2042,6 @@ done:
 	return 0;
 }
 
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,22)
-/* Taken directly from 2.6.23 with a mod for a lockpage hook */
-typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc,
-                                void *data);
-#endif
-
 /**
  * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
  * @mapping: address space structure to write
@@ -2201,10 +2158,9 @@ retry:
 	}
 	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
 		mapping->writeback_index = index;
-#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,26)
+
 	if (wbc->range_cont)
 		wbc->range_start = index << PAGE_CACHE_SHIFT;
-#endif
 	return ret;
 }
 EXPORT_SYMBOL(extent_write_cache_pages);
@@ -2560,18 +2516,10 @@ static inline struct page *extent_buffer_page(struct extent_buffer *eb,
 	 * by increasing the reference count.  So we know the page must
 	 * be in the radix tree.
 	 */
-#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,26)
 	rcu_read_lock();
-#else
-	read_lock_irq(&mapping->tree_lock);
-#endif
 	p = radix_tree_lookup(&mapping->page_tree, i);
-
-#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,26)
 	rcu_read_unlock();
-#else
-	read_unlock_irq(&mapping->tree_lock);
-#endif
+
 	return p;
 }
 
@@ -2773,21 +2721,13 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
 			}
 		}
 		clear_page_dirty_for_io(page);
-#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,26)
 		spin_lock_irq(&page->mapping->tree_lock);
-#else
-		read_lock_irq(&page->mapping->tree_lock);
-#endif
 		if (!PageDirty(page)) {
 			radix_tree_tag_clear(&page->mapping->page_tree,
 						page_index(page),
 						PAGECACHE_TAG_DIRTY);
 		}
-#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,26)
 		spin_unlock_irq(&page->mapping->tree_lock);
-#else
-		read_unlock_irq(&page->mapping->tree_lock);
-#endif
 		unlock_page(page);
 	}
 	return 0;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 48a702d41c8c..8856570a0ebd 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -871,15 +871,8 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 		goto out_nolock;
 	if (count == 0)
 		goto out_nolock;
-#ifdef REMOVE_SUID_PATH
-	err = remove_suid(&file->f_path);
-#else
-# if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,26)
+
 	err = file_remove_suid(file);
-# else
-	err = remove_suid(fdentry(file));
-# endif
-#endif
 	if (err)
 		goto out_nolock;
 	file_update_time(file);
@@ -1003,17 +996,10 @@ out_nolock:
 			btrfs_commit_transaction(trans, root);
 		}
 	} else if (num_written > 0 && (file->f_flags & O_DIRECT)) {
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,22)
-		do_sync_file_range(file, start_pos,
-				      start_pos + num_written - 1,
-				      SYNC_FILE_RANGE_WRITE |
-				      SYNC_FILE_RANGE_WAIT_AFTER);
-#else
 		do_sync_mapping_range(inode->i_mapping, start_pos,
 				      start_pos + num_written - 1,
 				      SYNC_FILE_RANGE_WRITE |
 				      SYNC_FILE_RANGE_WAIT_AFTER);
-#endif
 		invalidate_mapping_pages(inode->i_mapping,
 		      start_pos >> PAGE_CACHE_SHIFT,
 		     (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT);
@@ -1097,12 +1083,7 @@ out:
 }
 
 static struct vm_operations_struct btrfs_file_vm_ops = {
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,23)
-	.nopage         = filemap_nopage,
-	.populate       = filemap_populate,
-#else
 	.fault		= filemap_fault,
-#endif
 	.page_mkwrite	= btrfs_page_mkwrite,
 };
 
@@ -1118,9 +1099,6 @@ struct file_operations btrfs_file_operations = {
 	.read		= do_sync_read,
 	.aio_read       = generic_file_aio_read,
 	.splice_read	= generic_file_splice_read,
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
-	.sendfile	= generic_file_sendfile,
-#endif
 	.write		= btrfs_file_write,
 	.mmap		= btrfs_file_mmap,
 	.open		= generic_file_open,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index adb169d739ce..48a3dc030807 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2073,104 +2073,6 @@ err:
 	return ret;
 }
 
-/* Kernels earlier than 2.6.28 still have the NFS deadlock where nfsd
-   will call the file system's ->lookup() method from within its
-   filldir callback, which in turn was called from the file system's
-   ->readdir() method. And will deadlock for many file systems. */
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28)
-
-struct nfshack_dirent {
-	u64		ino;
-	loff_t		offset;
-	int		namlen;
-	unsigned int	d_type;
-	char		name[];
-};
-
-struct nfshack_readdir {
-	char		*dirent;
-	size_t		used;
-	int		full;
-};
-
-
-
-static int btrfs_nfshack_filldir(void *__buf, const char *name, int namlen,
-			      loff_t offset, u64 ino, unsigned int d_type)
-{
-	struct nfshack_readdir *buf = __buf;
-	struct nfshack_dirent *de = (void *)(buf->dirent + buf->used);
-	unsigned int reclen;
-
-	reclen = ALIGN(sizeof(struct nfshack_dirent) + namlen, sizeof(u64));
-	if (buf->used + reclen > PAGE_SIZE) {
-		buf->full = 1;
-		return -EINVAL;
-	}
-
-	de->namlen = namlen;
-	de->offset = offset;
-	de->ino = ino;
-	de->d_type = d_type;
-	memcpy(de->name, name, namlen);
-	buf->used += reclen;
-
-	return 0;
-}
-
-static int btrfs_nfshack_readdir(struct file *file, void *dirent,
-				 filldir_t filldir)
-{
-	struct nfshack_readdir buf;
-	struct nfshack_dirent *de;
-	int err;
-	int size;
-	loff_t offset;
-
-	buf.dirent = (void *)__get_free_page(GFP_KERNEL);
-	if (!buf.dirent)
-		return -ENOMEM;
-
-	offset = file->f_pos;
-
-	do {
-		unsigned int reclen;
-
-		buf.used = 0;
-		buf.full = 0;
-		err = btrfs_real_readdir(file, &buf, btrfs_nfshack_filldir);
-		if (err)
-			break;
-
-		size = buf.used;
-
-		if (!size)
-			break;
-
-		de = (struct nfshack_dirent *)buf.dirent;
-		while (size > 0) {
-			offset = de->offset;
-
-			if (filldir(dirent, de->name, de->namlen, de->offset,
-				    de->ino, de->d_type))
-				goto done;
-			offset = file->f_pos;
-
-			reclen = ALIGN(sizeof(*de) + de->namlen,
-				       sizeof(u64));
-			size -= reclen;
-			de = (struct nfshack_dirent *)((char *)de + reclen);
-		}
-	} while (buf.full);
-
- done:
-	free_page((unsigned long)buf.dirent);
-	file->f_pos = offset;
-
-	return err;
-}
-#endif
-
 int btrfs_write_inode(struct inode *inode, int wait)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -3311,13 +3213,8 @@ unsigned long btrfs_force_ra(struct address_space *mapping,
 {
 	pgoff_t req_size = last_index - offset + 1;
 
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,23)
-	offset = page_cache_readahead(mapping, ra, file, offset, req_size);
-	return offset;
-#else
 	page_cache_sync_readahead(mapping, ra, file, offset, req_size);
 	return offset + req_size;
-#endif
 }
 
 struct inode *btrfs_alloc_inode(struct super_block *sb)
@@ -3373,14 +3270,7 @@ void btrfs_destroy_inode(struct inode *inode)
 	kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
 }
 
-#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,26)
 static void init_once(void *foo)
-#elif LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
-static void init_once(struct kmem_cache * cachep, void *foo)
-#else
-static void init_once(void * foo, struct kmem_cache * cachep,
-		      unsigned long flags)
-#endif
 {
 	struct btrfs_inode *ei = (struct btrfs_inode *) foo;
 
@@ -3403,22 +3293,10 @@ void btrfs_destroy_cachep(void)
 
 struct kmem_cache *btrfs_cache_create(const char *name, size_t size,
 				       unsigned long extra_flags,
-#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,26)
-				       void (*ctor)(void *)
-#elif LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
-				       void (*ctor)(struct kmem_cache *, void *)
-#else
-				       void (*ctor)(void *, struct kmem_cache *,
-						    unsigned long)
-#endif
-				     )
+				       void (*ctor)(void *))
 {
 	return kmem_cache_create(name, size, 0, (SLAB_RECLAIM_ACCOUNT |
-				 SLAB_MEM_SPREAD | extra_flags), ctor
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,23)
-				 ,NULL
-#endif
-				);
+				 SLAB_MEM_SPREAD | extra_flags), ctor);
 }
 
 int btrfs_init_cachep(void)
@@ -3666,12 +3544,7 @@ static int btrfs_set_page_dirty(struct page *page)
 	return __set_page_dirty_nobuffers(page);
 }
 
-#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,26)
 static int btrfs_permission(struct inode *inode, int mask)
-#else
-static int btrfs_permission(struct inode *inode, int mask,
-			    struct nameidata *nd)
-#endif
 {
 	if (btrfs_test_flag(inode, READONLY) && (mask & MAY_WRITE))
 		return -EACCES;
@@ -3702,11 +3575,7 @@ static struct inode_operations btrfs_dir_ro_inode_operations = {
 static struct file_operations btrfs_dir_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28)
-	.readdir	= btrfs_nfshack_readdir,
-#else /* NFSd readdir/lookup deadlock is fixed */
 	.readdir	= btrfs_real_readdir,
-#endif
 	.unlocked_ioctl	= btrfs_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= btrfs_ioctl,
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index f7b3eac7ac6d..8399d6d05d63 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -349,10 +349,7 @@ static int btrfs_fill_super(struct super_block * sb,
 
 	sb->s_root = root_dentry;
 
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,25)
 	save_mount_options(sb, data);
-#endif
-
 	return 0;
 
 fail_close:
@@ -566,11 +563,7 @@ static struct super_operations btrfs_super_ops = {
 	.put_super	= btrfs_put_super,
 	.write_super	= btrfs_write_super,
 	.sync_fs	= btrfs_sync_fs,
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,25)
-	.read_inode     = btrfs_read_locked_inode,
-#else
 	.show_options	= generic_show_options,
-#endif
 	.write_inode	= btrfs_write_inode,
 	.dirty_inode	= btrfs_dirty_inode,
 	.alloc_inode	= btrfs_alloc_inode,
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 61af5d8446e3..300076e66765 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -28,7 +28,6 @@
 #include "disk-io.h"
 #include "transaction.h"
 
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,25)
 static ssize_t root_blocks_used_show(struct btrfs_root *root, char *buf)
 {
 	return snprintf(buf, PAGE_SIZE, "%llu\n",
@@ -267,35 +266,3 @@ void btrfs_exit_sysfs(void)
 	kset_unregister(btrfs_kset);
 }
 
-#else
-
-int btrfs_sysfs_add_super(struct btrfs_fs_info *fs)
-{
-	return 0;
-}
-
-int btrfs_sysfs_add_root(struct btrfs_root *root)
-{
-	return 0;
-}
-
-void btrfs_sysfs_del_root(struct btrfs_root *root)
-{
-	return;
-}
-
-void btrfs_sysfs_del_super(struct btrfs_fs_info *fs)
-{
-	return;
-}
-
-int btrfs_init_sysfs(void)
-{
-	return 0;
-}
-
-void btrfs_exit_sysfs(void)
-{
-	return;
-}
-#endif
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index b9e5c2d82dde..ddf89626498a 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2080,20 +2080,11 @@ int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree,
 }
 
 
-#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
 static void end_bio_multi_stripe(struct bio *bio, int err)
-#else
-static int end_bio_multi_stripe(struct bio *bio,
-				   unsigned int bytes_done, int err)
-#endif
 {
 	struct btrfs_multi_bio *multi = bio->bi_private;
 	int is_orig_bio = 0;
 
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
-	if (bio->bi_size)
-		return 1;
-#endif
 	if (err)
 		atomic_inc(&multi->error);
 
@@ -2122,17 +2113,10 @@ static int end_bio_multi_stripe(struct bio *bio,
 		}
 		kfree(multi);
 
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
-		bio_endio(bio, bio->bi_size, err);
-#else
 		bio_endio(bio, err);
-#endif
 	} else if (!is_orig_bio) {
 		bio_put(bio);
 	}
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
-	return 0;
-#endif
 }
 
 struct async_sched {
@@ -2248,11 +2232,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 		} else {
 			bio->bi_bdev = root->fs_info->fs_devices->latest_bdev;
 			bio->bi_sector = logical >> 9;
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
-			bio_endio(bio, bio->bi_size, -EIO);
-#else
 			bio_endio(bio, -EIO);
-#endif
 		}
 		dev_nr++;
 	}
-- 
cgit v1.2.3


From 1a40e23b95da45051ee4d74374c58ae87a14051c Mon Sep 17 00:00:00 2001
From: Zheng Yan <zheng.yan@oracle.com>
Date: Fri, 26 Sep 2008 10:09:34 -0400
Subject: Btrfs: update space balancing code

This patch updates the space balancing code to utilize the new
backref format.  Before, btrfs-vol -b would break any COW links
on data blocks or metadata.  This was slow and caused the amount
of space used to explode if a large number of snapshots were present.

The new code keeps the sharing of all data extents and
most of the tree blocks.

To maintain the sharing of data extents, the space balance code uses
a separate inode to hold data extent pointers, then updates the references
to point to the new location.

To maintain the sharing of tree blocks, the space balance code uses
reloc trees to relocate tree blocks in reference counted roots.
There is one reloc tree for each subvol, and all reloc trees share
the same root key objectid. Reloc trees are snapshots of the latest
committed roots of subvols (root->commit_root).

To relocate a tree block referenced by a subvol, there are two steps.
COW the block through subvol's reloc tree, then update block pointer in
the subvol to point to the new block. Since all reloc trees share
the same root key objectid, doing special handling for tree blocks
owned by them is easy. Once a tree block has been COWed in one
reloc tree, we can use the resulting new block directly when the
same block needs to be COWed again through other reloc trees.
In this way, relocated tree blocks are shared between reloc trees,
so they are also shared between subvols.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c       |  155 +++-
 fs/btrfs/ctree.h       |   26 +-
 fs/btrfs/disk-io.c     |    9 +
 fs/btrfs/extent-tree.c | 2074 ++++++++++++++++++++++++++++++++++++++----------
 fs/btrfs/root-tree.c   |    5 +-
 fs/btrfs/transaction.c |   15 +-
 fs/btrfs/volumes.c     |    9 +-
 7 files changed, 1848 insertions(+), 445 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index f9cd40967d04..50e81f43e6d4 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -179,7 +179,6 @@ int noinline __btrfs_cow_block(struct btrfs_trans_handle *trans,
 	struct extent_buffer *cow;
 	u32 nritems;
 	int ret = 0;
-	int different_trans = 0;
 	int level;
 	int unlock_orig = 0;
 
@@ -233,13 +232,33 @@ int noinline __btrfs_cow_block(struct btrfs_trans_handle *trans,
 	WARN_ON(btrfs_header_generation(buf) > trans->transid);
 	if (btrfs_header_generation(buf) != trans->transid) {
 		u32 nr_extents;
-		different_trans = 1;
 		ret = btrfs_inc_ref(trans, root, buf, cow, &nr_extents);
 		if (ret)
 			return ret;
 
 		ret = btrfs_cache_ref(trans, root, buf, nr_extents);
 		WARN_ON(ret);
+	} else if (btrfs_header_owner(buf) == BTRFS_TREE_RELOC_OBJECTID) {
+		/*
+		 * There are only two places that can drop reference to
+		 * tree blocks owned by living reloc trees, one is here,
+		 * the other place is btrfs_merge_path. In both places,
+		 * we check reference count while tree block is locked.
+		 * Furthermore, if reference count is one, it won't get
+		 * increased by someone else.
+		 */
+		u32 refs;
+		ret = btrfs_lookup_extent_ref(trans, root, buf->start,
+					      buf->len, &refs);
+		BUG_ON(ret);
+		if (refs == 1) {
+			ret = btrfs_update_ref(trans, root, buf, cow,
+					       0, nritems);
+			clean_tree_block(trans, root, buf);
+		} else {
+			ret = btrfs_inc_ref(trans, root, buf, cow, NULL);
+		}
+		BUG_ON(ret);
 	} else {
 		ret = btrfs_update_ref(trans, root, buf, cow, 0, nritems);
 		if (ret)
@@ -247,6 +266,14 @@ int noinline __btrfs_cow_block(struct btrfs_trans_handle *trans,
 		clean_tree_block(trans, root, buf);
 	}
 
+	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
+		ret = btrfs_add_reloc_mapping(root, buf->start,
+					      buf->len, cow->start);
+		BUG_ON(ret);
+		ret = btrfs_reloc_tree_cache_ref(trans, root, cow, buf->start);
+		WARN_ON(ret);
+	}
+
 	if (buf == root->node) {
 		WARN_ON(parent && parent != buf);
 
@@ -1466,6 +1493,130 @@ done:
 	return ret;
 }
 
+int btrfs_merge_path(struct btrfs_trans_handle *trans,
+		     struct btrfs_root *root,
+		     struct btrfs_key *node_keys,
+		     u64 *nodes, int lowest_level)
+{
+	struct extent_buffer *eb;
+	struct extent_buffer *parent;
+	struct btrfs_key key;
+	u64 bytenr;
+	u64 generation;
+	u32 blocksize;
+	int level;
+	int slot;
+	int key_match;
+	int ret;
+
+	eb = btrfs_lock_root_node(root);
+	ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb, 0);
+	BUG_ON(ret);
+
+	parent = eb;
+	while (1) {
+		level = btrfs_header_level(parent);
+		if (level == 0 || level <= lowest_level)
+			break;
+
+		ret = bin_search(parent, &node_keys[lowest_level], level,
+				 &slot);
+		if (ret && slot > 0)
+			slot--;
+
+		bytenr = btrfs_node_blockptr(parent, slot);
+		if (nodes[level - 1] == bytenr)
+			break;
+
+		blocksize = btrfs_level_size(root, level - 1);
+		generation = btrfs_node_ptr_generation(parent, slot);
+		btrfs_node_key_to_cpu(eb, &key, slot);
+		key_match = !memcmp(&key, &node_keys[level - 1], sizeof(key));
+
+		/*
+		 * if node keys match and node pointer hasn't been modified
+		 * in the running transaction, we can merge the path. for
+		 * reloc trees, the node pointer check is skipped, this is
+		 * because the reloc trees are fully controlled by the space
+		 * balance code, no one else can modify them.
+		 */
+		if (!nodes[level - 1] || !key_match ||
+		    (generation == trans->transid &&
+		     root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID)) {
+next_level:
+			if (level == 1 || level == lowest_level + 1)
+				break;
+
+			eb = read_tree_block(root, bytenr, blocksize,
+					     generation);
+			btrfs_tree_lock(eb);
+
+			ret = btrfs_cow_block(trans, root, eb, parent, slot,
+					      &eb, 0);
+			BUG_ON(ret);
+
+			btrfs_tree_unlock(parent);
+			free_extent_buffer(parent);
+			parent = eb;
+			continue;
+		}
+
+		if (generation == trans->transid) {
+			u32 refs;
+			BUG_ON(btrfs_header_owner(eb) !=
+			       BTRFS_TREE_RELOC_OBJECTID);
+			/*
+			 * lock the block to keep __btrfs_cow_block from
+			 * changing the reference count.
+			 */
+			eb = read_tree_block(root, bytenr, blocksize,
+					     generation);
+			btrfs_tree_lock(eb);
+
+			ret = btrfs_lookup_extent_ref(trans, root, bytenr,
+						      blocksize, &refs);
+			BUG_ON(ret);
+			/*
+			 * if we replace a block whose reference count is one,
+			 * we have to "drop the subtree", so skip it for
+			 * simplicity
+			 */
+			if (refs == 1) {
+				btrfs_tree_unlock(eb);
+				free_extent_buffer(eb);
+				goto next_level;
+			}
+		}
+
+		btrfs_set_node_blockptr(parent, slot, nodes[level - 1]);
+		btrfs_set_node_ptr_generation(parent, slot, trans->transid);
+		btrfs_mark_buffer_dirty(parent);
+
+		ret = btrfs_inc_extent_ref(trans, root,
+					nodes[level - 1],
+					blocksize, parent->start,
+					btrfs_header_owner(parent),
+					btrfs_header_generation(parent),
+					level - 1, 0);
+		BUG_ON(ret);
+		ret = btrfs_free_extent(trans, root, bytenr,
+					blocksize, parent->start,
+					btrfs_header_owner(parent),
+					btrfs_header_generation(parent),
+					level - 1, 0, 1);
+		BUG_ON(ret);
+
+		if (generation == trans->transid) {
+			btrfs_tree_unlock(eb);
+			free_extent_buffer(eb);
+		}
+		break;
+	}
+	btrfs_tree_unlock(parent);
+	free_extent_buffer(parent);
+	return 0;
+}
+
 /*
  * adjust the pointers going up the tree, starting at level
  * making sure the right key of each node is points to 'key'.
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 3e62a1b0a1f7..2775e270881e 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -604,6 +604,7 @@ struct btrfs_fs_info {
 	struct mutex chunk_mutex;
 	struct mutex drop_mutex;
 	struct mutex volume_mutex;
+	struct mutex tree_reloc_mutex;
 	struct list_head trans_list;
 	struct list_head hashers;
 	struct list_head dead_roots;
@@ -647,6 +648,10 @@ struct btrfs_fs_info {
 	struct task_struct *cleaner_kthread;
 	int thread_pool_size;
 
+	/* tree relocation related fields */
+	struct extent_io_tree reloc_mapping_tree;
+	struct list_head dead_reloc_roots;
+	struct btrfs_leaf_ref_tree reloc_ref_tree;
 	struct btrfs_leaf_ref_tree shared_ref_tree;
 
 	struct kobject super_kobj;
@@ -698,6 +703,7 @@ struct btrfs_root {
 	struct btrfs_leaf_ref_tree ref_tree_struct;
 	struct btrfs_dirty_root *dirty_root;
 	struct btrfs_root *log_root;
+	struct btrfs_root *reloc_root;
 
 	struct btrfs_root_item root_item;
 	struct btrfs_key root_key;
@@ -1517,7 +1523,6 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
 					    struct btrfs_root *root,
 					    u64 bytenr, u32 blocksize);
-int btrfs_shrink_extent_tree(struct btrfs_root *root, u64 new_size);
 int btrfs_insert_extent_backref(struct btrfs_trans_handle *trans,
 				 struct btrfs_root *root,
 				 struct btrfs_path *path,
@@ -1582,10 +1587,29 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root, u64 bytes_used,
 			   u64 type, u64 chunk_objectid, u64 chunk_offset,
 			   u64 size);
+int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
+			     struct btrfs_root *root, u64 group_start);
+int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start);
+int btrfs_free_reloc_root(struct btrfs_root *root);
+int btrfs_drop_dead_reloc_roots(struct btrfs_root *root);
+int btrfs_add_reloc_mapping(struct btrfs_root *root, u64 orig_bytenr,
+			    u64 num_bytes, u64 new_bytenr);
+int btrfs_get_reloc_mapping(struct btrfs_root *root, u64 orig_bytenr,
+			    u64 num_bytes, u64 *new_bytenr);
+void btrfs_free_reloc_mappings(struct btrfs_root *root);
+int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *root,
+			       struct extent_buffer *buf, u64 orig_start);
+int btrfs_add_dead_reloc_root(struct btrfs_root *root);
+int btrfs_cleanup_reloc_trees(struct btrfs_root *root);
 /* ctree.c */
 int btrfs_previous_item(struct btrfs_root *root,
 			struct btrfs_path *path, u64 min_objectid,
 			int type);
+int btrfs_merge_path(struct btrfs_trans_handle *trans,
+		     struct btrfs_root *root,
+		     struct btrfs_key *node_keys,
+		     u64 *nodes, int lowest_level);
 int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
 			    struct btrfs_root *root, struct btrfs_path *path,
 			    struct btrfs_key *new_key);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 8969fee23318..45bc3132b054 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1406,6 +1406,10 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 			     fs_info->btree_inode->i_mapping, GFP_NOFS);
 	fs_info->do_barriers = 1;
 
+	extent_io_tree_init(&fs_info->reloc_mapping_tree,
+			    fs_info->btree_inode->i_mapping, GFP_NOFS);
+	INIT_LIST_HEAD(&fs_info->dead_reloc_roots);
+	btrfs_leaf_ref_tree_init(&fs_info->reloc_ref_tree);
 	btrfs_leaf_ref_tree_init(&fs_info->shared_ref_tree);
 
 	BTRFS_I(fs_info->btree_inode)->root = tree_root;
@@ -1421,6 +1425,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	mutex_init(&fs_info->transaction_kthread_mutex);
 	mutex_init(&fs_info->cleaner_mutex);
 	mutex_init(&fs_info->volume_mutex);
+	mutex_init(&fs_info->tree_reloc_mutex);
 	init_waitqueue_head(&fs_info->transaction_throttle);
 	init_waitqueue_head(&fs_info->transaction_wait);
 	init_waitqueue_head(&fs_info->async_submit_wait);
@@ -1627,6 +1632,10 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 		ret = btrfs_recover_log_trees(log_tree_root);
 		BUG_ON(ret);
 	}
+
+	ret = btrfs_cleanup_reloc_trees(tree_root);
+	BUG_ON(ret);
+
 	fs_info->last_trans_committed = btrfs_super_generation(disk_super);
 	return tree_root;
 
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 9ab099bc01a4..8043b9d584a9 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1834,6 +1834,7 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans,
 		u64 header_owner = btrfs_header_owner(buf);
 		u64 header_transid = btrfs_header_generation(buf);
 		if (header_owner != BTRFS_TREE_LOG_OBJECTID &&
+		    header_owner != BTRFS_TREE_RELOC_OBJECTID &&
 		    header_transid == trans->transid &&
 		    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
 			clean_tree_block(NULL, root, buf);
@@ -2487,6 +2488,7 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
 		return -ENOSPC;
 	}
 	btrfs_add_free_space(cache, start, len);
+	update_reserved_extents(root, start, len, 0);
 	maybe_unlock_mutex(root);
 	return 0;
 }
@@ -2947,6 +2949,10 @@ static int noinline walk_down_tree(struct btrfs_trans_handle *trans,
 		 */
 		if (*level == 1) {
 			ref = btrfs_lookup_leaf_ref(root, bytenr);
+			if (ref && ref->generation != ptr_gen) {
+				btrfs_free_leaf_ref(root, ref);
+				ref = NULL;
+			}
 			if (ref) {
 				ret = cache_drop_leaf_ref(trans, root, ref);
 				BUG_ON(ret);
@@ -3153,34 +3159,6 @@ out:
 	return ret;
 }
 
-int btrfs_free_block_groups(struct btrfs_fs_info *info)
-{
-	struct btrfs_block_group_cache *block_group;
-	struct rb_node *n;
-
-	mutex_lock(&info->alloc_mutex);
-	spin_lock(&info->block_group_cache_lock);
-	while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
-		block_group = rb_entry(n, struct btrfs_block_group_cache,
-				       cache_node);
-
-		spin_unlock(&info->block_group_cache_lock);
-		btrfs_remove_free_space_cache(block_group);
-		spin_lock(&info->block_group_cache_lock);
-
-		rb_erase(&block_group->cache_node,
-			 &info->block_group_cache_tree);
-
-		spin_lock(&block_group->space_info->lock);
-		list_del(&block_group->list);
-		spin_unlock(&block_group->space_info->lock);
-		kfree(block_group);
-	}
-	spin_unlock(&info->block_group_cache_lock);
-	mutex_unlock(&info->alloc_mutex);
-	return 0;
-}
-
 static unsigned long calc_ra(unsigned long start, unsigned long last,
 			     unsigned long nr)
 {
@@ -3192,37 +3170,43 @@ static int noinline relocate_inode_pages(struct inode *inode, u64 start,
 {
 	u64 page_start;
 	u64 page_end;
+	unsigned long first_index;
 	unsigned long last_index;
 	unsigned long i;
 	struct page *page;
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 	struct file_ra_state *ra;
-	unsigned long total_read = 0;
-	unsigned long ra_pages;
 	struct btrfs_ordered_extent *ordered;
-	struct btrfs_trans_handle *trans;
+	unsigned int total_read = 0;
+	unsigned int total_dirty = 0;
+	int ret = 0;
 
 	ra = kzalloc(sizeof(*ra), GFP_NOFS);
 
 	mutex_lock(&inode->i_mutex);
-	i = start >> PAGE_CACHE_SHIFT;
+	first_index = start >> PAGE_CACHE_SHIFT;
 	last_index = (start + len - 1) >> PAGE_CACHE_SHIFT;
 
-	ra_pages = BTRFS_I(inode)->root->fs_info->bdi.ra_pages;
+	/* make sure the dirty trick played by the caller works */
+	ret = invalidate_inode_pages2_range(inode->i_mapping,
+					    first_index, last_index);
+	if (ret)
+		goto out_unlock;
 
 	file_ra_state_init(ra, inode->i_mapping);
 
-	for (; i <= last_index; i++) {
-		if (total_read % ra_pages == 0) {
+	for (i = first_index ; i <= last_index; i++) {
+		if (total_read % ra->ra_pages == 0) {
 			btrfs_force_ra(inode->i_mapping, ra, NULL, i,
-				       calc_ra(i, last_index, ra_pages));
+				       calc_ra(i, last_index, ra->ra_pages));
 		}
 		total_read++;
 again:
 		if (((u64)i << PAGE_CACHE_SHIFT) > i_size_read(inode))
-			goto truncate_racing;
+			BUG_ON(1);
 		page = grab_cache_page(inode->i_mapping, i);
 		if (!page) {
+			ret = -ENOMEM;
 			goto out_unlock;
 		}
 		if (!PageUptodate(page)) {
@@ -3231,6 +3215,7 @@ again:
 			if (!PageUptodate(page)) {
 				unlock_page(page);
 				page_cache_release(page);
+				ret = -EIO;
 				goto out_unlock;
 			}
 		}
@@ -3251,14 +3236,13 @@ again:
 		}
 		set_page_extent_mapped(page);
 
-		/*
-		 * make sure page_mkwrite is called for this page if userland
-		 * wants to change it from mmap
-		 */
-		clear_page_dirty_for_io(page);
-
 		btrfs_set_extent_delalloc(inode, page_start, page_end);
+		if (i == first_index)
+			set_extent_bits(io_tree, page_start, page_end,
+					EXTENT_BOUNDARY, GFP_NOFS);
+
 		set_page_dirty(page);
+		total_dirty++;
 
 		unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
 		unlock_page(page);
@@ -3266,350 +3250,1460 @@ again:
 	}
 
 out_unlock:
-	/* we have to start the IO in order to get the ordered extents
-	 * instantiated.  This allows the relocation to code to wait
-	 * for all the ordered extents to hit the disk.
-	 *
-	 * Otherwise, it would constantly loop over the same extents
-	 * because the old ones don't get deleted  until the IO is
-	 * started
-	 */
-	btrfs_fdatawrite_range(inode->i_mapping, start, start + len - 1,
-			       WB_SYNC_NONE);
 	kfree(ra);
-	trans = btrfs_start_transaction(BTRFS_I(inode)->root, 1);
-	if (trans) {
-		btrfs_end_transaction(trans, BTRFS_I(inode)->root);
-		mark_inode_dirty(inode);
-	}
 	mutex_unlock(&inode->i_mutex);
-	return 0;
-
-truncate_racing:
-	vmtruncate(inode, inode->i_size);
-	balance_dirty_pages_ratelimited_nr(inode->i_mapping,
-					   total_read);
-	goto out_unlock;
+	balance_dirty_pages_ratelimited_nr(inode->i_mapping, total_dirty);
+	return ret;
 }
 
-/*
- * The back references tell us which tree holds a ref on a block,
- * but it is possible for the tree root field in the reference to
- * reflect the original root before a snapshot was made.  In this
- * case we should search through all the children of a given root
- * to find potential holders of references on a block.
- *
- * Instead, we do something a little less fancy and just search
- * all the roots for a given key/block combination.
- */
-static int find_root_for_ref(struct btrfs_root *root,
-			     struct btrfs_path *path,
-			     struct btrfs_key *key0,
-			     int level,
-			     int file_key,
-			     struct btrfs_root **found_root,
-			     u64 bytenr)
-{
-	struct btrfs_key root_location;
-	struct btrfs_root *cur_root = *found_root;
-	struct btrfs_file_extent_item *file_extent;
-	u64 root_search_start = BTRFS_FS_TREE_OBJECTID;
-	u64 found_bytenr;
-	int ret;
+static int noinline relocate_data_extent(struct inode *reloc_inode,
+					 struct btrfs_key *extent_key,
+					 u64 offset)
+{
+	struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
+	struct extent_map_tree *em_tree = &BTRFS_I(reloc_inode)->extent_tree;
+	struct extent_map *em;
 
-	root_location.offset = (u64)-1;
-	root_location.type = BTRFS_ROOT_ITEM_KEY;
-	path->lowest_level = level;
-	path->reada = 0;
-	while(1) {
-		ret = btrfs_search_slot(NULL, cur_root, key0, path, 0, 0);
-		found_bytenr = 0;
-		if (ret == 0 && file_key) {
-			struct extent_buffer *leaf = path->nodes[0];
-			file_extent = btrfs_item_ptr(leaf, path->slots[0],
-					     struct btrfs_file_extent_item);
-			if (btrfs_file_extent_type(leaf, file_extent) ==
-			    BTRFS_FILE_EXTENT_REG) {
-				found_bytenr =
-					btrfs_file_extent_disk_bytenr(leaf,
-							       file_extent);
-		       }
-		} else if (!file_key) {
-			if (path->nodes[level])
-				found_bytenr = path->nodes[level]->start;
-		}
-
-		btrfs_release_path(cur_root, path);
-
-		if (found_bytenr == bytenr) {
-			*found_root = cur_root;
-			ret = 0;
-			goto out;
-		}
-		ret = btrfs_search_root(root->fs_info->tree_root,
-					root_search_start, &root_search_start);
-		if (ret)
-			break;
+	em = alloc_extent_map(GFP_NOFS);
+	BUG_ON(!em || IS_ERR(em));
 
-		root_location.objectid = root_search_start;
-		cur_root = btrfs_read_fs_root_no_name(root->fs_info,
-						      &root_location);
-		if (!cur_root) {
-			ret = 1;
+	em->start = extent_key->objectid - offset;
+	em->len = extent_key->offset;
+	em->block_start = extent_key->objectid;
+	em->bdev = root->fs_info->fs_devices->latest_bdev;
+	set_bit(EXTENT_FLAG_PINNED, &em->flags);
+
+	/* set up the extent map to cheat btrfs_readpage */
+	mutex_lock(&BTRFS_I(reloc_inode)->extent_mutex);
+	while (1) {
+		int ret;
+		spin_lock(&em_tree->lock);
+		ret = add_extent_mapping(em_tree, em);
+		spin_unlock(&em_tree->lock);
+		if (ret != -EEXIST) {
+			free_extent_map(em);
 			break;
 		}
+		btrfs_drop_extent_cache(reloc_inode, em->start,
+					em->start + em->len - 1, 0);
 	}
-out:
-	path->lowest_level = 0;
-	return ret;
-}
+	mutex_unlock(&BTRFS_I(reloc_inode)->extent_mutex);
 
-/*
- * note, this releases the path
- */
-static int noinline relocate_one_reference(struct btrfs_root *extent_root,
-				  struct btrfs_path *path,
-				  struct btrfs_key *extent_key,
-				  u64 *last_file_objectid,
-				  u64 *last_file_offset,
-				  u64 *last_file_root,
-				  u64 last_extent)
-{
-	struct inode *inode;
-	struct btrfs_root *found_root;
-	struct btrfs_key root_location;
-	struct btrfs_key found_key;
-	struct btrfs_extent_ref *ref;
-	u64 ref_root;
-	u64 ref_gen;
-	u64 ref_objectid;
-	u64 ref_offset;
-	int ret;
-	int level;
+	return relocate_inode_pages(reloc_inode, extent_key->objectid - offset,
+				    extent_key->offset);
+}
 
-	WARN_ON(!mutex_is_locked(&extent_root->fs_info->alloc_mutex));
+struct btrfs_ref_path {
+	u64 extent_start;
+	u64 nodes[BTRFS_MAX_LEVEL];
+	u64 root_objectid;
+	u64 root_generation;
+	u64 owner_objectid;
+	u64 owner_offset;
+	u32 num_refs;
+	int lowest_level;
+	int current_level;
+};
 
-	ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
-			     struct btrfs_extent_ref);
-	ref_root = btrfs_ref_root(path->nodes[0], ref);
-	ref_gen = btrfs_ref_generation(path->nodes[0], ref);
-	ref_objectid = btrfs_ref_objectid(path->nodes[0], ref);
-	ref_offset = btrfs_ref_offset(path->nodes[0], ref);
-	btrfs_release_path(extent_root, path);
+struct disk_extent {
+	u64 disk_bytenr;
+	u64 disk_num_bytes;
+	u64 offset;
+	u64 num_bytes;
+};
 
-	root_location.objectid = ref_root;
-	if (ref_gen == 0)
-		root_location.offset = 0;
-	else
-		root_location.offset = (u64)-1;
-	root_location.type = BTRFS_ROOT_ITEM_KEY;
+static int is_cowonly_root(u64 root_objectid)
+{
+	if (root_objectid == BTRFS_ROOT_TREE_OBJECTID ||
+	    root_objectid == BTRFS_EXTENT_TREE_OBJECTID ||
+	    root_objectid == BTRFS_CHUNK_TREE_OBJECTID ||
+	    root_objectid == BTRFS_DEV_TREE_OBJECTID ||
+	    root_objectid == BTRFS_TREE_LOG_OBJECTID)
+		return 1;
+	return 0;
+}
 
-	found_root = btrfs_read_fs_root_no_name(extent_root->fs_info,
-						&root_location);
-	BUG_ON(!found_root);
-	mutex_unlock(&extent_root->fs_info->alloc_mutex);
+static int noinline __next_ref_path(struct btrfs_trans_handle *trans,
+				    struct btrfs_root *extent_root,
+				    struct btrfs_ref_path *ref_path,
+				    int first_time)
+{
+	struct extent_buffer *leaf;
+	struct btrfs_path *path;
+	struct btrfs_extent_ref *ref;
+	struct btrfs_key key;
+	struct btrfs_key found_key;
+	u64 bytenr;
+	u32 nritems;
+	int level;
+	int ret = 1;
 
-	if (ref_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
-		found_key.objectid = ref_objectid;
-		found_key.type = BTRFS_EXTENT_DATA_KEY;
-		found_key.offset = ref_offset;
-		level = 0;
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
 
-		if (last_extent == extent_key->objectid &&
-		    *last_file_objectid == ref_objectid &&
-		    *last_file_offset == ref_offset &&
-		    *last_file_root == ref_root)
-			goto out;
+	mutex_lock(&extent_root->fs_info->alloc_mutex);
 
-		ret = find_root_for_ref(extent_root, path, &found_key,
-					level, 1, &found_root,
-					extent_key->objectid);
+	if (first_time) {
+		ref_path->lowest_level = -1;
+		ref_path->current_level = -1;
+		goto walk_up;
+	}
+walk_down:
+	level = ref_path->current_level - 1;
+	while (level >= -1) {
+		u64 parent;
+		if (level < ref_path->lowest_level)
+			break;
 
-		if (ret)
-			goto out;
+		if (level >= 0) {
+			bytenr = ref_path->nodes[level];
+		} else {
+			bytenr = ref_path->extent_start;
+		}
+		BUG_ON(bytenr == 0);
 
-		if (last_extent == extent_key->objectid &&
-		    *last_file_objectid == ref_objectid &&
-		    *last_file_offset == ref_offset &&
-		    *last_file_root == ref_root)
-			goto out;
+		parent = ref_path->nodes[level + 1];
+		ref_path->nodes[level + 1] = 0;
+		ref_path->current_level = level;
+		BUG_ON(parent == 0);
 
-		inode = btrfs_iget_locked(extent_root->fs_info->sb,
-					  ref_objectid, found_root);
-		if (inode->i_state & I_NEW) {
-			/* the inode and parent dir are two different roots */
-			BTRFS_I(inode)->root = found_root;
-			BTRFS_I(inode)->location.objectid = ref_objectid;
-			BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
-			BTRFS_I(inode)->location.offset = 0;
-			btrfs_read_locked_inode(inode);
-			unlock_new_inode(inode);
+		key.objectid = bytenr;
+		key.offset = parent + 1;
+		key.type = BTRFS_EXTENT_REF_KEY;
 
-		}
-		/* this can happen if the reference is not against
-		 * the latest version of the tree root
-		 */
-		if (is_bad_inode(inode))
+		ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0);
+		if (ret < 0)
 			goto out;
+		BUG_ON(ret == 0);
 
-		*last_file_objectid = inode->i_ino;
-		*last_file_root = found_root->root_key.objectid;
-		*last_file_offset = ref_offset;
+		leaf = path->nodes[0];
+		nritems = btrfs_header_nritems(leaf);
+		if (path->slots[0] >= nritems) {
+			ret = btrfs_next_leaf(extent_root, path);
+			if (ret < 0)
+				goto out;
+			if (ret > 0)
+				goto next;
+			leaf = path->nodes[0];
+		}
 
-		relocate_inode_pages(inode, ref_offset, extent_key->offset);
-		iput(inode);
-	} else {
-		struct btrfs_trans_handle *trans;
-		struct extent_buffer *eb;
-		int needs_lock = 0;
+		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+		if (found_key.objectid == bytenr &&
+				found_key.type == BTRFS_EXTENT_REF_KEY)
+			goto found;
+next:
+		level--;
+		btrfs_release_path(extent_root, path);
+		if (need_resched()) {
+			mutex_unlock(&extent_root->fs_info->alloc_mutex);
+			cond_resched();
+			mutex_lock(&extent_root->fs_info->alloc_mutex);
+		}
+	}
+	/* reached lowest level */
+	ret = 1;
+	goto out;
+walk_up:
+	level = ref_path->current_level;
+	while (level < BTRFS_MAX_LEVEL - 1) {
+		u64 ref_objectid;
+		if (level >= 0) {
+			bytenr = ref_path->nodes[level];
+		} else {
+			bytenr = ref_path->extent_start;
+		}
+		BUG_ON(bytenr == 0);
 
-		eb = read_tree_block(found_root, extent_key->objectid,
-				     extent_key->offset, 0);
-		btrfs_tree_lock(eb);
-		level = btrfs_header_level(eb);
+		key.objectid = bytenr;
+		key.offset = 0;
+		key.type = BTRFS_EXTENT_REF_KEY;
 
-		if (level == 0)
-			btrfs_item_key_to_cpu(eb, &found_key, 0);
-		else
-			btrfs_node_key_to_cpu(eb, &found_key, 0);
+		ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0);
+		if (ret < 0)
+			goto out;
 
-		btrfs_tree_unlock(eb);
-		free_extent_buffer(eb);
+		leaf = path->nodes[0];
+		nritems = btrfs_header_nritems(leaf);
+		if (path->slots[0] >= nritems) {
+			ret = btrfs_next_leaf(extent_root, path);
+			if (ret < 0)
+				goto out;
+			if (ret > 0) {
+				/* the extent was freed by someone */
+				if (ref_path->lowest_level == level)
+					goto out;
+				btrfs_release_path(extent_root, path);
+				goto walk_down;
+			}
+			leaf = path->nodes[0];
+		}
 
-		ret = find_root_for_ref(extent_root, path, &found_key,
-					level, 0, &found_root,
-					extent_key->objectid);
+		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+		if (found_key.objectid != bytenr ||
+				found_key.type != BTRFS_EXTENT_REF_KEY) {
+			/* the extent was freed by someone */
+			if (ref_path->lowest_level == level) {
+				ret = 1;
+				goto out;
+			}
+			btrfs_release_path(extent_root, path);
+			goto walk_down;
+		}
+found:
+		ref = btrfs_item_ptr(leaf, path->slots[0],
+				struct btrfs_extent_ref);
+		ref_objectid = btrfs_ref_objectid(leaf, ref);
+		if (ref_objectid < BTRFS_FIRST_FREE_OBJECTID) {
+			if (first_time) {
+				level = (int)ref_objectid;
+				BUG_ON(level >= BTRFS_MAX_LEVEL);
+				ref_path->lowest_level = level;
+				ref_path->current_level = level;
+				ref_path->nodes[level] = bytenr;
+			} else {
+				WARN_ON(ref_objectid != level);
+			}
+		} else {
+			WARN_ON(level != -1);
+		}
+		first_time = 0;
 
-		if (ret)
-			goto out;
+		if (ref_path->lowest_level == level) {
+			ref_path->owner_objectid = ref_objectid;
+			ref_path->owner_offset = btrfs_ref_offset(leaf, ref);
+			ref_path->num_refs = btrfs_ref_num_refs(leaf, ref);
+		}
 
 		/*
-		 * right here almost anything could happen to our key,
-		 * but that's ok.  The cow below will either relocate it
-		 * or someone else will have relocated it.  Either way,
-		 * it is in a different spot than it was before and
-		 * we're happy.
+		 * the block is a tree root or the block isn't in a
+		 * reference counted tree.
 		 */
+		if (found_key.objectid == found_key.offset ||
+		    is_cowonly_root(btrfs_ref_root(leaf, ref))) {
+			ref_path->root_objectid = btrfs_ref_root(leaf, ref);
+			ref_path->root_generation =
+				btrfs_ref_generation(leaf, ref);
+			if (level < 0) {
+				/* special reference from the tree log */
+				ref_path->nodes[0] = found_key.offset;
+				ref_path->current_level = 0;
+			}
+			ret = 0;
+			goto out;
+		}
 
-		trans = btrfs_start_transaction(found_root, 1);
+		level++;
+		BUG_ON(ref_path->nodes[level] != 0);
+		ref_path->nodes[level] = found_key.offset;
+		ref_path->current_level = level;
 
-		if (found_root == extent_root->fs_info->extent_root ||
-		    found_root == extent_root->fs_info->chunk_root ||
-		    found_root == extent_root->fs_info->dev_root) {
-			needs_lock = 1;
-			mutex_lock(&extent_root->fs_info->alloc_mutex);
+		/*
+		 * the reference was created in the running transaction,
+		 * no need to continue walking up.
+		 */
+		if (btrfs_ref_generation(leaf, ref) == trans->transid) {
+			ref_path->root_objectid = btrfs_ref_root(leaf, ref);
+			ref_path->root_generation =
+				btrfs_ref_generation(leaf, ref);
+			ret = 0;
+			goto out;
 		}
 
-		path->lowest_level = level;
-		path->reada = 2;
-		ret = btrfs_search_slot(trans, found_root, &found_key, path,
-					0, 1);
-		path->lowest_level = 0;
-		btrfs_release_path(found_root, path);
-
-		if (found_root == found_root->fs_info->extent_root)
-			btrfs_extent_post_op(trans, found_root);
-		if (needs_lock)
+		btrfs_release_path(extent_root, path);
+		if (need_resched()) {
 			mutex_unlock(&extent_root->fs_info->alloc_mutex);
-
-		btrfs_end_transaction(trans, found_root);
-
+			cond_resched();
+			mutex_lock(&extent_root->fs_info->alloc_mutex);
+		}
 	}
+	/* reached max tree level, but no tree root found. */
+	BUG();
 out:
-	mutex_lock(&extent_root->fs_info->alloc_mutex);
-	return 0;
+	mutex_unlock(&extent_root->fs_info->alloc_mutex);
+	btrfs_free_path(path);
+	return ret;
 }
 
-static int noinline del_extent_zero(struct btrfs_root *extent_root,
-				    struct btrfs_path *path,
-				    struct btrfs_key *extent_key)
+static int btrfs_first_ref_path(struct btrfs_trans_handle *trans,
+				struct btrfs_root *extent_root,
+				struct btrfs_ref_path *ref_path,
+				u64 extent_start)
 {
-	int ret;
-	struct btrfs_trans_handle *trans;
+	memset(ref_path, 0, sizeof(*ref_path));
+	ref_path->extent_start = extent_start;
 
-	trans = btrfs_start_transaction(extent_root, 1);
-	ret = btrfs_search_slot(trans, extent_root, extent_key, path, -1, 1);
-	if (ret > 0) {
-		ret = -EIO;
-		goto out;
-	}
-	if (ret < 0)
-		goto out;
-	ret = btrfs_del_item(trans, extent_root, path);
-out:
-	btrfs_end_transaction(trans, extent_root);
-	return ret;
+	return __next_ref_path(trans, extent_root, ref_path, 1);
 }
 
-static int noinline relocate_one_extent(struct btrfs_root *extent_root,
-					struct btrfs_path *path,
-					struct btrfs_key *extent_key)
+static int btrfs_next_ref_path(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *extent_root,
+			       struct btrfs_ref_path *ref_path)
 {
-	struct btrfs_key key;
-	struct btrfs_key found_key;
+	return __next_ref_path(trans, extent_root, ref_path, 0);
+}
+
+static int noinline get_new_locations(struct inode *reloc_inode,
+				      struct btrfs_key *extent_key,
+				      u64 offset, int no_fragment,
+				      struct disk_extent **extents,
+				      int *nr_extents)
+{
+	struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
+	struct btrfs_path *path;
+	struct btrfs_file_extent_item *fi;
 	struct extent_buffer *leaf;
-	u64 last_file_objectid = 0;
-	u64 last_file_root = 0;
-	u64 last_file_offset = (u64)-1;
-	u64 last_extent = 0;
+	struct disk_extent *exts = *extents;
+	struct btrfs_key found_key;
+	u64 cur_pos;
+	u64 last_byte;
 	u32 nritems;
-	u32 item_size;
-	int ret = 0;
+	int nr = 0;
+	int max = *nr_extents;
+	int ret;
 
-	if (extent_key->objectid == 0) {
-		ret = del_extent_zero(extent_root, path, extent_key);
-		goto out;
+	WARN_ON(!no_fragment && *extents);
+	if (!exts) {
+		max = 1;
+		exts = kmalloc(sizeof(*exts) * max, GFP_NOFS);
+		if (!exts)
+			return -ENOMEM;
 	}
-	key.objectid = extent_key->objectid;
-	key.type = BTRFS_EXTENT_REF_KEY;
-	key.offset = 0;
 
-	while(1) {
-		ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
+	path = btrfs_alloc_path();
+	BUG_ON(!path);
 
-		if (ret < 0)
-			goto out;
+	cur_pos = extent_key->objectid - offset;
+	last_byte = extent_key->objectid + extent_key->offset;
+	ret = btrfs_lookup_file_extent(NULL, root, path, reloc_inode->i_ino,
+				       cur_pos, 0);
+	if (ret < 0)
+		goto out;
+	if (ret > 0) {
+		ret = -ENOENT;
+		goto out;
+	}
 
-		ret = 0;
+	while (1) {
 		leaf = path->nodes[0];
 		nritems = btrfs_header_nritems(leaf);
-		if (path->slots[0] == nritems) {
-			ret = btrfs_next_leaf(extent_root, path);
-			if (ret > 0) {
-				ret = 0;
-				goto out;
-			}
+		if (path->slots[0] >= nritems) {
+			ret = btrfs_next_leaf(root, path);
 			if (ret < 0)
 				goto out;
+			if (ret > 0)
+				break;
 			leaf = path->nodes[0];
 		}
 
 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
-		if (found_key.objectid != extent_key->objectid) {
+		if (found_key.offset != cur_pos ||
+		    found_key.type != BTRFS_EXTENT_DATA_KEY ||
+		    found_key.objectid != reloc_inode->i_ino)
 			break;
-		}
 
-		if (found_key.type != BTRFS_EXTENT_REF_KEY) {
+		fi = btrfs_item_ptr(leaf, path->slots[0],
+				    struct btrfs_file_extent_item);
+		if (btrfs_file_extent_type(leaf, fi) !=
+		    BTRFS_FILE_EXTENT_REG ||
+		    btrfs_file_extent_disk_bytenr(leaf, fi) == 0)
 			break;
+
+		if (nr == max) {
+			struct disk_extent *old = exts;
+			max *= 2;
+			exts = kzalloc(sizeof(*exts) * max, GFP_NOFS);
+			memcpy(exts, old, sizeof(*exts) * nr);
+			if (old != *extents)
+				kfree(old);
 		}
 
-		key.offset = found_key.offset + 1;
-		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+		exts[nr].disk_bytenr =
+			btrfs_file_extent_disk_bytenr(leaf, fi);
+		exts[nr].disk_num_bytes =
+			btrfs_file_extent_disk_num_bytes(leaf, fi);
+		exts[nr].offset = btrfs_file_extent_offset(leaf, fi);
+		exts[nr].num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
+		WARN_ON(exts[nr].offset > 0);
+		WARN_ON(exts[nr].num_bytes != exts[nr].disk_num_bytes);
 
-		ret = relocate_one_reference(extent_root, path, extent_key,
-					     &last_file_objectid,
-					     &last_file_offset,
-					     &last_file_root, last_extent);
-		if (ret)
+		cur_pos += exts[nr].num_bytes;
+		nr++;
+
+		if (cur_pos + offset >= last_byte)
+			break;
+
+		if (no_fragment) {
+			ret = 1;
 			goto out;
-		last_extent = extent_key->objectid;
+		}
+		path->slots[0]++;
+	}
+
+	WARN_ON(cur_pos + offset > last_byte);
+	if (cur_pos + offset < last_byte) {
+		ret = -ENOENT;
+		goto out;
 	}
 	ret = 0;
 out:
-	btrfs_release_path(extent_root, path);
-	return ret;
-}
-
+	btrfs_free_path(path);
+	if (ret) {
+		if (exts != *extents)
+			kfree(exts);
+	} else {
+		*extents = exts;
+		*nr_extents = nr;
+	}
+	return ret;
+}
+
+static int noinline replace_one_extent(struct btrfs_trans_handle *trans,
+					struct btrfs_root *root,
+					struct btrfs_path *path,
+					struct btrfs_key *extent_key,
+					struct btrfs_key *leaf_key,
+					struct btrfs_ref_path *ref_path,
+					struct disk_extent *new_extents,
+					int nr_extents)
+{
+	struct extent_buffer *leaf;
+	struct btrfs_file_extent_item *fi;
+	struct inode *inode = NULL;
+	struct btrfs_key key;
+	u64 lock_start = 0;
+	u64 lock_end = 0;
+	u64 num_bytes;
+	u64 ext_offset;
+	u64 first_pos;
+	u32 nritems;
+	int extent_locked = 0;
+	int ret;
+
+	first_pos = ref_path->owner_offset;
+	if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) {
+		key.objectid = ref_path->owner_objectid;
+		key.offset = ref_path->owner_offset;
+		key.type = BTRFS_EXTENT_DATA_KEY;
+	} else {
+		memcpy(&key, leaf_key, sizeof(key));
+	}
+
+	while (1) {
+		ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+		if (ret < 0)
+			goto out;
+
+		leaf = path->nodes[0];
+		nritems = btrfs_header_nritems(leaf);
+next:
+		if (extent_locked && ret > 0) {
+			/*
+			 * the file extent item was modified by someone
+			 * before the extent got locked.
+			 */
+			mutex_unlock(&BTRFS_I(inode)->extent_mutex);
+			unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
+				      lock_end, GFP_NOFS);
+			extent_locked = 0;
+		}
+
+		if (path->slots[0] >= nritems) {
+			if (ref_path->owner_objectid ==
+			    BTRFS_MULTIPLE_OBJECTIDS)
+				break;
+
+			BUG_ON(extent_locked);
+			ret = btrfs_next_leaf(root, path);
+			if (ret < 0)
+				goto out;
+			if (ret > 0)
+				break;
+			leaf = path->nodes[0];
+			nritems = btrfs_header_nritems(leaf);
+		}
+
+		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+
+		if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) {
+			if ((key.objectid > ref_path->owner_objectid) ||
+			    (key.objectid == ref_path->owner_objectid &&
+			     key.type > BTRFS_EXTENT_DATA_KEY) ||
+			    (key.offset >= first_pos + extent_key->offset))
+				break;
+		}
+
+		if (inode && key.objectid != inode->i_ino) {
+			BUG_ON(extent_locked);
+			btrfs_release_path(root, path);
+			mutex_unlock(&inode->i_mutex);
+			iput(inode);
+			inode = NULL;
+			continue;
+		}
+
+		if (key.type != BTRFS_EXTENT_DATA_KEY) {
+			path->slots[0]++;
+			ret = 1;
+			goto next;
+		}
+		fi = btrfs_item_ptr(leaf, path->slots[0],
+				    struct btrfs_file_extent_item);
+		if ((btrfs_file_extent_type(leaf, fi) !=
+		     BTRFS_FILE_EXTENT_REG) ||
+		    (btrfs_file_extent_disk_bytenr(leaf, fi) !=
+		     extent_key->objectid)) {
+			path->slots[0]++;
+			ret = 1;
+			goto next;
+		}
+
+		num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
+		ext_offset = btrfs_file_extent_offset(leaf, fi);
+
+		if (first_pos > key.offset - ext_offset)
+			first_pos = key.offset - ext_offset;
+
+		if (!extent_locked) {
+			lock_start = key.offset;
+			lock_end = lock_start + num_bytes - 1;
+		} else {
+			BUG_ON(lock_start != key.offset);
+			BUG_ON(lock_end - lock_start + 1 < num_bytes);
+		}
+
+		if (!inode) {
+			btrfs_release_path(root, path);
+
+			inode = btrfs_iget_locked(root->fs_info->sb,
+						  key.objectid, root);
+			if (inode->i_state & I_NEW) {
+				BTRFS_I(inode)->root = root;
+				BTRFS_I(inode)->location.objectid =
+					key.objectid;
+				BTRFS_I(inode)->location.type =
+					BTRFS_INODE_ITEM_KEY;
+				BTRFS_I(inode)->location.offset = 0;
+				btrfs_read_locked_inode(inode);
+				unlock_new_inode(inode);
+			}
+			/*
+			 * some code calls btrfs_commit_transaction while
+			 * holding the i_mutex, so we can't use mutex_lock
+			 * here.
+			 */
+			if (is_bad_inode(inode) ||
+			    !mutex_trylock(&inode->i_mutex)) {
+				iput(inode);
+				inode = NULL;
+				key.offset = (u64)-1;
+				goto skip;
+			}
+		}
+
+		if (!extent_locked) {
+			struct btrfs_ordered_extent *ordered;
+
+			btrfs_release_path(root, path);
+
+			lock_extent(&BTRFS_I(inode)->io_tree, lock_start,
+				    lock_end, GFP_NOFS);
+			ordered = btrfs_lookup_first_ordered_extent(inode,
+								    lock_end);
+			if (ordered &&
+			    ordered->file_offset <= lock_end &&
+			    ordered->file_offset + ordered->len > lock_start) {
+				unlock_extent(&BTRFS_I(inode)->io_tree,
+					      lock_start, lock_end, GFP_NOFS);
+				btrfs_start_ordered_extent(inode, ordered, 1);
+				btrfs_put_ordered_extent(ordered);
+				key.offset += num_bytes;
+				goto skip;
+			}
+			if (ordered)
+				btrfs_put_ordered_extent(ordered);
+
+			mutex_lock(&BTRFS_I(inode)->extent_mutex);
+			extent_locked = 1;
+			continue;
+		}
+
+		if (nr_extents == 1) {
+			/* update extent pointer in place */
+			btrfs_set_file_extent_generation(leaf, fi,
+						trans->transid);
+			btrfs_set_file_extent_disk_bytenr(leaf, fi,
+						new_extents[0].disk_bytenr);
+			btrfs_set_file_extent_disk_num_bytes(leaf, fi,
+						new_extents[0].disk_num_bytes);
+			ext_offset += new_extents[0].offset;
+			btrfs_set_file_extent_offset(leaf, fi, ext_offset);
+			btrfs_mark_buffer_dirty(leaf);
+
+			btrfs_drop_extent_cache(inode, key.offset,
+						key.offset + num_bytes - 1, 0);
+
+			ret = btrfs_inc_extent_ref(trans, root,
+						new_extents[0].disk_bytenr,
+						new_extents[0].disk_num_bytes,
+						leaf->start,
+						root->root_key.objectid,
+						trans->transid,
+						key.objectid, key.offset);
+			BUG_ON(ret);
+
+			ret = btrfs_free_extent(trans, root,
+						extent_key->objectid,
+						extent_key->offset,
+						leaf->start,
+						btrfs_header_owner(leaf),
+						btrfs_header_generation(leaf),
+						key.objectid, key.offset, 0);
+			BUG_ON(ret);
+
+			btrfs_release_path(root, path);
+			key.offset += num_bytes;
+		} else {
+			u64 alloc_hint;
+			u64 extent_len;
+			int i;
+			/*
+			 * drop old extent pointer at first, then insert the
+			 * new pointers one by one
+			 */
+			btrfs_release_path(root, path);
+			ret = btrfs_drop_extents(trans, root, inode, key.offset,
+						 key.offset + num_bytes,
+						 key.offset, &alloc_hint);
+			BUG_ON(ret);
+
+			for (i = 0; i < nr_extents; i++) {
+				if (ext_offset >= new_extents[i].num_bytes) {
+					ext_offset -= new_extents[i].num_bytes;
+					continue;
+				}
+				extent_len = min(new_extents[i].num_bytes -
+						 ext_offset, num_bytes);
+
+				ret = btrfs_insert_empty_item(trans, root,
+							      path, &key,
+							      sizeof(*fi));
+				BUG_ON(ret);
+
+				leaf = path->nodes[0];
+				fi = btrfs_item_ptr(leaf, path->slots[0],
+						struct btrfs_file_extent_item);
+				btrfs_set_file_extent_generation(leaf, fi,
+							trans->transid);
+				btrfs_set_file_extent_type(leaf, fi,
+							BTRFS_FILE_EXTENT_REG);
+				btrfs_set_file_extent_disk_bytenr(leaf, fi,
+						new_extents[i].disk_bytenr);
+				btrfs_set_file_extent_disk_num_bytes(leaf, fi,
+						new_extents[i].disk_num_bytes);
+				btrfs_set_file_extent_num_bytes(leaf, fi,
+							extent_len);
+				ext_offset += new_extents[i].offset;
+				btrfs_set_file_extent_offset(leaf, fi,
+							ext_offset);
+				btrfs_mark_buffer_dirty(leaf);
+
+				btrfs_drop_extent_cache(inode, key.offset,
+						key.offset + extent_len - 1, 0);
+
+				ret = btrfs_inc_extent_ref(trans, root,
+						new_extents[i].disk_bytenr,
+						new_extents[i].disk_num_bytes,
+						leaf->start,
+						root->root_key.objectid,
+						trans->transid,
+						key.objectid, key.offset);
+				BUG_ON(ret);
+				btrfs_release_path(root, path);
+
+				inode->i_blocks += extent_len >> 9;
+
+				ext_offset = 0;
+				num_bytes -= extent_len;
+				key.offset += extent_len;
+
+				if (num_bytes == 0)
+					break;
+			}
+			BUG_ON(i >= nr_extents);
+		}
+
+		if (extent_locked) {
+			mutex_unlock(&BTRFS_I(inode)->extent_mutex);
+			unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
+				      lock_end, GFP_NOFS);
+			extent_locked = 0;
+		}
+skip:
+		if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS &&
+		    key.offset >= first_pos + extent_key->offset)
+			break;
+
+		cond_resched();
+	}
+	ret = 0;
+out:
+	btrfs_release_path(root, path);
+	if (inode) {
+		mutex_unlock(&inode->i_mutex);
+		if (extent_locked) {
+			mutex_unlock(&BTRFS_I(inode)->extent_mutex);
+			unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
+				      lock_end, GFP_NOFS);
+		}
+		iput(inode);
+	}
+	return ret;
+}
+
+/*
+ * Record that the extent starting at orig_bytenr (num_bytes long) now
+ * lives at new_bytenr.  The source range is marked EXTENT_LOCKED in
+ * fs_info->reloc_mapping_tree and new_bytenr is stored as the private
+ * value at orig_bytenr.  The helpers' return values are not checked;
+ * this always returns 0.
+ */
+int btrfs_add_reloc_mapping(struct btrfs_root *root, u64 orig_bytenr,
+			    u64 num_bytes, u64 new_bytenr)
+{
+	u64 range_end = orig_bytenr + num_bytes - 1;
+
+	/* mark the whole source range in the mapping tree */
+	set_extent_bits(&root->fs_info->reloc_mapping_tree, orig_bytenr,
+			range_end, EXTENT_LOCKED, GFP_NOFS);
+	/* the destination is kept as the private value at the range start */
+	set_state_private(&root->fs_info->reloc_mapping_tree, orig_bytenr,
+			  new_bytenr);
+	return 0;
+}
+
+/*
+ * Resolve orig_bytenr through the relocation mapping tree.
+ *
+ * Mappings are followed transitively: the private value found at one
+ * bytenr is looked up again until get_state_private() returns non-zero.
+ * If more than one hop was needed, orig_bytenr is re-pointed directly at
+ * the final location.  num_bytes is not used.
+ *
+ * Returns 0 and stores the final location in *new_bytenr, or -ENOENT if
+ * the lookup ends where it started.
+ */
+int btrfs_get_reloc_mapping(struct btrfs_root *root, u64 orig_bytenr,
+			    u64 num_bytes, u64 *new_bytenr)
+{
+	u64 next;
+	u64 last_hop = orig_bytenr;
+	u64 resolved = orig_bytenr;
+
+	/* follow the chain until a bytenr without a private value is hit */
+	while (!get_state_private(&root->fs_info->reloc_mapping_tree,
+				  resolved, &next)) {
+		last_hop = resolved;
+		resolved = next;
+	}
+
+	if (resolved == orig_bytenr)
+		return -ENOENT;
+
+	/* more than one hop: shortcut orig_bytenr straight to the end */
+	if (last_hop != orig_bytenr)
+		set_state_private(&root->fs_info->reloc_mapping_tree,
+				  orig_bytenr, resolved);
+
+	*new_bytenr = resolved;
+	return 0;
+}
+
+/*
+ * Clear fs_info->reloc_mapping_tree over the full u64 range, passing -1
+ * as the bit mask (presumably "every bit" - confirm against
+ * clear_extent_bits()).
+ */
+void btrfs_free_reloc_mappings(struct btrfs_root *root)
+{
+	const u64 first = 0;
+	const u64 last = (u64)-1;
+
+	clear_extent_bits(&root->fs_info->reloc_mapping_tree, first, last,
+			  -1, GFP_NOFS);
+}
+
+/*
+ * Give a leaf in a reloc tree its own cached leaf ref, cloned from the
+ * leaf ref stored for the block at orig_start.
+ *
+ * buf must belong to the running transaction and root must be a reloc
+ * tree (both enforced with BUG_ON).  Nothing is done for non-leaf
+ * buffers.
+ *
+ * Returns 0 on success (a failure to add the new ref only triggers
+ * WARN_ON), -ENOENT if no leaf ref exists for orig_start, -ENOMEM if the
+ * copy cannot be allocated.
+ */
+int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *root,
+			       struct extent_buffer *buf, u64 orig_start)
+{
+	struct btrfs_leaf_ref *src;
+	struct btrfs_leaf_ref *copy;
+	int err;
+
+	BUG_ON(btrfs_header_generation(buf) != trans->transid);
+	BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
+
+	/* only level 0 buffers get a leaf ref */
+	if (btrfs_header_level(buf) != 0)
+		return 0;
+
+	src = btrfs_lookup_leaf_ref(root, orig_start);
+	if (!src)
+		return -ENOENT;
+
+	copy = btrfs_alloc_leaf_ref(root, src->nritems);
+	if (!copy) {
+		btrfs_free_leaf_ref(root, src);
+		return -ENOMEM;
+	}
+
+	/* duplicate the extent list, then drop the lookup reference */
+	copy->nritems = src->nritems;
+	memcpy(copy->extents, src->extents,
+	       sizeof(copy->extents[0]) * copy->nritems);
+	btrfs_free_leaf_ref(root, src);
+
+	/* the identity fields come from the new buffer, not the old ref */
+	copy->root_gen = trans->transid;
+	copy->bytenr = buf->start;
+	copy->owner = btrfs_header_owner(buf);
+	copy->generation = btrfs_header_generation(buf);
+
+	err = btrfs_add_leaf_ref(root, copy, 0);
+	WARN_ON(err);
+	btrfs_free_leaf_ref(root, copy);
+	return 0;
+}
+
+/*
+ * Drop the cached extent mappings that correspond to the file extent
+ * items stored in leaf.
+ *
+ * Only EXTENT_DATA items that are not inline and have a non-zero
+ * disk_bytenr are considered.  The owning inode is looked up in
+ * target_root with btrfs_ilookup(); if the lookup returns NULL, all
+ * further items of that objectid are skipped.  For each remaining item
+ * the file range is locked in the inode's io_tree and its extent cache
+ * entries are dropped under extent_mutex.
+ *
+ * root and group are not used.  Always returns 0.
+ */
+static int noinline invalidate_extent_cache(struct btrfs_root *root,
+					struct extent_buffer *leaf,
+					struct btrfs_block_group_cache *group,
+					struct btrfs_root *target_root)
+{
+	struct btrfs_key key;
+	struct btrfs_file_extent_item *fi;
+	struct inode *inode = NULL;
+	u64 skip_objectid = 0;
+	u64 range_end;
+	u32 nritems = btrfs_header_nritems(leaf);
+	u32 slot;
+
+	for (slot = 0; slot < nritems; slot++) {
+		btrfs_item_key_to_cpu(leaf, &key, slot);
+		if (key.type != BTRFS_EXTENT_DATA_KEY ||
+		    key.objectid == skip_objectid)
+			continue;
+
+		fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+		if (btrfs_file_extent_type(leaf, fi) ==
+		    BTRFS_FILE_EXTENT_INLINE ||
+		    btrfs_file_extent_disk_bytenr(leaf, fi) == 0)
+			continue;
+
+		/* switch inodes only when the objectid changes */
+		if (!inode || inode->i_ino != key.objectid) {
+			iput(inode);
+			inode = btrfs_ilookup(target_root->fs_info->sb,
+					      key.objectid, target_root, 1);
+			if (!inode) {
+				skip_objectid = key.objectid;
+				continue;
+			}
+		}
+
+		range_end = key.offset +
+			    btrfs_file_extent_num_bytes(leaf, fi) - 1;
+
+		lock_extent(&BTRFS_I(inode)->io_tree, key.offset, range_end,
+			    GFP_NOFS);
+		mutex_lock(&BTRFS_I(inode)->extent_mutex);
+		btrfs_drop_extent_cache(inode, key.offset, range_end, 1);
+		mutex_unlock(&BTRFS_I(inode)->extent_mutex);
+		unlock_extent(&BTRFS_I(inode)->io_tree, key.offset, range_end,
+			      GFP_NOFS);
+		cond_resched();
+	}
+	iput(inode);
+	return 0;
+}
+
+/*
+ * Walk every file extent item in leaf and, for each one whose disk extent
+ * overlaps group, switch it over to the location that get_new_locations()
+ * reports for reloc_inode.
+ *
+ * For every extent that is switched, the cached leaf ref entry and the
+ * file extent item (generation, disk_bytenr, disk_num_bytes, offset) are
+ * rewritten, a reference is added on the new extent and the reference on
+ * the old extent is dropped.
+ *
+ * Allocation, lookup and reference update failures are fatal (BUG_ON).
+ * Always returns 0.
+ */
+static int noinline replace_extents_in_leaf(struct btrfs_trans_handle *trans,
+					struct btrfs_root *root,
+					struct extent_buffer *leaf,
+					struct btrfs_block_group_cache *group,
+					struct inode *reloc_inode)
+{
+	struct btrfs_key key;
+	struct btrfs_key extent_key;
+	struct btrfs_file_extent_item *fi;
+	struct btrfs_leaf_ref *ref;
+	struct disk_extent *new_extent;
+	u64 bytenr;
+	u64 num_bytes;
+	u32 nritems;
+	u32 i;
+	int ext_index;
+	int nr_extent;
+	int ret;
+
+	/* room for exactly one result; nr_extent is set to 1 per lookup */
+	new_extent = kmalloc(sizeof(*new_extent), GFP_NOFS);
+	BUG_ON(!new_extent);
+
+	ref = btrfs_lookup_leaf_ref(root, leaf->start);
+	BUG_ON(!ref);
+
+	/*
+	 * index into ref->extents[]; advanced once for every non-inline
+	 * file extent with a non-zero disk_bytenr, whether or not it lies
+	 * in the block group
+	 */
+	ext_index = -1;
+	nritems = btrfs_header_nritems(leaf);
+	for (i = 0; i < nritems; i++) {
+		btrfs_item_key_to_cpu(leaf, &key, i);
+		if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
+			continue;
+		fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
+		if (btrfs_file_extent_type(leaf, fi) ==
+		    BTRFS_FILE_EXTENT_INLINE)
+			continue;
+		bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+		num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
+		if (bytenr == 0)
+			continue;
+
+		ext_index++;
+		/* leave extents that do not overlap the block group alone */
+		if (bytenr >= group->key.objectid + group->key.offset ||
+		    bytenr + num_bytes <= group->key.objectid)
+			continue;
+
+		extent_key.objectid = bytenr;
+		extent_key.offset = num_bytes;
+		extent_key.type = BTRFS_EXTENT_ITEM_KEY;
+		nr_extent = 1;
+		ret = get_new_locations(reloc_inode, &extent_key,
+					group->key.objectid, 1,
+					&new_extent, &nr_extent);
+		/*
+		 * NOTE(review): a positive return presumably means no usable
+		 * new location was found - confirm in get_new_locations()
+		 */
+		if (ret > 0)
+			continue;
+		BUG_ON(ret < 0);
+
+		/* the cached leaf ref must still describe the old extent */
+		BUG_ON(ref->extents[ext_index].bytenr != bytenr);
+		BUG_ON(ref->extents[ext_index].num_bytes != num_bytes);
+		ref->extents[ext_index].bytenr = new_extent->disk_bytenr;
+		ref->extents[ext_index].num_bytes = new_extent->disk_num_bytes;
+
+		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
+		btrfs_set_file_extent_disk_bytenr(leaf, fi,
+						new_extent->disk_bytenr);
+		btrfs_set_file_extent_disk_num_bytes(leaf, fi,
+						new_extent->disk_num_bytes);
+		/* new offset = offset of the new location + the item's own */
+		new_extent->offset += btrfs_file_extent_offset(leaf, fi);
+		btrfs_set_file_extent_offset(leaf, fi, new_extent->offset);
+		btrfs_mark_buffer_dirty(leaf);
+
+		/* take the reference on the new extent before dropping the old */
+		ret = btrfs_inc_extent_ref(trans, root,
+					new_extent->disk_bytenr,
+					new_extent->disk_num_bytes,
+					leaf->start,
+					root->root_key.objectid,
+					trans->transid,
+					key.objectid, key.offset);
+		BUG_ON(ret);
+		ret = btrfs_free_extent(trans, root,
+					bytenr, num_bytes, leaf->start,
+					btrfs_header_owner(leaf),
+					btrfs_header_generation(leaf),
+					key.objectid, key.offset, 0);
+		BUG_ON(ret);
+		cond_resched();
+	}
+	kfree(new_extent);
+	/* every extent counted above must have had a slot in the leaf ref */
+	BUG_ON(ext_index + 1 != ref->nritems);
+	btrfs_free_leaf_ref(root, ref);
+	return 0;
+}
+
+/*
+ * Detach the reloc root from root, if it has one, and queue it on
+ * fs_info->dead_reloc_roots.  Always returns 0.
+ */
+int btrfs_free_reloc_root(struct btrfs_root *root)
+{
+	struct btrfs_root *reloc_root = root->reloc_root;
+
+	if (!reloc_root)
+		return 0;
+
+	root->reloc_root = NULL;
+	list_add(&reloc_root->dead_list, &root->fs_info->dead_reloc_roots);
+	return 0;
+}
+
+/*
+ * Drop every reloc root queued on fs_info->dead_reloc_roots.
+ *
+ * The queue is spliced onto a local list and processed from its tail.
+ * For each root, btrfs_drop_snapshot() is called under drop_mutex in a
+ * joined transaction and retried, in a fresh transaction each time, for
+ * as long as it returns -EAGAIN.  Afterwards the reference on the root
+ * node is released and the root item is deleted from the tree root.
+ *
+ * The btrfs_root structures are freed one iteration late, and
+ * btrfs_remove_leaf_refs() is only called on the last root processed.
+ * NOTE(review): presumably sufficient because reloc roots share one
+ * leaf ref tree - confirm.
+ *
+ * Always returns 0; transaction and root item failures are BUG_ON.
+ */
+int btrfs_drop_dead_reloc_roots(struct btrfs_root *root)
+{
+	struct btrfs_trans_handle *trans;
+	struct btrfs_root *reloc_root;
+	struct btrfs_root *prev_root = NULL;
+	struct list_head dead_roots;
+	int ret;
+	unsigned long nr;
+
+	INIT_LIST_HEAD(&dead_roots);
+	list_splice_init(&root->fs_info->dead_reloc_roots, &dead_roots);
+
+	while (!list_empty(&dead_roots)) {
+		reloc_root = list_entry(dead_roots.prev,
+					struct btrfs_root, dead_list);
+		list_del_init(&reloc_root->dead_list);
+
+		BUG_ON(reloc_root->commit_root != NULL);
+		while (1) {
+			trans = btrfs_join_transaction(root, 1);
+			BUG_ON(!trans);
+
+			mutex_lock(&root->fs_info->drop_mutex);
+			ret = btrfs_drop_snapshot(trans, reloc_root);
+			/*
+			 * leaves the loop with drop_mutex held and trans
+			 * still open; both are released further down.
+			 * NOTE(review): a result other than 0 or -EAGAIN is
+			 * not examined here.
+			 */
+			if (ret != -EAGAIN)
+				break;
+			mutex_unlock(&root->fs_info->drop_mutex);
+
+			/* sample blocks_used before the handle is ended */
+			nr = trans->blocks_used;
+			ret = btrfs_end_transaction(trans, root);
+			BUG_ON(ret);
+			btrfs_btree_balance_dirty(root, nr);
+		}
+
+		free_extent_buffer(reloc_root->node);
+
+		ret = btrfs_del_root(trans, root->fs_info->tree_root,
+				     &reloc_root->root_key);
+		BUG_ON(ret);
+		mutex_unlock(&root->fs_info->drop_mutex);
+
+		nr = trans->blocks_used;
+		ret = btrfs_end_transaction(trans, root);
+		BUG_ON(ret);
+		btrfs_btree_balance_dirty(root, nr);
+
+		/* free the previous root now; keep this one for one more turn */
+		kfree(prev_root);
+		prev_root = reloc_root;
+	}
+	if (prev_root) {
+		btrfs_remove_leaf_refs(prev_root, (u64)-1, 0);
+		kfree(prev_root);
+	}
+	return 0;
+}
+
+/*
+ * Queue root on fs_info->dead_reloc_roots via its dead_list link.
+ * Always returns 0.
+ */
+int btrfs_add_dead_reloc_root(struct btrfs_root *root)
+{
+	struct list_head *dead_roots = &root->fs_info->dead_reloc_roots;
+
+	list_add(&root->dead_list, dead_roots);
+	return 0;
+}
+
+/*
+ * Clean up relocation leftovers.
+ *
+ * Under tree_reloc_mutex, btrfs_find_dead_roots() is run for
+ * BTRFS_TREE_RELOC_OBJECTID.  If that leaves anything queued on
+ * fs_info->dead_reloc_roots, a transaction is started and immediately
+ * committed (presumably so the commit path disposes of the queued roots -
+ * confirm).  Finally orphan cleanup is run on the data relocation tree.
+ *
+ * Every failure is fatal (BUG_ON).  Always returns 0.
+ */
+int btrfs_cleanup_reloc_trees(struct btrfs_root *root)
+{
+	struct btrfs_root *reloc_root;
+	struct btrfs_trans_handle *trans;
+	struct btrfs_key location;
+	int found;
+	int ret;
+
+	mutex_lock(&root->fs_info->tree_reloc_mutex);
+	ret = btrfs_find_dead_roots(root, BTRFS_TREE_RELOC_OBJECTID, NULL);
+	BUG_ON(ret);
+	found = !list_empty(&root->fs_info->dead_reloc_roots);
+	mutex_unlock(&root->fs_info->tree_reloc_mutex);
+
+	if (found) {
+		trans = btrfs_start_transaction(root, 1);
+		BUG_ON(!trans);
+		ret = btrfs_commit_transaction(trans, root);
+		BUG_ON(ret);
+	}
+
+	location.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID;
+	location.offset = (u64)-1;
+	location.type = BTRFS_ROOT_ITEM_KEY;
+
+	reloc_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
+	/*
+	 * Catch error pointers as well as NULL: create_reloc_inode() checks
+	 * the result of this same lookup with IS_ERR(), so a NULL-only test
+	 * would let a failed lookup through to btrfs_orphan_cleanup().
+	 */
+	BUG_ON(!reloc_root || IS_ERR(reloc_root));
+	btrfs_orphan_cleanup(reloc_root);
+	return 0;
+}
+
+/*
+ * Create the reloc tree for root unless it already has one.
+ *
+ * root->commit_root is copied into a new tree block owned by
+ * BTRFS_TREE_RELOC_OBJECTID.  A root item cloned from root->root_item
+ * (with refs set to 0, bytenr/level pointing at the copy and the drop
+ * progress cleared) is inserted into the tree root under the key
+ * (BTRFS_TREE_RELOC_OBJECTID, ROOT_ITEM, objectid of root).  The new root
+ * is then read back and attached as root->reloc_root.
+ *
+ * root must be reference counted (ref_cows).  Every failure is fatal
+ * (BUG_ON).  Always returns 0.
+ */
+static int noinline init_reloc_tree(struct btrfs_trans_handle *trans,
+				    struct btrfs_root *root)
+{
+	struct btrfs_root *reloc_root;
+	struct extent_buffer *eb;
+	struct btrfs_root_item *root_item;
+	struct btrfs_key root_key;
+	int ret;
+
+	BUG_ON(!root->ref_cows);
+	if (root->reloc_root)
+		return 0;
+
+	root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
+	BUG_ON(!root_item);
+
+	ret = btrfs_copy_root(trans, root, root->commit_root,
+			      &eb, BTRFS_TREE_RELOC_OBJECTID);
+	BUG_ON(ret);
+
+	root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
+	root_key.offset = root->root_key.objectid;
+	root_key.type = BTRFS_ROOT_ITEM_KEY;
+
+	/*
+	 * Copy the whole root item.  sizeof(root_item) would be the size of
+	 * the pointer, leaving most of the kmalloc'ed item uninitialised
+	 * when it is written to the tree root below.
+	 */
+	memcpy(root_item, &root->root_item, sizeof(*root_item));
+	btrfs_set_root_refs(root_item, 0);
+	btrfs_set_root_bytenr(root_item, eb->start);
+	btrfs_set_root_level(root_item, btrfs_header_level(eb));
+	memset(&root_item->drop_progress, 0, sizeof(root_item->drop_progress));
+	root_item->drop_level = 0;
+
+	/* everything needed from the copied root block has been read */
+	btrfs_tree_unlock(eb);
+	free_extent_buffer(eb);
+
+	ret = btrfs_insert_root(trans, root->fs_info->tree_root,
+				&root_key, root_item);
+	BUG_ON(ret);
+	kfree(root_item);
+
+	reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
+						 &root_key);
+	BUG_ON(!reloc_root);
+	reloc_root->last_trans = trans->transid;
+	reloc_root->commit_root = NULL;
+	/* the new reloc root uses the fs-wide reloc leaf ref tree */
+	reloc_root->ref_tree = &root->fs_info->reloc_ref_tree;
+
+	root->reloc_root = reloc_root;
+	return 0;
+}
+
+/*
+ * Core function of space balance.
+ *
+ * The idea is to use reloc trees to relocate tree blocks in reference
+ * counted roots. There is one reloc tree for each subvol, and all reloc
+ * trees share the same key objectid. Reloc trees are snapshots of the
+ * latest committed roots (subvol root->commit_root). To relocate a tree
+ * block referenced by a subvol, the code COWs the block through the reloc
+ * tree, then updates the pointer in the subvol to point to the new block.
+ * Since all reloc trees share the same key objectid, we can easily do
+ * special handling to share tree blocks between reloc trees. Once a tree
+ * block has been COWed in one reloc tree, we can use the result when the
+ * same block is COWed again through other reloc trees.
+ *
+ * first_key is the key used to search down to the block being relocated.
+ * group and reloc_inode are only handed on to replace_extents_in_leaf()
+ * and invalidate_extent_cache(), which run when ref_path describes a
+ * data extent (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID).
+ *
+ * Failures are fatal (BUG_ON).  Always returns 0.
+ */
+static int noinline relocate_one_path(struct btrfs_trans_handle *trans,
+				      struct btrfs_root *root,
+				      struct btrfs_path *path,
+				      struct btrfs_key *first_key,
+				      struct btrfs_ref_path *ref_path,
+				      struct btrfs_block_group_cache *group,
+				      struct inode *reloc_inode)
+{
+	struct btrfs_root *reloc_root;
+	struct extent_buffer *eb = NULL;
+	struct btrfs_key *keys;
+	u64 *nodes;
+	int level;
+	int lowest_merge;
+	int lowest_level = 0;
+	int update_refs;
+	int ret;
+
+	/* an owner below BTRFS_FIRST_FREE_OBJECTID is used as a tree level */
+	if (ref_path->owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
+		lowest_level = ref_path->owner_objectid;
+
+	/* COW-only roots: a COWing search down to lowest_level is all we do */
+	if (is_cowonly_root(ref_path->root_objectid)) {
+		path->lowest_level = lowest_level;
+		ret = btrfs_search_slot(trans, root, first_key, path, 0, 1);
+		BUG_ON(ret < 0);
+		path->lowest_level = 0;
+		btrfs_release_path(root, path);
+		return 0;
+	}
+
+	/* one key and one block number per tree level, for btrfs_merge_path */
+	keys = kzalloc(sizeof(*keys) * BTRFS_MAX_LEVEL, GFP_NOFS);
+	BUG_ON(!keys);
+	nodes = kzalloc(sizeof(*nodes) * BTRFS_MAX_LEVEL, GFP_NOFS);
+	BUG_ON(!nodes);
+
+	mutex_lock(&root->fs_info->tree_reloc_mutex);
+	ret = init_reloc_tree(trans, root);
+	BUG_ON(ret);
+	reloc_root = root->reloc_root;
+
+	/* first pass over the reloc tree is a plain lookup (no COW) */
+	path->lowest_level = lowest_level;
+	ret = btrfs_search_slot(trans, reloc_root, first_key, path, 0, 0);
+	BUG_ON(ret);
+	/*
+	 * get relocation mapping for tree blocks in the path
+	 */
+	lowest_merge = BTRFS_MAX_LEVEL;
+	for (level = BTRFS_MAX_LEVEL - 1; level >= lowest_level; level--) {
+		u64 new_bytenr;
+		eb = path->nodes[level];
+		if (!eb || eb == reloc_root->node)
+			continue;
+		ret = btrfs_get_reloc_mapping(reloc_root, eb->start, eb->len,
+					      &new_bytenr);
+		if (ret)
+			continue;
+		if (level == 0)
+			btrfs_item_key_to_cpu(eb, &keys[level], 0);
+		else
+			btrfs_node_key_to_cpu(eb, &keys[level], 0);
+		nodes[level] = new_bytenr;
+		lowest_merge = level;
+	}
+
+	/*
+	 * for data extents, remember whether the leaf predates this
+	 * transaction; only then are its extent pointers rewritten below
+	 */
+	update_refs = 0;
+	if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
+		eb = path->nodes[0];
+		if (btrfs_header_generation(eb) < trans->transid)
+			update_refs = 1;
+	}
+
+	btrfs_release_path(reloc_root, path);
+	/*
+	 * merge tree blocks that already relocated in other reloc trees
+	 */
+	if (lowest_merge != BTRFS_MAX_LEVEL) {
+		ret = btrfs_merge_path(trans, reloc_root, keys, nodes,
+				       lowest_merge);
+		BUG_ON(ret < 0);
+	}
+	/*
+	 * cow any tree blocks that still haven't been relocated
+	 */
+	ret = btrfs_search_slot(trans, reloc_root, first_key, path, 0, 1);
+	BUG_ON(ret);
+	/*
+	 * if we are relocating data block group, update extent pointers
+	 * in the newly created tree leaf.
+	 */
+	eb = path->nodes[0];
+	if (update_refs && nodes[0] != eb->start) {
+		ret = replace_extents_in_leaf(trans, reloc_root, eb, group,
+					      reloc_inode);
+		BUG_ON(ret);
+	}
+
+	/* rebuild keys/nodes from the reloc tree path as it stands now */
+	memset(keys, 0, sizeof(*keys) * BTRFS_MAX_LEVEL);
+	memset(nodes, 0, sizeof(*nodes) * BTRFS_MAX_LEVEL);
+	for (level = BTRFS_MAX_LEVEL - 1; level >= lowest_level; level--) {
+		eb = path->nodes[level];
+		if (!eb || eb == reloc_root->node)
+			continue;
+		BUG_ON(btrfs_header_owner(eb) != BTRFS_TREE_RELOC_OBJECTID);
+		nodes[level] = eb->start;
+		if (level == 0)
+			btrfs_item_key_to_cpu(eb, &keys[level], 0);
+		else
+			btrfs_node_key_to_cpu(eb, &keys[level], 0);
+	}
+
+	/*
+	 * keep a reference on the leaf across btrfs_release_path(); it is
+	 * dropped with free_extent_buffer() once the extent cache has been
+	 * invalidated
+	 */
+	if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
+		eb = path->nodes[0];
+		extent_buffer_get(eb);
+	}
+	btrfs_release_path(reloc_root, path);
+	/*
+	 * replace tree blocks in the fs tree with tree blocks in
+	 * the reloc tree.
+	 */
+	ret = btrfs_merge_path(trans, root, keys, nodes, lowest_level);
+	BUG_ON(ret < 0);
+
+	if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
+		ret = invalidate_extent_cache(reloc_root, eb, group, root);
+		BUG_ON(ret);
+		free_extent_buffer(eb);
+	}
+	mutex_unlock(&root->fs_info->tree_reloc_mutex);
+
+	path->lowest_level = 0;
+	kfree(nodes);
+	kfree(keys);
+	return 0;
+}
+
+/*
+ * Relocate one tree block by running relocate_one_path() without a block
+ * group or reloc inode.
+ *
+ * For the extent, chunk and device roots the call is made with
+ * fs_info->alloc_mutex held; for the extent root btrfs_extent_post_op()
+ * is run afterwards.  A failure in relocate_one_path() is fatal
+ * (BUG_ON).  Always returns 0.
+ */
+static int noinline relocate_tree_block(struct btrfs_trans_handle *trans,
+					struct btrfs_root *root,
+					struct btrfs_path *path,
+					struct btrfs_key *first_key,
+					struct btrfs_ref_path *ref_path)
+{
+	struct btrfs_fs_info *info = root->fs_info;
+	int locked = (root == info->extent_root ||
+		      root == info->chunk_root ||
+		      root == info->dev_root);
+	int err;
+
+	if (locked)
+		mutex_lock(&info->alloc_mutex);
+
+	err = relocate_one_path(trans, root, path, first_key,
+				ref_path, NULL, NULL);
+	BUG_ON(err);
+
+	if (root == info->extent_root)
+		btrfs_extent_post_op(trans, root);
+	if (locked)
+		mutex_unlock(&info->alloc_mutex);
+
+	return 0;
+}
+
+/*
+ * Delete the item at extent_key from the extent tree, under
+ * fs_info->alloc_mutex.
+ *
+ * Returns the non-zero result of btrfs_search_slot() if the search does
+ * not return 0, otherwise the result of btrfs_del_item().
+ */
+static int noinline del_extent_zero(struct btrfs_trans_handle *trans,
+				    struct btrfs_root *extent_root,
+				    struct btrfs_path *path,
+				    struct btrfs_key *extent_key)
+{
+	int err;
+
+	mutex_lock(&extent_root->fs_info->alloc_mutex);
+
+	err = btrfs_search_slot(trans, extent_root, extent_key, path, -1, 1);
+	if (err == 0)
+		err = btrfs_del_item(trans, extent_root, path);
+
+	btrfs_release_path(extent_root, path);
+	mutex_unlock(&extent_root->fs_info->alloc_mutex);
+	return err;
+}
+
+/*
+ * Look up the root that ref_path is rooted in.  The root item key uses
+ * offset 0 for COW-only roots and (u64)-1 for all others.  Returns
+ * whatever btrfs_read_fs_root_no_name() returns.
+ */
+static struct btrfs_root noinline *read_ref_root(struct btrfs_fs_info *fs_info,
+						struct btrfs_ref_path *ref_path)
+{
+	struct btrfs_key location;
+
+	location.objectid = ref_path->root_objectid;
+	location.type = BTRFS_ROOT_ITEM_KEY;
+	location.offset = is_cowonly_root(ref_path->root_objectid) ?
+			  0 : (u64)-1;
+
+	return btrfs_read_fs_root_no_name(fs_info, &location);
+}
+
+/*
+ * Relocate a single extent out of group.
+ *
+ * Entered with fs_info->alloc_mutex held; the mutex is dropped on entry
+ * and taken again before returning.  The work runs in a transaction
+ * handle started on extent_root.
+ *
+ * A key with objectid 0 is simply deleted from the extent tree.
+ * Otherwise every reference path to the extent is walked with
+ * btrfs_first_ref_path()/btrfs_next_ref_path() and handled according to
+ * its owner and to pass:
+ *   - paths rooted in the log tree or in a reloc tree are skipped;
+ *   - for reference counted roots, only paths rooted at the latest
+ *     committed root are processed;
+ *   - a data extent in pass 0 is copied with relocate_data_extent()
+ *     and the walk stops;
+ *   - a data extent in pass >= 2 goes through replace_one_extent();
+ *   - tree blocks go through relocate_tree_block() and the remaining
+ *     data extent references through relocate_one_path().
+ *
+ * Returns 0 on success or a negative value from one of the helpers
+ * (-ENOMEM if the ref path cannot be allocated).  For the objectid 0
+ * case the result of del_extent_zero() is returned as is.
+ */
+static int noinline relocate_one_extent(struct btrfs_root *extent_root,
+					struct btrfs_path *path,
+					struct btrfs_key *extent_key,
+					struct btrfs_block_group_cache *group,
+					struct inode *reloc_inode, int pass)
+{
+	struct btrfs_trans_handle *trans;
+	struct btrfs_root *found_root;
+	struct btrfs_ref_path *ref_path = NULL;
+	struct disk_extent *new_extents = NULL;
+	int nr_extents = 0;
+	int loops;
+	int ret;
+	int level;
+	struct btrfs_key first_key;
+	u64 prev_block = 0;
+
+	mutex_unlock(&extent_root->fs_info->alloc_mutex);
+
+	trans = btrfs_start_transaction(extent_root, 1);
+	BUG_ON(!trans);
+
+	if (extent_key->objectid == 0) {
+		ret = del_extent_zero(trans, extent_root, path, extent_key);
+		goto out;
+	}
+
+	ref_path = kmalloc(sizeof(*ref_path), GFP_NOFS);
+	if (!ref_path) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	for (loops = 0; ; loops++) {
+		if (loops == 0) {
+			ret = btrfs_first_ref_path(trans, extent_root, ref_path,
+						   extent_key->objectid);
+		} else {
+			ret = btrfs_next_ref_path(trans, extent_root, ref_path);
+		}
+		if (ret < 0)
+			goto out;
+		/* a positive return ends the walk */
+		if (ret > 0)
+			break;
+
+		if (ref_path->root_objectid == BTRFS_TREE_LOG_OBJECTID ||
+		    ref_path->root_objectid == BTRFS_TREE_RELOC_OBJECTID)
+			continue;
+
+		found_root = read_ref_root(extent_root->fs_info, ref_path);
+		/*
+		 * Catch error pointers as well as NULL: create_reloc_inode()
+		 * checks the result of the same root lookup with IS_ERR(),
+		 * so a NULL-only test would let a failed lookup be
+		 * dereferenced just below.
+		 */
+		BUG_ON(!found_root || IS_ERR(found_root));
+		/*
+		 * for reference counted tree, only process reference paths
+		 * rooted at the latest committed root.
+		 */
+		if (found_root->ref_cows &&
+		    ref_path->root_generation != found_root->root_key.offset)
+			continue;
+
+		if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
+			if (pass == 0) {
+				/*
+				 * copy data extents to new locations
+				 */
+				u64 group_start = group->key.objectid;
+				ret = relocate_data_extent(reloc_inode,
+							   extent_key,
+							   group_start);
+				if (ret < 0)
+					goto out;
+				break;
+			}
+			level = 0;
+		} else {
+			level = ref_path->owner_objectid;
+		}
+
+		/*
+		 * first_key is only re-read when the block at this level
+		 * differs from the one seen on the previous path
+		 */
+		if (prev_block != ref_path->nodes[level]) {
+			struct extent_buffer *eb;
+			u64 block_start = ref_path->nodes[level];
+			u64 block_size = btrfs_level_size(found_root, level);
+
+			eb = read_tree_block(found_root, block_start,
+					     block_size, 0);
+			btrfs_tree_lock(eb);
+			BUG_ON(level != btrfs_header_level(eb));
+
+			if (level == 0)
+				btrfs_item_key_to_cpu(eb, &first_key, 0);
+			else
+				btrfs_node_key_to_cpu(eb, &first_key, 0);
+
+			btrfs_tree_unlock(eb);
+			free_extent_buffer(eb);
+			prev_block = block_start;
+		}
+
+		if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID &&
+		    pass >= 2) {
+			/*
+			 * use fallback method to process the remaining
+			 * references.
+			 */
+			if (!new_extents) {
+				u64 group_start = group->key.objectid;
+				ret = get_new_locations(reloc_inode,
+							extent_key,
+							group_start, 0,
+							&new_extents,
+							&nr_extents);
+				if (ret < 0)
+					goto out;
+			}
+			btrfs_record_root_in_trans(found_root);
+			ret = replace_one_extent(trans, found_root,
+						path, extent_key,
+						&first_key, ref_path,
+						new_extents, nr_extents);
+			if (ret < 0)
+				goto out;
+			continue;
+		}
+
+		btrfs_record_root_in_trans(found_root);
+		if (ref_path->owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
+			ret = relocate_tree_block(trans, found_root, path,
+						  &first_key, ref_path);
+		} else {
+			/*
+			 * try to update data extent references while
+			 * keeping metadata shared between snapshots.
+			 */
+			ret = relocate_one_path(trans, found_root, path,
+						&first_key, ref_path,
+						group, reloc_inode);
+		}
+		if (ret < 0)
+			goto out;
+	}
+	ret = 0;
+out:
+	btrfs_end_transaction(trans, extent_root);
+	kfree(new_extents);
+	kfree(ref_path);
+	/* hand alloc_mutex back to the caller in the state it was given */
+	mutex_lock(&extent_root->fs_info->alloc_mutex);
+	return ret;
+}
+
 static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
 {
 	u64 num_devices;
@@ -3686,84 +4780,155 @@ int __alloc_chunk_for_shrink(struct btrfs_root *root,
 	return 0;
 }
 
-int btrfs_shrink_extent_tree(struct btrfs_root *root, u64 shrink_start)
+/*
+ * Insert a bare inode item for objectid into root.
+ *
+ * The item is zeroed and then given generation 1, the requested size,
+ * mode S_IFREG | 0600 and the BTRFS_INODE_NODATASUM flag.  No orphan
+ * item is added here.
+ *
+ * Returns 0 on success, -ENOMEM if no path can be allocated, or the
+ * error from btrfs_insert_empty_inode().
+ */
+static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
+				 struct btrfs_root *root,
+				 u64 objectid, u64 size)
+{
+	struct btrfs_inode_item *ii;
+	struct extent_buffer *eb;
+	struct btrfs_path *path = btrfs_alloc_path();
+	int err;
+
+	if (!path)
+		return -ENOMEM;
+
+	err = btrfs_insert_empty_inode(trans, root, path, objectid);
+	if (!err) {
+		eb = path->nodes[0];
+		ii = btrfs_item_ptr(eb, path->slots[0],
+				    struct btrfs_inode_item);
+		/* start from an all-zero item, then fill in what matters */
+		memset_extent_buffer(eb, 0, (unsigned long)ii, sizeof(*ii));
+		btrfs_set_inode_generation(eb, ii, 1);
+		btrfs_set_inode_size(eb, ii, size);
+		btrfs_set_inode_mode(eb, ii, S_IFREG | 0600);
+		btrfs_set_inode_flags(eb, ii, BTRFS_INODE_NODATASUM);
+		btrfs_mark_buffer_dirty(eb);
+		btrfs_release_path(root, path);
+	}
+
+	btrfs_free_path(path);
+	return err;
+}
+
+/*
+ * Create the inode that relocation of group works through.
+ *
+ * In the data relocation tree a free objectid is picked, an inode item
+ * sized to the block group (group->key.offset) is inserted, followed by
+ * a single file extent item with disk_bytenr 0 covering that size.  The
+ * inode is then instantiated and an orphan item is added for it.
+ *
+ * Returns the inode, or an ERR_PTR if the root lookup, the objectid
+ * search or btrfs_orphan_add() fails.  Other failures are fatal
+ * (BUG_ON).
+ */
+static struct inode noinline *create_reloc_inode(struct btrfs_fs_info *fs_info,
+					struct btrfs_block_group_cache *group)
+{
+	struct btrfs_trans_handle *trans;
+	struct btrfs_root *root;
+	struct btrfs_key root_key;
+	struct inode *inode = NULL;
+	u64 objectid = BTRFS_FIRST_FREE_OBJECTID;
+	int err;
+
+	root_key.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID;
+	root_key.offset = (u64)-1;
+	root_key.type = BTRFS_ROOT_ITEM_KEY;
+
+	root = btrfs_read_fs_root_no_name(fs_info, &root_key);
+	if (IS_ERR(root))
+		return ERR_CAST(root);
+
+	trans = btrfs_start_transaction(root, 1);
+	BUG_ON(!trans);
+
+	err = btrfs_find_free_objectid(trans, root, objectid, &objectid);
+	if (err)
+		goto out;
+
+	err = __insert_orphan_inode(trans, root, objectid, group->key.offset);
+	BUG_ON(err);
+
+	err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0,
+				       group->key.offset, 0);
+	BUG_ON(err);
+
+	inode = btrfs_iget_locked(root->fs_info->sb, objectid, root);
+	/* anything but a freshly set up (I_NEW) inode is treated as a bug */
+	BUG_ON(!(inode->i_state & I_NEW));
+
+	BTRFS_I(inode)->root = root;
+	BTRFS_I(inode)->location.objectid = objectid;
+	BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
+	BTRFS_I(inode)->location.offset = 0;
+	btrfs_read_locked_inode(inode);
+	unlock_new_inode(inode);
+	BUG_ON(is_bad_inode(inode));
+
+	err = btrfs_orphan_add(trans, inode);
+out:
+	btrfs_end_transaction(trans, root);
+	if (!err)
+		return inode;
+
+	if (inode)
+		iput(inode);
+	return ERR_PTR(err);
+}
+
+int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start)
 {
 	struct btrfs_trans_handle *trans;
-	struct btrfs_root *tree_root = root->fs_info->tree_root;
 	struct btrfs_path *path;
+	struct btrfs_fs_info *info = root->fs_info;
+	struct extent_buffer *leaf;
+	struct inode *reloc_inode;
+	struct btrfs_block_group_cache *block_group;
+	struct btrfs_key key;
 	u64 cur_byte;
 	u64 total_found;
-	u64 shrink_last_byte;
-	struct btrfs_block_group_cache *shrink_block_group;
-	struct btrfs_key key;
-	struct btrfs_key found_key;
-	struct extent_buffer *leaf;
 	u32 nritems;
 	int ret;
 	int progress;
+	int pass = 0;
 
-	mutex_lock(&root->fs_info->alloc_mutex);
-	shrink_block_group = btrfs_lookup_block_group(root->fs_info,
-						      shrink_start);
-	BUG_ON(!shrink_block_group);
+	root = root->fs_info->extent_root;
+
+	block_group = btrfs_lookup_block_group(info, group_start);
+	BUG_ON(!block_group);
 
-	shrink_last_byte = shrink_block_group->key.objectid +
-		shrink_block_group->key.offset;
+	printk("btrfs relocating block group %llu flags %llu\n",
+	       (unsigned long long)block_group->key.objectid,
+	       (unsigned long long)block_group->flags);
 
-	shrink_block_group->space_info->total_bytes -=
-		shrink_block_group->key.offset;
 	path = btrfs_alloc_path();
-	root = root->fs_info->extent_root;
-	path->reada = 2;
+	BUG_ON(!path);
 
-	printk("btrfs relocating block group %llu flags %llu\n",
-	       (unsigned long long)shrink_start,
-	       (unsigned long long)shrink_block_group->flags);
+	reloc_inode = create_reloc_inode(info, block_group);
+	BUG_ON(IS_ERR(reloc_inode));
 
-	__alloc_chunk_for_shrink(root, shrink_block_group, 1);
+	mutex_lock(&root->fs_info->alloc_mutex);
 
-again:
+	__alloc_chunk_for_shrink(root, block_group, 1);
+	block_group->ro = 1;
+	block_group->space_info->total_bytes -= block_group->key.offset;
 
-	shrink_block_group->ro = 1;
+	mutex_unlock(&root->fs_info->alloc_mutex);
 
+	btrfs_start_delalloc_inodes(info->tree_root);
+	btrfs_wait_ordered_extents(info->tree_root, 0);
+again:
 	total_found = 0;
 	progress = 0;
-	key.objectid = shrink_start;
+	key.objectid = block_group->key.objectid;
 	key.offset = 0;
 	key.type = 0;
 	cur_byte = key.objectid;
 
-	mutex_unlock(&root->fs_info->alloc_mutex);
+	trans = btrfs_start_transaction(info->tree_root, 1);
+	btrfs_commit_transaction(trans, info->tree_root);
 
-	btrfs_start_delalloc_inodes(root);
-	btrfs_wait_ordered_extents(tree_root, 0);
+	mutex_lock(&root->fs_info->cleaner_mutex);
+	btrfs_clean_old_snapshots(info->tree_root);
+	btrfs_remove_leaf_refs(info->tree_root, (u64)-1, 1);
+	mutex_unlock(&root->fs_info->cleaner_mutex);
 
 	mutex_lock(&root->fs_info->alloc_mutex);
 
-	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
-	if (ret < 0)
-		goto out;
-
-	ret = btrfs_previous_item(root, path, 0, BTRFS_EXTENT_ITEM_KEY);
-	if (ret < 0)
-		goto out;
-
-	if (ret == 0) {
-		leaf = path->nodes[0];
-		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
-		if (found_key.objectid + found_key.offset > shrink_start &&
-		    found_key.objectid < shrink_last_byte) {
-			cur_byte = found_key.objectid;
-			key.objectid = cur_byte;
-		}
-	}
-	btrfs_release_path(root, path);
-
 	while(1) {
 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 		if (ret < 0)
 			goto out;
-
 next:
 		leaf = path->nodes[0];
 		nritems = btrfs_header_nritems(leaf);
@@ -3779,109 +4944,76 @@ next:
 			nritems = btrfs_header_nritems(leaf);
 		}
 
-		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
 
-		if (found_key.objectid >= shrink_last_byte)
+		if (key.objectid >= block_group->key.objectid +
+		    block_group->key.offset)
 			break;
 
 		if (progress && need_resched()) {
-			memcpy(&key, &found_key, sizeof(key));
-			cond_resched();
 			btrfs_release_path(root, path);
-			btrfs_search_slot(NULL, root, &key, path, 0, 0);
+			mutex_unlock(&root->fs_info->alloc_mutex);
+			cond_resched();
+			mutex_lock(&root->fs_info->alloc_mutex);
 			progress = 0;
-			goto next;
+			continue;
 		}
 		progress = 1;
 
-		if (btrfs_key_type(&found_key) != BTRFS_EXTENT_ITEM_KEY ||
-		    found_key.objectid + found_key.offset <= cur_byte) {
-			memcpy(&key, &found_key, sizeof(key));
-			key.offset++;
+		if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY ||
+		    key.objectid + key.offset <= cur_byte) {
 			path->slots[0]++;
 			goto next;
 		}
 
 		total_found++;
-		cur_byte = found_key.objectid + found_key.offset;
-		key.objectid = cur_byte;
+		cur_byte = key.objectid + key.offset;
 		btrfs_release_path(root, path);
-		ret = relocate_one_extent(root, path, &found_key);
-		__alloc_chunk_for_shrink(root, shrink_block_group, 0);
-	}
-
-	btrfs_release_path(root, path);
-
-	if (total_found > 0) {
-		printk("btrfs relocate found %llu last extent was %llu\n",
-		       (unsigned long long)total_found,
-		       (unsigned long long)found_key.objectid);
-		mutex_unlock(&root->fs_info->alloc_mutex);
-		trans = btrfs_start_transaction(tree_root, 1);
-		btrfs_commit_transaction(trans, tree_root);
 
-		btrfs_clean_old_snapshots(tree_root);
+		__alloc_chunk_for_shrink(root, block_group, 0);
+		ret = relocate_one_extent(root, path, &key, block_group,
+					  reloc_inode, pass);
+		BUG_ON(ret < 0);
 
-		btrfs_start_delalloc_inodes(root);
-		btrfs_wait_ordered_extents(tree_root, 0);
-
-		trans = btrfs_start_transaction(tree_root, 1);
-		btrfs_commit_transaction(trans, tree_root);
-		mutex_lock(&root->fs_info->alloc_mutex);
-		goto again;
+		key.objectid = cur_byte;
+		key.type = 0;
+		key.offset = 0;
 	}
 
-	/*
-	 * we've freed all the extents, now remove the block
-	 * group item from the tree
-	 */
+	btrfs_release_path(root, path);
 	mutex_unlock(&root->fs_info->alloc_mutex);
 
-	trans = btrfs_start_transaction(root, 1);
-
-	mutex_lock(&root->fs_info->alloc_mutex);
-	memcpy(&key, &shrink_block_group->key, sizeof(key));
-
-	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
-	if (ret > 0)
-		ret = -EIO;
-	if (ret < 0) {
-		btrfs_end_transaction(trans, root);
-		goto out;
+	if (pass == 0) {
+		btrfs_wait_ordered_range(reloc_inode, 0, (u64)-1);
+		invalidate_mapping_pages(reloc_inode->i_mapping, 0, -1);
+		WARN_ON(reloc_inode->i_mapping->nrpages);
 	}
 
-	spin_lock(&root->fs_info->block_group_cache_lock);
-	rb_erase(&shrink_block_group->cache_node,
-		 &root->fs_info->block_group_cache_tree);
-	spin_unlock(&root->fs_info->block_group_cache_lock);
-
-	ret = btrfs_remove_free_space(shrink_block_group, key.objectid,
-				      key.offset);
-	if (ret) {
-		btrfs_end_transaction(trans, root);
-		goto out;
+	if (total_found > 0) {
+		printk("btrfs found %llu extents in pass %d\n",
+		       (unsigned long long)total_found, pass);
+		pass++;
+		goto again;
 	}
-	/*
-	memset(shrink_block_group, 0, sizeof(*shrink_block_group));
-	kfree(shrink_block_group);
-	*/
 
-	btrfs_del_item(trans, root, path);
-	btrfs_release_path(root, path);
-	mutex_unlock(&root->fs_info->alloc_mutex);
-	btrfs_commit_transaction(trans, root);
+	/* delete reloc_inode */
+	iput(reloc_inode);
+
+	/* unpin extents in this range */
+	trans = btrfs_start_transaction(info->tree_root, 1);
+	btrfs_commit_transaction(trans, info->tree_root);
 
 	mutex_lock(&root->fs_info->alloc_mutex);
 
-	/* the code to unpin extents might set a few bits in the free
-	 * space cache for this range again
-	 */
-	/* XXX? */
-	ret = btrfs_remove_free_space(shrink_block_group, key.objectid,
-				      key.offset);
+	spin_lock(&block_group->lock);
+	WARN_ON(block_group->pinned > 0);
+	WARN_ON(block_group->reserved > 0);
+	WARN_ON(btrfs_block_group_used(&block_group->item) > 0);
+	spin_unlock(&block_group->lock);
+	ret = 0;
 out:
-	btrfs_free_path(path);
 	mutex_unlock(&root->fs_info->alloc_mutex);
+	btrfs_free_path(path);
 	return ret;
 }
 
@@ -3922,6 +5054,33 @@ out:
 	return ret;
 }
 
+/* empty the block group cache tree, freeing every entry; always returns 0 */
+int btrfs_free_block_groups(struct btrfs_fs_info *info)
+{
+	struct btrfs_block_group_cache *block_group;
+	struct rb_node *n;
+
+	mutex_lock(&info->alloc_mutex);
+	spin_lock(&info->block_group_cache_lock);
+	while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
+		block_group = rb_entry(n, struct btrfs_block_group_cache,
+				       cache_node);
+		/* don't hold the tree spinlock across the cache removal */
+		spin_unlock(&info->block_group_cache_lock);
+		btrfs_remove_free_space_cache(block_group);
+		spin_lock(&info->block_group_cache_lock);
+		rb_erase(&block_group->cache_node,
+			 &info->block_group_cache_tree);
+		spin_lock(&block_group->space_info->lock);
+		list_del(&block_group->list);
+		spin_unlock(&block_group->space_info->lock);
+		kfree(block_group);
+	}
+	spin_unlock(&info->block_group_cache_lock);
+	mutex_unlock(&info->alloc_mutex);
+	return 0;
+}
+
 int btrfs_read_block_groups(struct btrfs_root *root)
 {
 	struct btrfs_path *path;
@@ -4039,3 +5198,46 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 
 	return 0;
 }
+
+/* drop a block group from the in-memory caches and delete its item from
+ * the extent tree.  alloc_mutex must be held; the struct is not freed here.
+ * returns -EIO if the item is not found, or the search / delete result
+ */
+int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
+			     struct btrfs_root *root, u64 group_start)
+{
+	struct btrfs_path *path;
+	struct btrfs_block_group_cache *block_group;
+	struct btrfs_key key;
+	int ret;
+
+	BUG_ON(!mutex_is_locked(&root->fs_info->alloc_mutex));
+	root = root->fs_info->extent_root;
+
+	block_group = btrfs_lookup_block_group(root->fs_info, group_start);
+	BUG_ON(!block_group);
+
+	memcpy(&key, &block_group->key, sizeof(key));
+	path = btrfs_alloc_path();
+	BUG_ON(!path);
+
+	btrfs_remove_free_space_cache(block_group);
+	spin_lock(&root->fs_info->block_group_cache_lock);
+	rb_erase(&block_group->cache_node,
+		 &root->fs_info->block_group_cache_tree);
+	spin_unlock(&root->fs_info->block_group_cache_lock);
+	spin_lock(&block_group->space_info->lock);
+	list_del(&block_group->list);
+	spin_unlock(&block_group->space_info->lock);
+
+	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+	if (ret > 0)
+		ret = -EIO;
+	if (ret < 0)
+		goto out;
+
+	ret = btrfs_del_item(trans, root, path);
+out:
+	btrfs_free_path(path);
+	return ret;
+}
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index e3984f902e71..0091c01abb06 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -210,7 +210,10 @@ again:
 			goto err;
 		}
 
-		ret = btrfs_add_dead_root(dead_root, latest);
+		if (objectid == BTRFS_TREE_RELOC_OBJECTID)
+			ret = btrfs_add_dead_reloc_root(dead_root);
+		else
+			ret = btrfs_add_dead_root(dead_root, latest);
 		if (ret)
 			goto err;
 		goto again;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 8c83cf464c83..444abe0796ae 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -477,6 +477,7 @@ static noinline int add_dirty_roots(struct btrfs_trans_handle *trans,
 			dirty = root->dirty_root;
 
 			btrfs_free_log(trans, root);
+			btrfs_free_reloc_root(root);
 
 			if (root->commit_root == root->node) {
 				WARN_ON(root->node->start !=
@@ -855,6 +856,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	 * with the tree-log code.
 	 */
 	mutex_lock(&root->fs_info->tree_log_mutex);
+	/*
+	 * keep tree reloc code from adding new reloc trees
+	 */
+	mutex_lock(&root->fs_info->tree_reloc_mutex);
+
 
 	ret = add_dirty_roots(trans, &root->fs_info->fs_roots_radix,
 			      &dirty_fs_roots);
@@ -865,6 +871,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	 */
 	btrfs_free_log_root_tree(trans, root->fs_info);
 
+	btrfs_free_reloc_mappings(root);
+
 	ret = btrfs_commit_tree_roots(trans, root);
 	BUG_ON(ret);
 
@@ -910,10 +918,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	mutex_unlock(&root->fs_info->tree_log_mutex);
 
 	btrfs_finish_extent_commit(trans, root, pinned_copy);
-	mutex_lock(&root->fs_info->trans_mutex);
-
 	kfree(pinned_copy);
 
+	btrfs_drop_dead_reloc_roots(root);
+	mutex_unlock(&root->fs_info->tree_reloc_mutex);
+
+	mutex_lock(&root->fs_info->trans_mutex);
+
 	cur_trans->commit_done = 1;
 	root->fs_info->last_trans_committed = cur_trans->transid;
 	wake_up(&cur_trans->commit_wait);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index ddf89626498a..51f113119b25 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1268,7 +1268,7 @@ int btrfs_relocate_chunk(struct btrfs_root *root,
 	em_tree = &root->fs_info->mapping_tree.map_tree;
 
 	/* step one, relocate all the extents inside this chunk */
-	ret = btrfs_shrink_extent_tree(extent_root, chunk_offset);
+	ret = btrfs_relocate_block_group(extent_root, chunk_offset);
 	BUG_ON(ret);
 
 	trans = btrfs_start_transaction(root, 1);
@@ -1308,15 +1308,18 @@ int btrfs_relocate_chunk(struct btrfs_root *root,
 		BUG_ON(ret);
 	}
 
+	ret = btrfs_remove_block_group(trans, extent_root, chunk_offset);
+	BUG_ON(ret);
+
 	spin_lock(&em_tree->lock);
 	remove_extent_mapping(em_tree, em);
+	spin_unlock(&em_tree->lock);
+
 	kfree(map);
 	em->bdev = NULL;
 
 	/* once for the tree */
 	free_extent_map(em);
-	spin_unlock(&em_tree->lock);
-
 	/* once for us */
 	free_extent_map(em);
 
-- 
cgit v1.2.3


From 8c8bee1d7ca47fc75b6bd24a8085c525a2394c02 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 29 Sep 2008 11:19:10 -0400
Subject: Btrfs: Wait for IO on the block device inodes of newly added devices

btrfs-vol -a /dev/xxx will zero the first and last two MB of the device.
The kernel code needs to wait for this IO to finish before it adds
the device.

btrfs metadata IO does not happen through the block device inode.  A
separate address space is used, allowing the zero filled buffer heads in
the block device inode to be written to disk after FS metadata starts
going down to the disk via the btrfs metadata inode.

The end result is zero filled metadata blocks after adding new devices
into the filesystem.

The fix is a simple filemap_write_and_wait on the block device inode
before actually inserting it into the pool of available devices.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h   |  1 +
 fs/btrfs/disk-io.c | 11 ++++++++---
 fs/btrfs/inode.c   | 13 ++++++++++++-
 fs/btrfs/volumes.c |  1 +
 4 files changed, 22 insertions(+), 4 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 2775e270881e..0079b60b18f3 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -610,6 +610,7 @@ struct btrfs_fs_info {
 	struct list_head dead_roots;
 
 	atomic_t nr_async_submits;
+	atomic_t async_submit_draining;
 	atomic_t nr_async_bios;
 	atomic_t tree_log_writers;
 	atomic_t tree_log_commit;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 45bc3132b054..45b4f7285275 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -460,6 +460,13 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 	async->submit_bio_hook = submit_bio_hook;
 	async->work.func = run_one_async_submit;
 	async->work.flags = 0;
+
+	while(atomic_read(&fs_info->async_submit_draining) &&
+	      atomic_read(&fs_info->nr_async_submits)) {
+		wait_event(fs_info->async_submit_wait,
+			   (atomic_read(&fs_info->nr_async_submits) == 0));
+	}
+
 	atomic_inc(&fs_info->nr_async_submits);
 	btrfs_queue_worker(&fs_info->workers, &async->work);
 
@@ -495,11 +502,8 @@ static int __btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 				 int mirror_num)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	u64 offset;
 	int ret;
 
-	offset = bio->bi_sector << 9;
-
 	/*
 	 * when we're called for a write, we're already in the async
 	 * submission context.  Just jump into btrfs_map_bio
@@ -1360,6 +1364,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	INIT_LIST_HEAD(&fs_info->space_info);
 	btrfs_mapping_init(&fs_info->mapping_tree);
 	atomic_set(&fs_info->nr_async_submits, 0);
+	atomic_set(&fs_info->async_submit_draining, 0);
 	atomic_set(&fs_info->nr_async_bios, 0);
 	atomic_set(&fs_info->throttles, 0);
 	atomic_set(&fs_info->throttle_gen, 0);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 4516fbf01671..404704d26822 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3440,13 +3440,24 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root)
 			list_del_init(&binode->delalloc_inodes);
 		spin_unlock_irqrestore(&root->fs_info->delalloc_lock, flags);
 		if (inode) {
-			filemap_write_and_wait(inode->i_mapping);
+			filemap_flush(inode->i_mapping);
 			iput(inode);
 		}
 		cond_resched();
 		spin_lock_irqsave(&root->fs_info->delalloc_lock, flags);
 	}
 	spin_unlock_irqrestore(&root->fs_info->delalloc_lock, flags);
+
+	/* the filemap_flush will queue IO into the worker threads, but
+	 * we have to make sure the IO is actually started and that
+	 * ordered extents get created before we return
+	 */
+	atomic_inc(&root->fs_info->async_submit_draining);
+	while(atomic_read(&root->fs_info->nr_async_submits)) {
+		wait_event(root->fs_info->async_submit_wait,
+		   (atomic_read(&root->fs_info->nr_async_submits) == 0));
+	}
+	atomic_dec(&root->fs_info->async_submit_draining);
 	return 0;
 }
 
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 51f113119b25..f63cf7621a01 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1038,6 +1038,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 		return -EIO;
 	}
 
+	filemap_write_and_wait(bdev->bd_inode->i_mapping);
 	mutex_lock(&root->fs_info->volume_mutex);
 
 	trans = btrfs_start_transaction(root, 1);
-- 
cgit v1.2.3


From a62b940160d8125016e85046e68ae621c99e751f Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 3 Oct 2008 16:31:08 -0400
Subject: Btrfs: cast bio->bi_sector to a u64 before shifting

On 32 bit machines without CONFIG_LBD, the bi_sector field is only 32 bits.
Btrfs needs to cast it before shifting up, or we end up doing IO into
the wrong place.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c   | 2 +-
 fs/btrfs/volumes.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index e5c9261dcbaa..ff0c35976657 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -386,7 +386,7 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
 {
 	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
 	struct btrfs_mapping_tree *map_tree;
-	u64 logical = bio->bi_sector << 9;
+	u64 logical = (u64)bio->bi_sector << 9;
 	u64 length = 0;
 	u64 map_length;
 	int ret;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f63cf7621a01..2eed7f91f51a 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2187,7 +2187,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 	struct btrfs_mapping_tree *map_tree;
 	struct btrfs_device *dev;
 	struct bio *first_bio = bio;
-	u64 logical = bio->bi_sector << 9;
+	u64 logical = (u64)bio->bi_sector << 9;
 	u64 length = 0;
 	u64 map_length;
 	struct btrfs_multi_bio *multi = NULL;
-- 
cgit v1.2.3


From c8b978188c9a0fd3d535c13debd19d522b726f1f Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 29 Oct 2008 14:49:59 -0400
Subject: Btrfs: Add zlib compression support

This is a large change for adding compression on reading and writing,
both for inline and regular extents.  It does some fairly large
surgery to the writeback paths.

Compression is off by default and enabled by mount -o compress.  Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.

If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts later.

* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler.  This allows the delalloc handler to do complex things
such as cleaning the pages, marking them writeback and starting IO on their
behalf.

* Inline extents are inserted at delalloc time now.  This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.

* All of the in-memory extent representations (extent_map.c, ordered-data.c etc)
are changed to record both an in-memory size and an on disk size, as well
as a flag for compression.

From a disk format point of view, the extent pointers in the file are changed
to record the on disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field.  Neither the encryption nor the
'other' field is currently used.

In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k.  This is a
software only limit, the disk format supports u64 sized compressed extents.

In order to limit the ram consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k.  This is a software only limit
and will be subject to tuning later.

Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data.  This way additional encodings can be
layered on without having to figure out which encoding to checksum.

Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread.  This makes it tricky to
spread the compression load across all the cpus on the box.  We'll have to
look at parallel pdflush walks of dirty inodes at a later time.

Decompression is hooked into readpages and it does spread across CPUs nicely.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/Kconfig              |   2 +
 fs/btrfs/Makefile       |   3 +-
 fs/btrfs/compression.c  | 454 ++++++++++++++++++++++++++++++++++
 fs/btrfs/compression.h  |  47 ++++
 fs/btrfs/ctree.h        |  99 ++++++--
 fs/btrfs/disk-io.c      |  18 +-
 fs/btrfs/disk-io.h      |   1 +
 fs/btrfs/extent-tree.c  |  27 +-
 fs/btrfs/extent_io.c    | 411 ++++++++++++++++++++++++-------
 fs/btrfs/extent_io.h    |  17 +-
 fs/btrfs/extent_map.c   |   9 +
 fs/btrfs/extent_map.h   |   6 +-
 fs/btrfs/file-item.c    |  75 +++++-
 fs/btrfs/file.c         | 263 +++++---------------
 fs/btrfs/inode.c        | 584 ++++++++++++++++++++++++++++++++++++++++----
 fs/btrfs/ordered-data.c |   9 +-
 fs/btrfs/ordered-data.h |  10 +-
 fs/btrfs/print-tree.c   |   7 +-
 fs/btrfs/super.c        |  10 +-
 fs/btrfs/tree-log.c     |   3 +-
 fs/btrfs/volumes.c      |   2 +
 fs/btrfs/zlib.c         | 637 ++++++++++++++++++++++++++++++++++++++++++++++++
 22 files changed, 2315 insertions(+), 379 deletions(-)
 create mode 100644 fs/btrfs/compression.c
 create mode 100644 fs/btrfs/compression.h
 create mode 100644 fs/btrfs/zlib.c

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/Kconfig b/fs/Kconfig
index 18f5a85b47c6..31cce5d88b1a 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -501,6 +501,8 @@ config BTRFS_FS
 	tristate "Btrfs filesystem (EXPERIMENTAL) Unstable disk format"
 	depends on EXPERIMENTAL
 	select LIBCRC32C
+	select ZLIB_INFLATE
+	select ZLIB_DEFLATE
 	help
 	  Btrfs is a new filesystem with extents, writable snapshotting,
 	  support for multiple devices and many more features.
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 7125716e142b..d2cf5a54a4b8 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -7,7 +7,8 @@ btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   transaction.o inode.o file.o tree-defrag.o \
 	   extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
 	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
-	   ref-cache.o export.o tree-log.o acl.o free-space-cache.o
+	   ref-cache.o export.o tree-log.o acl.o free-space-cache.o zlib.o \
+	   compression.o
 else
 
 # Normal Makefile
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
new file mode 100644
index 000000000000..c5470367ca5c
--- /dev/null
+++ b/fs/btrfs/compression.c
@@ -0,0 +1,454 @@
+/*
+ * Copyright (C) 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include <linux/kernel.h>
+#include <linux/bio.h>
+#include <linux/buffer_head.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+#include <linux/time.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/smp_lock.h>
+#include <linux/backing-dev.h>
+#include <linux/mpage.h>
+#include <linux/swap.h>
+#include <linux/writeback.h>
+#include <linux/bit_spinlock.h>
+#include <linux/version.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "btrfs_inode.h"
+#include "volumes.h"
+#include "ordered-data.h"
+#include "compat.h"
+#include "compression.h"
+#include "extent_io.h"
+#include "extent_map.h"
+
+struct compressed_bio {
+	/* bios still in flight for this extent, the last one does cleanup */
+	atomic_t pending_bios;
+
+	/* the pages with the compressed data on them */
+	struct page **compressed_pages;
+
+	/* inode that owns this data */
+	struct inode *inode;
+
+	/* starting offset in the inode for our pages */
+	u64 start;
+
+	/* number of bytes in the inode we're working on */
+	unsigned long len;
+
+	/* number of bytes on disk */
+	unsigned long compressed_len;
+
+	/* number of compressed pages in the array */
+	unsigned long nr_pages;
+
+	/* set to 1 when a bio fails or, for reads, decompression fails */
+	int errors;
+
+	/* for reads, the bio we decompress into; NULL for writes */
+	struct bio *orig_bio;
+};
+
+/* allocate a bio for bdev starting at disk byte first_byte; NULL on failure */
+static struct bio *compressed_bio_alloc(struct block_device *bdev,
+					u64 first_byte, gfp_t gfp_flags)
+{
+	int nr_vecs = bio_get_nr_vecs(bdev);
+	struct bio *bio = bio_alloc(gfp_flags, nr_vecs);
+
+	/* under PF_MEMALLOC, retry with half as many vecs each time */
+	if (!bio && (current->flags & PF_MEMALLOC)) {
+		while (!bio && (nr_vecs /= 2))
+			bio = bio_alloc(gfp_flags, nr_vecs);
+	}
+	if (!bio)
+		return NULL;
+
+	/* start empty, aimed at the 512 byte sector holding first_byte */
+	bio->bi_size = 0;
+	bio->bi_bdev = bdev;
+	bio->bi_sector = first_byte >> 9;
+	return bio;
+}
+
+/*
+ * end_io for every bio of a compressed read.  All but the last bio for
+ * the extent only record errors and drop the bio; the last one
+ * decompresses into the pages of the original bio and then runs the
+ * end_io routines of that bio on the decompressed data.
+ *
+ * This allows the checksumming and other IO error handling routines
+ * to work normally.  The compressed pages are freed here, and it must
+ * be run in process context
+ */
+static void end_compressed_bio_read(struct bio *bio, int err)
+{
+	struct compressed_bio *cb = bio->bi_private;
+	struct bio *orig_bio;
+	unsigned long i;
+	int ret;
+
+	if (err)
+		cb->errors = 1;
+
+	/*
+	 * other bios are still pending for this compressed extent,
+	 * the one that finishes last does all of the work below
+	 */
+	if (!atomic_dec_and_test(&cb->pending_bios)) {
+		bio_put(bio);
+		return;
+	}
+
+	/*
+	 * we're the last bio for this extent, decompress straight
+	 * into the pages of the original bio
+	 */
+	orig_bio = cb->orig_bio;
+	ret = btrfs_zlib_decompress_biovec(cb->compressed_pages, cb->start,
+					   orig_bio->bi_io_vec,
+					   orig_bio->bi_vcnt,
+					   cb->compressed_len);
+	if (ret)
+		cb->errors = 1;
+
+	/* clear ->mapping on each compressed page and release it */
+	for (i = 0; i < cb->nr_pages; i++) {
+		struct page *page = cb->compressed_pages[i];
+
+		page->mapping = NULL;
+		page_cache_release(page);
+	}
+
+	/* do io completion on the original bio */
+	if (cb->errors)
+		bio_io_error(orig_bio);
+	else
+		bio_endio(orig_bio, 0);
+
+	/* finally free the cb struct and its page array */
+	kfree(cb->compressed_pages);
+	kfree(cb);
+
+	bio_put(bio);
+}
+
+/* Clear the writeback bits on all of the file pages for a compressed
+ * write.  When no pages are found at an index we step past it.
+ */
+static noinline int end_compressed_writeback(struct inode *inode, u64 start,
+					     unsigned long ram_size)
+{
+	unsigned long index = start >> PAGE_CACHE_SHIFT;
+	unsigned long end_index = (start + ram_size - 1) >> PAGE_CACHE_SHIFT;
+	struct page *pages[16];
+	unsigned long nr_pages = end_index - index + 1;
+	int i;
+	int ret;
+
+	while (nr_pages > 0) {
+		/* min_t: nr_pages and ARRAY_SIZE() differ in type on 32 bit */
+		ret = find_get_pages_contig(inode->i_mapping, index,
+				     min_t(unsigned long, nr_pages,
+					   ARRAY_SIZE(pages)), pages);
+		if (ret == 0) {
+			nr_pages -= 1;
+			index += 1;
+			continue;
+		}
+		for (i = 0; i < ret; i++) {
+			end_page_writeback(pages[i]);
+			page_cache_release(pages[i]);
+		}
+		nr_pages -= ret;
+		index += ret;
+	}
+	return 0;	/* the inode may be gone now */
+}
+
+/*
+ * end_io for every bio of a compressed write.  Once the last bio for
+ * the extent has completed, this calls the writeback end hook for the
+ * file range so that metadata and checksums can be updated, clears
+ * writeback on the file pages and frees the compressed pages.
+ *
+ * NOTE(review): errors are recorded in cb->errors, the hook still gets 1
+ */
+static void end_compressed_bio_write(struct bio *bio, int err)
+{
+	struct compressed_bio *cb = bio->bi_private;
+	struct extent_io_tree *io_tree;
+	struct inode *inode;
+	unsigned long i;
+
+	if (err)
+		cb->errors = 1;
+
+	/* more bios are pending for this extent, the last one cleans up */
+	if (!atomic_dec_and_test(&cb->pending_bios)) {
+		bio_put(bio);
+		return;
+	}
+
+	/*
+	 * we were the last bio for this extent, so start by calling
+	 * back into the FS to do the end_io work for the file range
+	 */
+	inode = cb->inode;
+	io_tree = &BTRFS_I(inode)->io_tree;
+	io_tree->ops->writepage_end_io_hook(cb->compressed_pages[0],
+					    cb->start,
+					    cb->start + cb->len - 1,
+					    NULL, 1);
+
+	/* note, our inode could be gone once this returns */
+	end_compressed_writeback(inode, cb->start, cb->len);
+
+	/*
+	 * the compressed pages came from alloc_page and are not attached
+	 * to the inode at all, release each of them
+	 */
+	for (i = 0; i < cb->nr_pages; i++) {
+		struct page *page = cb->compressed_pages[i];
+
+		page->mapping = NULL;
+		page_cache_release(page);
+	}
+
+	/* finally free the cb struct and its page array */
+	kfree(cb->compressed_pages);
+	kfree(cb);
+
+	bio_put(bio);
+}
+
+/*
+ * build and submit bios for previously compressed pages.  start/len is the
+ * range in the file, disk_start/compressed_len the range on disk, and
+ * nr_pages the size of the compressed_pages array.  The file pages should
+ * be marked for writeback and the compressed pages should hold a reference
+ * for dropping when the IO is complete.
+ * The file bytes are checksummed before any bio is sent.  Always returns 0;
+ * NOTE(review): the cb and bio allocations are used without a NULL check.
+ */
+int btrfs_submit_compressed_write(struct inode *inode, u64 start,
+				 unsigned long len, u64 disk_start,
+				 unsigned long compressed_len,
+				 struct page **compressed_pages,
+				 unsigned long nr_pages)
+{
+	struct bio *bio = NULL;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct compressed_bio *cb;
+	unsigned long bytes_left;
+	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+	int page_index = 0;
+	struct page *page;
+	u64 first_byte = disk_start;
+	struct block_device *bdev;
+	int ret;
+
+	WARN_ON(start & ((u64)PAGE_CACHE_SIZE - 1));
+	cb = kmalloc(sizeof(*cb), GFP_NOFS);
+	atomic_set(&cb->pending_bios, 0);
+	cb->errors = 0;
+	cb->inode = inode;
+	cb->start = start;
+	cb->len = len;
+	cb->compressed_pages = compressed_pages;
+	cb->compressed_len = compressed_len;
+	cb->orig_bio = NULL;
+	cb->nr_pages = nr_pages;
+
+	bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
+
+	ret = btrfs_csum_file_bytes(root, inode, start, len);
+	BUG_ON(ret);
+
+	bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS);
+	bio->bi_private = cb;
+	bio->bi_end_io = end_compressed_bio_write;
+	atomic_inc(&cb->pending_bios);
+	/* create and submit bios for the compressed pages, a page at a time.
+	 * NOTE(review): bytes_left wraps if compressed_len isn't page aligned */
+	bytes_left = compressed_len;
+	while(bytes_left > 0) {
+		page = compressed_pages[page_index];
+		page->mapping = inode->i_mapping;
+		if (bio->bi_size)
+			ret = io_tree->ops->merge_bio_hook(page, 0,
+							   PAGE_CACHE_SIZE,
+							   bio, 0);
+		else
+			ret = 0;
+		/* hook returned nonzero or page didn't fit: submit */
+		if (ret || bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) <
+		    PAGE_CACHE_SIZE) {
+			bio_get(bio);
+
+			ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+			BUG_ON(ret);
+
+			ret = btrfs_map_bio(root, WRITE, bio, 0, 1);
+			BUG_ON(ret);
+
+			bio_put(bio);
+			/* start a new bio for this page at its disk offset */
+			bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS);
+			atomic_inc(&cb->pending_bios);
+			bio->bi_private = cb;
+			bio->bi_end_io = end_compressed_bio_write;
+			bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
+		}
+		page_index++;
+		bytes_left -= PAGE_CACHE_SIZE;
+		first_byte += PAGE_CACHE_SIZE;
+	}
+	bio_get(bio);
+
+	ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+	BUG_ON(ret);
+
+	ret = btrfs_map_bio(root, WRITE, bio, 0, 1);
+	BUG_ON(ret);
+
+	bio_put(bio);
+	return 0;
+}
+
+/*
+ * for a compressed read, the bio we get passed has all the inode pages
+ * in it.  We don't actually do IO on those pages but allocate new ones
+ * to hold the compressed pages on disk.
+ *
+ * bio->bi_sector points to the compressed extent on disk
+ * bio->bi_io_vec points to all of the inode pages
+ * bio->bi_vcnt is a count of pages
+ *
+ * After the compressed pages are read, end_compressed_bio_read decompresses
+ * them into the bio we were passed and completes it.  Always returns 0.
+ */
+int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
+				 int mirror_num, unsigned long bio_flags)
+{
+	struct extent_io_tree *tree;
+	struct extent_map_tree *em_tree;
+	struct compressed_bio *cb;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	unsigned long uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE;
+	unsigned long compressed_len;
+	unsigned long nr_pages;
+	unsigned long page_index;
+	struct page *page;
+	struct block_device *bdev;
+	struct bio *comp_bio;
+	u64 cur_disk_byte = (u64)bio->bi_sector << 9;
+	struct extent_map *em;
+	int ret;
+
+	tree = &BTRFS_I(inode)->io_tree;
+	em_tree = &BTRFS_I(inode)->extent_tree;
+
+	/* we need the actual starting offset of this extent in the file */
+	spin_lock(&em_tree->lock);
+	em = lookup_extent_mapping(em_tree,
+				   page_offset(bio->bi_io_vec->bv_page),
+				   PAGE_CACHE_SIZE);
+	spin_unlock(&em_tree->lock);
+
+	cb = kmalloc(sizeof(*cb), GFP_NOFS);
+	atomic_set(&cb->pending_bios, 0);
+	cb->errors = 0;
+	cb->inode = inode;
+
+	cb->start = em->start;
+	compressed_len = em->block_len;
+	free_extent_map(em);
+
+	cb->len = uncompressed_len;
+	cb->compressed_len = compressed_len;
+	cb->orig_bio = bio;
+
+	nr_pages = (compressed_len + PAGE_CACHE_SIZE - 1) /
+				 PAGE_CACHE_SIZE;
+	cb->compressed_pages = kmalloc(sizeof(struct page *) * nr_pages,
+				       GFP_NOFS);
+	bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
+
+	for (page_index = 0; page_index < nr_pages; page_index++) {
+		cb->compressed_pages[page_index] = alloc_page(GFP_NOFS |
+							      __GFP_HIGHMEM);
+	}
+	cb->nr_pages = nr_pages;
+
+	comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS);
+	comp_bio->bi_private = cb;
+	comp_bio->bi_end_io = end_compressed_bio_read;
+	atomic_inc(&cb->pending_bios);
+
+	for (page_index = 0; page_index < nr_pages; page_index++) {
+		page = cb->compressed_pages[page_index];
+		page->mapping = inode->i_mapping;
+		if (comp_bio->bi_size)
+			ret = tree->ops->merge_bio_hook(page, 0,
+							PAGE_CACHE_SIZE,
+							comp_bio, 0);
+		else
+			ret = 0;
+
+		if (ret || bio_add_page(comp_bio, page, PAGE_CACHE_SIZE, 0) <
+		    PAGE_CACHE_SIZE) {
+			bio_get(comp_bio);
+
+			ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
+			BUG_ON(ret);
+
+			ret = btrfs_map_bio(root, READ, comp_bio, 0, 0);
+			BUG_ON(ret);
+
+			bio_put(comp_bio);
+			/* new compressed bio for this page, not the caller's */
+			comp_bio = compressed_bio_alloc(bdev, cur_disk_byte,
+							GFP_NOFS);
+			atomic_inc(&cb->pending_bios);
+			comp_bio->bi_private = cb;
+			comp_bio->bi_end_io = end_compressed_bio_read;
+			bio_add_page(comp_bio, page, PAGE_CACHE_SIZE, 0);
+		}
+		cur_disk_byte += PAGE_CACHE_SIZE;
+	}
+	bio_get(comp_bio);
+
+	ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
+	BUG_ON(ret);
+
+	ret = btrfs_map_bio(root, READ, comp_bio, 0, 0);
+	BUG_ON(ret);
+
+	bio_put(comp_bio);
+	return 0;
+}
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
new file mode 100644
index 000000000000..421f5b4aa715
--- /dev/null
+++ b/fs/btrfs/compression.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (C) 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#ifndef __BTRFS_COMPRESSION_
+#define __BTRFS_COMPRESSION_
+
+int btrfs_zlib_decompress(unsigned char *data_in,
+			  struct page *dest_page,
+			  unsigned long start_byte,
+			  size_t srclen, size_t destlen);
+int btrfs_zlib_compress_pages(struct address_space *mapping,
+			      u64 start, unsigned long len,
+			      struct page **pages,
+			      unsigned long nr_dest_pages,
+			      unsigned long *out_pages,
+			      unsigned long *total_in,
+			      unsigned long *total_out,
+			      unsigned long max_out);
+int btrfs_zlib_decompress_biovec(struct page **pages_in,
+			      u64 disk_start,
+			      struct bio_vec *bvec,
+			      int vcnt,
+			      size_t srclen);
+void btrfs_zlib_exit(void);
+int btrfs_submit_compressed_write(struct inode *inode, u64 start,
+				  unsigned long len, u64 disk_start,
+				  unsigned long compressed_len,
+				  struct page **compressed_pages,
+				  unsigned long nr_pages);
+int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
+				 int mirror_num, unsigned long bio_flags);
+#endif
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 8559f39fd47f..793d8fdda244 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -400,10 +400,18 @@ struct btrfs_timespec {
 	__le32 nsec;
 } __attribute__ ((__packed__));
 
-/*
- * there is no padding here on purpose.  If you want to extent the inode,
- * make a new item type
- */
+typedef enum {
+	BTRFS_COMPRESS_NONE = 0,
+	BTRFS_COMPRESS_ZLIB = 1,
+	BTRFS_COMPRESS_LAST = 2,	/* presumably a bound, not a method */
+} btrfs_compression_type;
+
+/* we don't understand any encryption methods right now */
+typedef enum {
+	BTRFS_ENCRYPTION_NONE = 0,
+	BTRFS_ENCRYPTION_LAST = 1,	/* presumably a bound, not a method */
+} btrfs_encryption_type;
+
 struct btrfs_inode_item {
 	/* nfs style generation number */
 	__le64 generation;
@@ -419,6 +427,7 @@ struct btrfs_inode_item {
 	__le64 rdev;
 	__le16 flags;
 	__le16 compat_flags;
+
 	struct btrfs_timespec atime;
 	struct btrfs_timespec ctime;
 	struct btrfs_timespec mtime;
@@ -454,8 +463,33 @@ struct btrfs_root_item {
 #define BTRFS_FILE_EXTENT_INLINE 1
 
 struct btrfs_file_extent_item {
+	/*
+	 * transaction id that created this extent
+	 */
 	__le64 generation;
+	/*
+	 * max number of bytes to hold this extent in ram
+	 * when we split a compressed extent we can't know how big
+	 * each of the resulting pieces will be.  So, this is
+	 * an upper limit on the size of the extent in ram instead of
+	 * an exact limit.
+	 */
+	__le64 ram_bytes;
+
+	/*
+	 * 32 bits for the various ways we might encode the data,
+	 * including compression and encryption.  If any of these
+	 * are set to something a given disk format doesn't understand
+	 * it is treated like an incompat flag for reading and writing,
+	 * but not for stat.
+	 */
+	u8 compression;
+	u8 encryption;
+	__le16 other_encoding; /* spare for later use */
+
+	/* are we inline data or a real extent? */
 	u8 type;
+
 	/*
 	 * disk space consumed by the extent, checksum blocks are included
 	 * in these numbers
@@ -471,9 +505,11 @@ struct btrfs_file_extent_item {
 	 */
 	__le64 offset;
 	/*
-	 * the logical number of file blocks (no csums included)
+	 * the logical number of file blocks (no csums included).  This
+	 * always reflects the size uncompressed and without encoding.
 	 */
 	__le64 num_bytes;
+
 } __attribute__ ((__packed__));
 
 struct btrfs_csum_item {
@@ -814,6 +850,7 @@ struct btrfs_root {
 #define BTRFS_MOUNT_NOBARRIER		(1 << 2)
 #define BTRFS_MOUNT_SSD			(1 << 3)
 #define BTRFS_MOUNT_DEGRADED		(1 << 4)
+#define BTRFS_MOUNT_COMPRESS		(1 << 5)
 
 #define btrfs_clear_opt(o, opt)		((o) &= ~BTRFS_MOUNT_##opt)
 #define btrfs_set_opt(o, opt)		((o) |= BTRFS_MOUNT_##opt)
@@ -825,6 +862,7 @@ struct btrfs_root {
 #define BTRFS_INODE_NODATASUM		(1 << 0)
 #define BTRFS_INODE_NODATACOW		(1 << 1)
 #define BTRFS_INODE_READONLY		(1 << 2)
+#define BTRFS_INODE_NOCOMPRESS		(1 << 3)
 #define btrfs_clear_flag(inode, flag)	(BTRFS_I(inode)->flags &= \
 					 ~BTRFS_INODE_##flag)
 #define btrfs_set_flag(inode, flag)	(BTRFS_I(inode)->flags |= \
@@ -1424,14 +1462,6 @@ static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize)
 	return offsetof(struct btrfs_file_extent_item, disk_bytenr) + datasize;
 }
 
-static inline u32 btrfs_file_extent_inline_len(struct extent_buffer *eb,
-					       struct btrfs_item *e)
-{
-	unsigned long offset;
-	offset = offsetof(struct btrfs_file_extent_item, disk_bytenr);
-	return btrfs_item_size(eb, e) - offset;
-}
-
 BTRFS_SETGET_FUNCS(file_extent_disk_bytenr, struct btrfs_file_extent_item,
 		   disk_bytenr, 64);
 BTRFS_SETGET_FUNCS(file_extent_generation, struct btrfs_file_extent_item,
@@ -1442,6 +1472,36 @@ BTRFS_SETGET_FUNCS(file_extent_offset, struct btrfs_file_extent_item,
 		  offset, 64);
 BTRFS_SETGET_FUNCS(file_extent_num_bytes, struct btrfs_file_extent_item,
 		   num_bytes, 64);
+BTRFS_SETGET_FUNCS(file_extent_ram_bytes, struct btrfs_file_extent_item,
+		   ram_bytes, 64);
+BTRFS_SETGET_FUNCS(file_extent_compression, struct btrfs_file_extent_item,
+		   compression, 8);
+BTRFS_SETGET_FUNCS(file_extent_encryption, struct btrfs_file_extent_item,
+		   encryption, 8);
+BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item,
+		   other_encoding, 16);
+
+/* this returns the number of file bytes represented by the inline item,
+ * as recorded in ram_bytes.  If compressed, this is the uncompressed size
+ */
+static inline u32 btrfs_file_extent_inline_len(struct extent_buffer *eb,
+					       struct btrfs_file_extent_item *e)
+{
+	return btrfs_file_extent_ram_bytes(eb, e);
+}
+
+/*
+ * this returns the number of inline data bytes stored in the item: the
+ * item size minus the file extent header fields that precede disk_bytenr.
+ * If the data is compressed on disk, this is the compressed size
+ */
+static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb,
+						    struct btrfs_item *e)
+{
+	unsigned long offset;
+	offset = offsetof(struct btrfs_file_extent_item, disk_bytenr);
+	return btrfs_item_size(eb, e) - offset;
+}
 
 static inline struct btrfs_root *btrfs_sb(struct super_block *sb)
 {
@@ -1745,10 +1805,11 @@ int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
 int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
 			  struct bio *bio);
 int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
-			       struct btrfs_root *root,
-			       u64 objectid, u64 pos, u64 disk_offset,
-			       u64 disk_num_bytes,
-			     u64 num_bytes, u64 offset);
+			     struct btrfs_root *root,
+			     u64 objectid, u64 pos,
+			     u64 disk_offset, u64 disk_num_bytes,
+			     u64 num_bytes, u64 offset, u64 ram_bytes,
+			     u8 compression, u8 encryption, u16 other_encoding);
 int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root,
 			     struct btrfs_path *path, u64 objectid,
@@ -1758,6 +1819,8 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
 			   struct btrfs_ordered_sum *sums);
 int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
 		       struct bio *bio);
+int btrfs_csum_file_bytes(struct btrfs_root *root, struct inode *inode,
+			  u64 start, unsigned long len);
 struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
 					  struct btrfs_root *root,
 					  struct btrfs_path *path,
@@ -1799,7 +1862,7 @@ void btrfs_invalidate_dcache_root(struct btrfs_root *root, char *name,
 				  int namelen);
 
 int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
-			 size_t size, struct bio *bio);
+			 size_t size, struct bio *bio, unsigned long bio_flags);
 
 unsigned long btrfs_force_ra(struct address_space *mapping,
 			      struct file_ra_state *ra, struct file *file,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 0be044bb6194..dc95f636a11b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -83,6 +83,7 @@ struct async_submit_bio {
 	extent_submit_bio_hook_t *submit_bio_hook;
 	int rw;
 	int mirror_num;
+	unsigned long bio_flags;
 	struct btrfs_work work;
 };
 
@@ -115,6 +116,7 @@ struct extent_map *btree_get_extent(struct inode *inode, struct page *page,
 	}
 	em->start = 0;
 	em->len = (u64)-1;
+	em->block_len = (u64)-1;
 	em->block_start = 0;
 	em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
 
@@ -469,12 +471,13 @@ static void run_one_async_submit(struct btrfs_work *work)
 		wake_up(&fs_info->async_submit_wait);
 
 	async->submit_bio_hook(async->inode, async->rw, async->bio,
-			       async->mirror_num);
+			       async->mirror_num, async->bio_flags);
 	kfree(async);
 }
 
 int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 			int rw, struct bio *bio, int mirror_num,
+			unsigned long bio_flags,
 			extent_submit_bio_hook_t *submit_bio_hook)
 {
 	struct async_submit_bio *async;
@@ -491,6 +494,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 	async->submit_bio_hook = submit_bio_hook;
 	async->work.func = run_one_async_submit;
 	async->work.flags = 0;
+	async->bio_flags = bio_flags;
 
 	while(atomic_read(&fs_info->async_submit_draining) &&
 	      atomic_read(&fs_info->nr_async_submits)) {
@@ -530,7 +534,7 @@ static int btree_csum_one_bio(struct bio *bio)
 }
 
 static int __btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
-				 int mirror_num)
+				 int mirror_num, unsigned long bio_flags)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	int ret;
@@ -556,17 +560,17 @@ static int __btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 }
 
 static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
-				 int mirror_num)
+				 int mirror_num, unsigned long bio_flags)
 {
 	/*
 	 * kthread helpers are used to submit writes so that checksumming
 	 * can happen in parallel across all CPUs
 	 */
 	if (!(rw & (1 << BIO_RW))) {
-		return __btree_submit_bio_hook(inode, rw, bio, mirror_num);
+		return __btree_submit_bio_hook(inode, rw, bio, mirror_num, 0);
 	}
 	return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
-				   inode, rw, bio, mirror_num,
+				   inode, rw, bio, mirror_num, 0,
 				   __btree_submit_bio_hook);
 }
 
@@ -1407,6 +1411,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	fs_info->btree_inode = new_inode(sb);
 	fs_info->btree_inode->i_ino = 1;
 	fs_info->btree_inode->i_nlink = 1;
+
 	fs_info->thread_pool_size = min(num_online_cpus() + 2, 8);
 
 	INIT_LIST_HEAD(&fs_info->ordered_extents);
@@ -1508,6 +1513,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	 */
 	btrfs_init_workers(&fs_info->workers, "worker",
 			   fs_info->thread_pool_size);
+
 	btrfs_init_workers(&fs_info->submit_workers, "submit",
 			   min_t(u64, fs_devices->num_devices,
 			   fs_info->thread_pool_size));
@@ -1559,6 +1565,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	}
 
 	fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
+	fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
+				    4 * 1024 * 1024 / PAGE_CACHE_SIZE);
 
 	nodesize = btrfs_super_nodesize(disk_super);
 	leafsize = btrfs_super_leafsize(disk_super);
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index f84f5058dbbb..4eb1f1408d21 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -71,6 +71,7 @@ int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
 			int metadata);
 int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 			int rw, struct bio *bio, int mirror_num,
+			unsigned long bio_flags,
 			extent_submit_bio_hook_t *submit_bio_hook);
 int btrfs_congested_async(struct btrfs_fs_info *info, int iodone);
 unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 280ac1aa9b6d..bbf04e80a1a3 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3278,6 +3278,7 @@ static int noinline relocate_data_extent(struct inode *reloc_inode,
 
 	em->start = extent_key->objectid - offset;
 	em->len = extent_key->offset;
+	em->block_len = extent_key->offset;
 	em->block_start = extent_key->objectid;
 	em->bdev = root->fs_info->fs_devices->latest_bdev;
 	set_bit(EXTENT_FLAG_PINNED, &em->flags);
@@ -3314,10 +3315,14 @@ struct btrfs_ref_path {
 };
 
 struct disk_extent {
+	u64 ram_bytes;
 	u64 disk_bytenr;
 	u64 disk_num_bytes;
 	u64 offset;
 	u64 num_bytes;
+	u8 compression;
+	u8 encryption;
+	u16 other_encoding;
 };
 
 static int is_cowonly_root(u64 root_objectid)
@@ -3631,6 +3636,11 @@ static int noinline get_new_locations(struct inode *reloc_inode,
 			btrfs_file_extent_disk_num_bytes(leaf, fi);
 		exts[nr].offset = btrfs_file_extent_offset(leaf, fi);
 		exts[nr].num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
+		exts[nr].ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
+		exts[nr].compression = btrfs_file_extent_compression(leaf, fi);
+		exts[nr].encryption = btrfs_file_extent_encryption(leaf, fi);
+		exts[nr].other_encoding = btrfs_file_extent_other_encoding(leaf,
+									   fi);
 		WARN_ON(exts[nr].offset > 0);
 		WARN_ON(exts[nr].num_bytes != exts[nr].disk_num_bytes);
 
@@ -3846,6 +3856,8 @@ next:
 						new_extents[0].disk_bytenr);
 			btrfs_set_file_extent_disk_num_bytes(leaf, fi,
 						new_extents[0].disk_num_bytes);
+			btrfs_set_file_extent_ram_bytes(leaf, fi,
+						new_extents[0].ram_bytes);
 			ext_offset += new_extents[0].offset;
 			btrfs_set_file_extent_offset(leaf, fi, ext_offset);
 			btrfs_mark_buffer_dirty(leaf);
@@ -3911,6 +3923,16 @@ next:
 						new_extents[i].disk_bytenr);
 				btrfs_set_file_extent_disk_num_bytes(leaf, fi,
 						new_extents[i].disk_num_bytes);
+				btrfs_set_file_extent_ram_bytes(leaf, fi,
+						new_extents[i].ram_bytes);
+
+				btrfs_set_file_extent_compression(leaf, fi,
+						new_extents[i].compression);
+				btrfs_set_file_extent_encryption(leaf, fi,
+						new_extents[i].encryption);
+				btrfs_set_file_extent_other_encoding(leaf, fi,
+						new_extents[i].other_encoding);
+
 				btrfs_set_file_extent_num_bytes(leaf, fi,
 							extent_len);
 				ext_offset += new_extents[i].offset;
@@ -4169,6 +4191,8 @@ static int noinline replace_extents_in_leaf(struct btrfs_trans_handle *trans,
 		ref->extents[ext_index].num_bytes = new_extent->disk_num_bytes;
 
 		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
+		btrfs_set_file_extent_ram_bytes(leaf, fi,
+						new_extent->ram_bytes);
 		btrfs_set_file_extent_disk_bytenr(leaf, fi,
 						new_extent->disk_bytenr);
 		btrfs_set_file_extent_disk_num_bytes(leaf, fi,
@@ -4847,7 +4871,8 @@ static struct inode noinline *create_reloc_inode(struct btrfs_fs_info *fs_info,
 	BUG_ON(err);
 
 	err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0,
-				       group->key.offset, 0);
+				       group->key.offset, 0, group->key.offset,
+				       0, 0, 0);
 	BUG_ON(err);
 
 	inode = btrfs_iget_locked(root->fs_info->sb, objectid, root);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 563b2d12f4f2..314041fdfa43 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -30,6 +30,7 @@ static struct kmem_cache *extent_buffer_cache;
 static LIST_HEAD(buffers);
 static LIST_HEAD(states);
 
+#define LEAK_DEBUG 1
 #ifdef LEAK_DEBUG
 static spinlock_t leak_lock = SPIN_LOCK_UNLOCKED;
 #endif
@@ -1067,8 +1068,8 @@ EXPORT_SYMBOL(find_first_extent_bit_state);
  *
  * 1 is returned if we find something, 0 if nothing was in the tree
  */
-static noinline u64 find_lock_delalloc_range(struct extent_io_tree *tree,
-					     u64 *start, u64 *end, u64 max_bytes)
+static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
+					u64 *start, u64 *end, u64 max_bytes)
 {
 	struct rb_node *node;
 	struct extent_state *state;
@@ -1077,11 +1078,11 @@ static noinline u64 find_lock_delalloc_range(struct extent_io_tree *tree,
 	u64 total_bytes = 0;
 
 	spin_lock_irq(&tree->lock);
+
 	/*
 	 * this search will find all the extents that end after
 	 * our range starts.
 	 */
-search_again:
 	node = tree_search(tree, cur_start);
 	if (!node) {
 		if (!found)
@@ -1100,40 +1101,6 @@ search_again:
 				*end = state->end;
 			goto out;
 		}
-		if (!found && !(state->state & EXTENT_BOUNDARY)) {
-			struct extent_state *prev_state;
-			struct rb_node *prev_node = node;
-			while(1) {
-				prev_node = rb_prev(prev_node);
-				if (!prev_node)
-					break;
-				prev_state = rb_entry(prev_node,
-						      struct extent_state,
-						      rb_node);
-				if ((prev_state->end + 1 != state->start) ||
-				    !(prev_state->state & EXTENT_DELALLOC))
-					break;
-				if ((cur_start - prev_state->start) * 2 >
-				     max_bytes)
-					break;
-				state = prev_state;
-				node = prev_node;
-			}
-		}
-		if (state->state & EXTENT_LOCKED) {
-			DEFINE_WAIT(wait);
-			atomic_inc(&state->refs);
-			prepare_to_wait(&state->wq, &wait,
-					TASK_UNINTERRUPTIBLE);
-			spin_unlock_irq(&tree->lock);
-			schedule();
-			spin_lock_irq(&tree->lock);
-			finish_wait(&state->wq, &wait);
-			free_extent_state(state);
-			goto search_again;
-		}
-		set_state_cb(tree, state, EXTENT_LOCKED);
-		state->state |= EXTENT_LOCKED;
 		if (!found)
 			*start = state->start;
 		found++;
@@ -1151,6 +1118,208 @@ out:
 	return found;
 }
 
+/* unlock every page of [start, end] except the caller's locked_page */
+static noinline int __unlock_for_delalloc(struct inode *inode,
+				struct page *locked_page, u64 start, u64 end)
+{
+	struct page *pages[16];
+	unsigned long index = start >> PAGE_CACHE_SHIFT;
+	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
+	unsigned long nr_pages = end_index - index + 1;
+	int ret, i;
+
+	if (index == locked_page->index && end_index == index)
+		return 0;
+
+	while (nr_pages > 0) {
+		ret = find_get_pages_contig(inode->i_mapping, index,
+				     min_t(unsigned long, nr_pages,
+				     ARRAY_SIZE(pages)), pages);
+		for (i = 0; i < ret; i++) {
+			if (pages[i] != locked_page)
+				unlock_page(pages[i]);
+			page_cache_release(pages[i]);
+		}
+		nr_pages -= ret;
+		index += ret;
+		cond_resched();
+	}
+	return 0;
+}
+
+/*
+ * lock every page of [delalloc_start, delalloc_end] except locked_page.
+ * returns 0 on success.  If a page is missing, the pages locked so far
+ * are unlocked again and -EAGAIN is returned.
+ */
+static noinline int lock_delalloc_pages(struct inode *inode,
+					struct page *locked_page,
+					u64 delalloc_start,
+					u64 delalloc_end)
+{
+	unsigned long index = delalloc_start >> PAGE_CACHE_SHIFT;
+	unsigned long start_index = index;
+	unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT;
+	unsigned long pages_locked = 0;
+	struct page *pages[16];
+	unsigned long nrpages;
+	int ret, i;
+
+	/* the caller is responsible for locking the start index */
+	if (index == locked_page->index && index == end_index)
+		return 0;
+
+	/* walk the range in batches of up to ARRAY_SIZE(pages) pages */
+	nrpages = end_index - index + 1;
+	while (nrpages > 0) {
+		ret = find_get_pages_contig(inode->i_mapping, index,
+				     min_t(unsigned long, nrpages,
+				     ARRAY_SIZE(pages)), pages);
+		if (ret == 0) {
+			ret = -EAGAIN;
+			goto done;
+		}
+		/* lock the batch; locked_page is left to the caller */
+		for (i = 0; i < ret; i++) {
+			if (pages[i] != locked_page)
+				lock_page(pages[i]);
+			page_cache_release(pages[i]);
+		}
+		pages_locked += ret;
+		nrpages -= ret;
+		index += ret;
+		cond_resched();
+	}
+	ret = 0;
+done:
+	if (ret && pages_locked) {
+		__unlock_for_delalloc(inode, locked_page, delalloc_start,
+			      ((u64)(start_index + pages_locked - 1)) <<
+			      PAGE_CACHE_SHIFT);
+	}
+	return ret;
+}
+
+/*
+ * find a contiguous range of delalloc bytes starting at *start (the search
+ * is limited by 'max_bytes'), lock its pages, then lock its extent state.
+ * start and end are used to return the range,
+ * non-zero is returned if a range was found and locked, 0 otherwise
+ */
+static noinline u64 find_lock_delalloc_range(struct inode *inode,
+					     struct extent_io_tree *tree,
+					     struct page *locked_page,
+					     u64 *start, u64 *end,
+					     u64 max_bytes)
+{
+	u64 delalloc_start;
+	u64 delalloc_end;
+	u64 found;
+	int ret;
+	int loops = 0;
+
+again:
+	/* step one, find a bunch of delalloc bytes starting at start */
+	delalloc_start = *start;
+	delalloc_end = 0;
+	found = find_delalloc_range(tree, &delalloc_start, &delalloc_end,
+				    max_bytes);
+	if (!found) {
+		*start = delalloc_start;
+		*end = delalloc_end;
+		return found;
+	}
+
+	/*
+	 * make sure to limit the number of pages we try to lock down
+	 * if we're looping.  delalloc_end is the last byte of the range
+	 * (inclusive), so a one page range ends PAGE_CACHE_SIZE - 1 later.
+	 */
+	if (delalloc_end + 1 - delalloc_start > max_bytes && loops) {
+		delalloc_end = delalloc_start + PAGE_CACHE_SIZE - 1;
+	}
+	/* step two, lock all the pages after the page that has start */
+	ret = lock_delalloc_pages(inode, locked_page,
+				  delalloc_start, delalloc_end);
+	if (ret == -EAGAIN) {
+		/* some of the pages are gone, let's avoid looping by
+		 * shortening the size of the delalloc range we're searching
+		 */
+		if (!loops) {
+			unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1);
+			max_bytes = PAGE_CACHE_SIZE - offset;
+			loops = 1;
+			goto again;
+		} else {
+			found = 0;
+			goto out_failed;
+		}
+	}
+	BUG_ON(ret);
+
+	/* step three, lock the state bits for the whole range */
+	lock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS);
+
+	/* then test to make sure it is all still delalloc */
+	ret = test_range_bit(tree, delalloc_start, delalloc_end,
+			     EXTENT_DELALLOC, 1);
+	if (!ret) {
+		unlock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS);
+		__unlock_for_delalloc(inode, locked_page,
+			      delalloc_start, delalloc_end);
+		cond_resched();
+		goto again;
+	}
+	*start = delalloc_start;
+	*end = delalloc_end;
+out_failed:
+	return found;
+}
+
+/* clear the lock/delalloc bits on [start, end], unlock all but locked_page */
+int extent_clear_unlock_delalloc(struct inode *inode,
+				struct extent_io_tree *tree,
+				u64 start, u64 end, struct page *locked_page,
+				int clear_dirty, int set_writeback,
+				int end_writeback)
+{
+	struct page *pages[16];
+	unsigned long index = start >> PAGE_CACHE_SHIFT;
+	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
+	unsigned long nr_pages = end_index - index + 1;
+	int clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC;
+	int ret, i;
+
+	if (clear_dirty)
+		clear_bits |= EXTENT_DIRTY;
+
+	clear_extent_bit(tree, start, end, clear_bits, 1, 0, GFP_NOFS);
+
+	while (nr_pages > 0) {
+		ret = find_get_pages_contig(inode->i_mapping, index,
+				     min_t(unsigned long, nr_pages,
+				     ARRAY_SIZE(pages)), pages);
+		for (i = 0; i < ret; i++) {
+			/* locked_page only drops the lookup's reference */
+			if (pages[i] != locked_page) {
+				if (clear_dirty)
+					clear_page_dirty_for_io(pages[i]);
+				if (set_writeback)
+					set_page_writeback(pages[i]);
+				if (end_writeback)
+					end_page_writeback(pages[i]);
+				unlock_page(pages[i]);
+			}
+			page_cache_release(pages[i]);
+		}
+		nr_pages -= ret;
+		index += ret;
+		cond_resched();
+	}
+	return 0;
+}
+EXPORT_SYMBOL(extent_clear_unlock_delalloc);
+
 /*
  * count the number of bytes in the tree that have a given bit(s)
  * set.  This can be fairly slow, except for EXTENT_DIRTY which is
@@ -1631,38 +1800,26 @@ extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
 	return bio;
 }
 
-static int submit_one_bio(int rw, struct bio *bio, int mirror_num)
+static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
+			  unsigned long bio_flags)
 {
 	int ret = 0;
 	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
 	struct page *page = bvec->bv_page;
 	struct extent_io_tree *tree = bio->bi_private;
-	struct rb_node *node;
-	struct extent_state *state;
 	u64 start;
 	u64 end;
 
 	start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
 	end = start + bvec->bv_len - 1;
 
-	spin_lock_irq(&tree->lock);
-	node = __etree_search(tree, start, NULL, NULL);
-	BUG_ON(!node);
-	state = rb_entry(node, struct extent_state, rb_node);
-	while(state->end < end) {
-		node = rb_next(node);
-		state = rb_entry(node, struct extent_state, rb_node);
-	}
-	BUG_ON(state->end != end);
-	spin_unlock_irq(&tree->lock);
-
 	bio->bi_private = NULL;
 
 	bio_get(bio);
 
 	if (tree->ops && tree->ops->submit_bio_hook)
 		tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
-					   mirror_num);
+					   mirror_num, bio_flags);
 	else
 		submit_bio(rw, bio);
 	if (bio_flagged(bio, BIO_EOPNOTSUPP))
@@ -1678,39 +1835,56 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
 			      struct bio **bio_ret,
 			      unsigned long max_pages,
 			      bio_end_io_t end_io_func,
-			      int mirror_num)
+			      int mirror_num,
+			      unsigned long prev_bio_flags,
+			      unsigned long bio_flags)
 {
 	int ret = 0;
 	struct bio *bio;
 	int nr;
+	int contig = 0;
+	int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED;
+	int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED;
+	size_t page_size = min(size, PAGE_CACHE_SIZE);
 
 	if (bio_ret && *bio_ret) {
 		bio = *bio_ret;
-		if (bio->bi_sector + (bio->bi_size >> 9) != sector ||
+		if (old_compressed)
+			contig = bio->bi_sector == sector;
+		else
+			contig = bio->bi_sector + (bio->bi_size >> 9) ==
+				sector;
+
+		if (prev_bio_flags != bio_flags || !contig ||
 		    (tree->ops && tree->ops->merge_bio_hook &&
-		     tree->ops->merge_bio_hook(page, offset, size, bio)) ||
-		    bio_add_page(bio, page, size, offset) < size) {
-			ret = submit_one_bio(rw, bio, mirror_num);
+		     tree->ops->merge_bio_hook(page, offset, page_size, bio,
+					       bio_flags)) ||
+		    bio_add_page(bio, page, page_size, offset) < page_size) {
+			ret = submit_one_bio(rw, bio, mirror_num,
+					     prev_bio_flags);
 			bio = NULL;
 		} else {
 			return 0;
 		}
 	}
-	nr = bio_get_nr_vecs(bdev);
+	if (this_compressed)
+		nr = BIO_MAX_PAGES;
+	else
+		nr = bio_get_nr_vecs(bdev);
+
 	bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
 	if (!bio) {
 		printk("failed to allocate bio nr %d\n", nr);
 	}
 
-
-	bio_add_page(bio, page, size, offset);
+	bio_add_page(bio, page, page_size, offset);
 	bio->bi_end_io = end_io_func;
 	bio->bi_private = tree;
 
 	if (bio_ret) {
 		*bio_ret = bio;
 	} else {
-		ret = submit_one_bio(rw, bio, mirror_num);
+		ret = submit_one_bio(rw, bio, mirror_num, bio_flags);
 	}
 
 	return ret;
@@ -1738,7 +1912,8 @@ void set_page_extent_head(struct page *page, unsigned long len)
 static int __extent_read_full_page(struct extent_io_tree *tree,
 				   struct page *page,
 				   get_extent_t *get_extent,
-				   struct bio **bio, int mirror_num)
+				   struct bio **bio, int mirror_num,
+				   unsigned long *bio_flags)
 {
 	struct inode *inode = page->mapping->host;
 	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
@@ -1756,13 +1931,27 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
 	int nr = 0;
 	size_t page_offset = 0;
 	size_t iosize;
+	size_t disk_io_size;
 	size_t blocksize = inode->i_sb->s_blocksize;
+	unsigned long this_bio_flag = 0;
 
 	set_page_extent_mapped(page);
 
 	end = page_end;
 	lock_extent(tree, start, end, GFP_NOFS);
 
+	if (page->index == last_byte >> PAGE_CACHE_SHIFT) {
+		char *userpage;
+		size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1);
+
+		if (zero_offset) {
+			iosize = PAGE_CACHE_SIZE - zero_offset;
+			userpage = kmap_atomic(page, KM_USER0);
+			memset(userpage + zero_offset, 0, iosize);
+			flush_dcache_page(page);
+			kunmap_atomic(userpage, KM_USER0);
+		}
+	}
 	while (cur <= end) {
 		if (cur >= last_byte) {
 			char *userpage;
@@ -1793,10 +1982,19 @@ printk("2bad mapping end %Lu cur %Lu\n", end, cur);
 		}
 		BUG_ON(end < cur);
 
+		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
+			this_bio_flag = EXTENT_BIO_COMPRESSED;
+
 		iosize = min(extent_map_end(em) - cur, end - cur + 1);
 		cur_end = min(extent_map_end(em) - 1, end);
 		iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
-		sector = (em->block_start + extent_offset) >> 9;
+		if (this_bio_flag & EXTENT_BIO_COMPRESSED) {
+			disk_io_size = em->block_len;
+			sector = em->block_start >> 9;
+		} else {
+			sector = (em->block_start + extent_offset) >> 9;
+			disk_io_size = iosize;
+		}
 		bdev = em->bdev;
 		block_start = em->block_start;
 		free_extent_map(em);
@@ -1845,10 +2043,13 @@ printk("2bad mapping end %Lu cur %Lu\n", end, cur);
 			unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
 			pnr -= page->index;
 			ret = submit_extent_page(READ, tree, page,
-					 sector, iosize, page_offset,
+					 sector, disk_io_size, page_offset,
 					 bdev, bio, pnr,
-					 end_bio_extent_readpage, mirror_num);
+					 end_bio_extent_readpage, mirror_num,
+					 *bio_flags,
+					 this_bio_flag);
 			nr++;
+			*bio_flags = this_bio_flag;
 		}
 		if (ret)
 			SetPageError(page);
@@ -1867,11 +2068,13 @@ int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
 			    get_extent_t *get_extent)
 {
 	struct bio *bio = NULL;
+	unsigned long bio_flags = 0;
 	int ret;
 
-	ret = __extent_read_full_page(tree, page, get_extent, &bio, 0);
+	ret = __extent_read_full_page(tree, page, get_extent, &bio, 0,
+				      &bio_flags);
 	if (bio)
-		submit_one_bio(READ, bio, 0);
+		submit_one_bio(READ, bio, 0, bio_flags);
 	return ret;
 }
 EXPORT_SYMBOL(extent_read_full_page);
@@ -1909,6 +2112,8 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 	unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
 	u64 nr_delalloc;
 	u64 delalloc_end;
+	int page_started;
+	int compressed;
 
 	WARN_ON(!PageLocked(page));
 	pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
@@ -1934,27 +2139,33 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 
 	delalloc_start = start;
 	delalloc_end = 0;
+	page_started = 0;
 	while(delalloc_end < page_end) {
-		nr_delalloc = find_lock_delalloc_range(tree, &delalloc_start,
+		nr_delalloc = find_lock_delalloc_range(inode, tree,
+						       page,
+						       &delalloc_start,
 						       &delalloc_end,
 						       128 * 1024 * 1024);
 		if (nr_delalloc == 0) {
 			delalloc_start = delalloc_end + 1;
 			continue;
 		}
-		tree->ops->fill_delalloc(inode, delalloc_start,
-					 delalloc_end);
-		clear_extent_bit(tree, delalloc_start,
-				 delalloc_end,
-				 EXTENT_LOCKED | EXTENT_DELALLOC,
-				 1, 0, GFP_NOFS);
+		tree->ops->fill_delalloc(inode, page, delalloc_start,
+					 delalloc_end, &page_started);
 		delalloc_start = delalloc_end + 1;
 	}
+
+	/* did the fill delalloc function already unlock and start the IO? */
+	if (page_started) {
+		return 0;
+	}
+
 	lock_extent(tree, start, page_end, GFP_NOFS);
 	unlock_start = start;
 
 	if (tree->ops && tree->ops->writepage_start_hook) {
-		ret = tree->ops->writepage_start_hook(page, start, page_end);
+		ret = tree->ops->writepage_start_hook(page, start,
+						      page_end);
 		if (ret == -EAGAIN) {
 			unlock_extent(tree, start, page_end, GFP_NOFS);
 			redirty_page_for_writepage(wbc, page);
@@ -2006,10 +2217,15 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 		sector = (em->block_start + extent_offset) >> 9;
 		bdev = em->bdev;
 		block_start = em->block_start;
+		compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
 		free_extent_map(em);
 		em = NULL;
 
-		if (block_start == EXTENT_MAP_HOLE ||
+		/*
+		 * compressed and inline extents are written through other
+		 * paths in the FS
+		 */
+		if (compressed || block_start == EXTENT_MAP_HOLE ||
 		    block_start == EXTENT_MAP_INLINE) {
 			clear_extent_dirty(tree, cur,
 					   cur + iosize - 1, GFP_NOFS);
@@ -2017,16 +2233,28 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 			unlock_extent(tree, unlock_start, cur + iosize -1,
 				      GFP_NOFS);
 
-			if (tree->ops && tree->ops->writepage_end_io_hook)
+			/*
+			 * end_io notification does not happen here for
+			 * compressed extents
+			 */
+			if (!compressed && tree->ops &&
+			    tree->ops->writepage_end_io_hook)
 				tree->ops->writepage_end_io_hook(page, cur,
 							 cur + iosize - 1,
 							 NULL, 1);
-			cur = cur + iosize;
+			else if (compressed) {
+				/* we don't want to end_page_writeback on
+				 * a compressed extent.  this happens
+				 * elsewhere
+				 */
+				nr++;
+			}
+
+			cur += iosize;
 			pg_offset += iosize;
 			unlock_start = cur;
 			continue;
 		}
-
 		/* leave this out until we have a page_mkwrite call */
 		if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
 				   EXTENT_DIRTY, 0)) {
@@ -2034,6 +2262,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 			pg_offset += iosize;
 			continue;
 		}
+
 		clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS);
 		if (tree->ops && tree->ops->writepage_io_hook) {
 			ret = tree->ops->writepage_io_hook(page, cur,
@@ -2057,7 +2286,8 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 			ret = submit_extent_page(WRITE, tree, page, sector,
 						 iosize, pg_offset, bdev,
 						 &epd->bio, max_nr,
-						 end_bio_extent_writepage, 0);
+						 end_bio_extent_writepage,
+						 0, 0, 0);
 			if (ret)
 				SetPageError(page);
 		}
@@ -2226,7 +2456,7 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
 	extent_write_cache_pages(tree, mapping, &wbc_writepages,
 				 __extent_writepage, &epd);
 	if (epd.bio) {
-		submit_one_bio(WRITE, epd.bio, 0);
+		submit_one_bio(WRITE, epd.bio, 0, 0);
 	}
 	return ret;
 }
@@ -2248,7 +2478,7 @@ int extent_writepages(struct extent_io_tree *tree,
 	ret = extent_write_cache_pages(tree, mapping, wbc,
 				       __extent_writepage, &epd);
 	if (epd.bio) {
-		submit_one_bio(WRITE, epd.bio, 0);
+		submit_one_bio(WRITE, epd.bio, 0, 0);
 	}
 	return ret;
 }
@@ -2262,6 +2492,7 @@ int extent_readpages(struct extent_io_tree *tree,
 	struct bio *bio = NULL;
 	unsigned page_idx;
 	struct pagevec pvec;
+	unsigned long bio_flags = 0;
 
 	pagevec_init(&pvec, 0);
 	for (page_idx = 0; page_idx < nr_pages; page_idx++) {
@@ -2281,7 +2512,7 @@ int extent_readpages(struct extent_io_tree *tree,
 			if (!pagevec_add(&pvec, page))
 				__pagevec_lru_add(&pvec);
 			__extent_read_full_page(tree, page, get_extent,
-						&bio, 0);
+						&bio, 0, &bio_flags);
 		}
 		page_cache_release(page);
 	}
@@ -2289,7 +2520,7 @@ int extent_readpages(struct extent_io_tree *tree,
 		__pagevec_lru_add(&pvec);
 	BUG_ON(!list_empty(pages));
 	if (bio)
-		submit_one_bio(READ, bio, 0);
+		submit_one_bio(READ, bio, 0, bio_flags);
 	return 0;
 }
 EXPORT_SYMBOL(extent_readpages);
@@ -2414,7 +2645,8 @@ int extent_prepare_write(struct extent_io_tree *tree,
 			ret = submit_extent_page(READ, tree, page,
 					 sector, iosize, page_offset, em->bdev,
 					 NULL, 1,
-					 end_bio_extent_preparewrite, 0);
+					 end_bio_extent_preparewrite, 0,
+					 0, 0);
 			iocount++;
 			block_start = block_start + iosize;
 		} else {
@@ -2495,7 +2727,9 @@ int try_release_extent_mapping(struct extent_map_tree *map,
 			}
 			if (!test_range_bit(tree, em->start,
 					    extent_map_end(em) - 1,
-					    EXTENT_LOCKED, 0)) {
+					    EXTENT_LOCKED | EXTENT_WRITEBACK |
+					    EXTENT_ORDERED,
+					    0)) {
 				remove_extent_mapping(map, em);
 				/* once for the rb tree */
 				free_extent_map(em);
@@ -2923,6 +3157,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
 	int inc_all_pages = 0;
 	unsigned long num_pages;
 	struct bio *bio = NULL;
+	unsigned long bio_flags = 0;
 
 	if (eb->flags & EXTENT_UPTODATE)
 		return 0;
@@ -2973,7 +3208,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
 			ClearPageError(page);
 			err = __extent_read_full_page(tree, page,
 						      get_extent, &bio,
-						      mirror_num);
+						      mirror_num, &bio_flags);
 			if (err) {
 				ret = err;
 				printk("err %d from __extent_read_full_page\n", ret);
@@ -2984,7 +3219,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
 	}
 
 	if (bio)
-		submit_one_bio(READ, bio, mirror_num);
+		submit_one_bio(READ, bio, mirror_num, bio_flags);
 
 	if (ret || !wait) {
 		if (ret)
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index c9d1908a1ae3..86f859b87a6e 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -18,6 +18,9 @@
 #define EXTENT_BOUNDARY (1 << 11)
 #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
 
+/* flags for bio submission */
+#define EXTENT_BIO_COMPRESSED 1
+
 /*
  * page->private values.  Every page that is controlled by the extent
  * map has page->private set to one.
@@ -28,14 +31,17 @@
 struct extent_state;
 
 typedef	int (extent_submit_bio_hook_t)(struct inode *inode, int rw,
-				       struct bio *bio, int mirror_num);
+				       struct bio *bio, int mirror_num,
+				       unsigned long bio_flags);
 struct extent_io_ops {
-	int (*fill_delalloc)(struct inode *inode, u64 start, u64 end);
+	int (*fill_delalloc)(struct inode *inode, struct page *locked_page,
+			     u64 start, u64 end, int *page_started);
 	int (*writepage_start_hook)(struct page *page, u64 start, u64 end);
 	int (*writepage_io_hook)(struct page *page, u64 start, u64 end);
 	extent_submit_bio_hook_t *submit_bio_hook;
 	int (*merge_bio_hook)(struct page *page, unsigned long offset,
-			      size_t size, struct bio *bio);
+			      size_t size, struct bio *bio,
+			      unsigned long bio_flags);
 	int (*readpage_io_hook)(struct page *page, u64 start, u64 end);
 	int (*readpage_io_failed_hook)(struct bio *bio, struct page *page,
 				       u64 start, u64 end,
@@ -245,4 +251,9 @@ void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km);
 int release_extent_buffer_tail_pages(struct extent_buffer *eb);
 int extent_range_uptodate(struct extent_io_tree *tree,
 			  u64 start, u64 end);
+int extent_clear_unlock_delalloc(struct inode *inode,
+				struct extent_io_tree *tree,
+				u64 start, u64 end, struct page *locked_page,
+				int clear_dirty, int set_writeback,
+				int clear_writeback);
 #endif
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 74b2a29880d3..fd3ebfb8c3c5 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -184,6 +184,13 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next)
 	if (test_bit(EXTENT_FLAG_PINNED, &prev->flags))
 		return 0;
 
+	/*
+	 * don't merge compressed extents, we need to know their
+	 * actual size
+	 */
+	if (test_bit(EXTENT_FLAG_COMPRESSED, &prev->flags))
+		return 0;
+
 	if (extent_map_end(prev) == next->start &&
 	    prev->flags == next->flags &&
 	    prev->bdev == next->bdev &&
@@ -239,6 +246,7 @@ int add_extent_mapping(struct extent_map_tree *tree,
 		if (rb && mergable_maps(merge, em)) {
 			em->start = merge->start;
 			em->len += merge->len;
+			em->block_len += merge->block_len;
 			em->block_start = merge->block_start;
 			merge->in_tree = 0;
 			rb_erase(&merge->rb_node, &tree->map);
@@ -250,6 +258,7 @@ int add_extent_mapping(struct extent_map_tree *tree,
 		merge = rb_entry(rb, struct extent_map, rb_node);
 	if (rb && mergable_maps(em, merge)) {
 		em->len += merge->len;
+		em->block_len += merge->len;
 		rb_erase(&merge->rb_node, &tree->map);
 		merge->in_tree = 0;
 		free_extent_map(merge);
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 26ac6fe0b268..abbcbeb28c79 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -10,6 +10,7 @@
 
 /* bits for the flags field */
 #define EXTENT_FLAG_PINNED 0 /* this entry not yet on disk, don't free it */
+#define EXTENT_FLAG_COMPRESSED 1
 
 struct extent_map {
 	struct rb_node rb_node;
@@ -18,6 +19,7 @@ struct extent_map {
 	u64 start;
 	u64 len;
 	u64 block_start;
+	u64 block_len;
 	unsigned long flags;
 	struct block_device *bdev;
 	atomic_t refs;
@@ -38,9 +40,9 @@ static inline u64 extent_map_end(struct extent_map *em)
 
 static inline u64 extent_map_block_end(struct extent_map *em)
 {
-	if (em->block_start + em->len < em->block_start)
+	if (em->block_start + em->block_len < em->block_start)
 		return (u64)-1;
-	return em->block_start + em->len;
+	return em->block_start + em->block_len;
 }
 
 void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask);
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 6dbe88b9d7d4..f4d3fa71bc41 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -31,7 +31,8 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root,
 			     u64 objectid, u64 pos,
 			     u64 disk_offset, u64 disk_num_bytes,
-			     u64 num_bytes, u64 offset)
+			     u64 num_bytes, u64 offset, u64 ram_bytes,
+			     u8 compression, u8 encryption, u16 other_encoding)
 {
 	int ret = 0;
 	struct btrfs_file_extent_item *item;
@@ -57,8 +58,13 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
 	btrfs_set_file_extent_disk_num_bytes(leaf, item, disk_num_bytes);
 	btrfs_set_file_extent_offset(leaf, item, offset);
 	btrfs_set_file_extent_num_bytes(leaf, item, num_bytes);
+	btrfs_set_file_extent_ram_bytes(leaf, item, ram_bytes);
 	btrfs_set_file_extent_generation(leaf, item, trans->transid);
 	btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG);
+	btrfs_set_file_extent_compression(leaf, item, compression);
+	btrfs_set_file_extent_encryption(leaf, item, encryption);
+	btrfs_set_file_extent_other_encoding(leaf, item, other_encoding);
+
 	btrfs_mark_buffer_dirty(leaf);
 out:
 	btrfs_free_path(path);
@@ -213,6 +219,73 @@ found:
 	return 0;
 }
 
+/* checksum each page cache page in [start, start + len) and attach the
+ * sums to the ordered extent covering it, one page per step */
+int btrfs_csum_file_bytes(struct btrfs_root *root, struct inode *inode,
+			  u64 start, unsigned long len)
+{
+	struct btrfs_ordered_sum *sums;
+	struct btrfs_sector_sum *sector_sum;
+	struct btrfs_ordered_extent *ordered;
+	char *data;
+	struct page *page;
+	unsigned long this_sum_bytes = 0;
+
+	sums = kzalloc(btrfs_ordered_sum_size(root, len), GFP_NOFS);
+	if (!sums)
+		return -ENOMEM;
+
+	sector_sum = sums->sums;
+	sums->file_offset = start;
+	sums->len = len;
+	INIT_LIST_HEAD(&sums->list);
+	ordered = btrfs_lookup_ordered_extent(inode, sums->file_offset);
+	BUG_ON(!ordered);
+
+	while(len > 0) {
+		if (start >= ordered->file_offset + ordered->len ||
+		    start < ordered->file_offset) {
+			sums->len = this_sum_bytes;
+			this_sum_bytes = 0;
+			btrfs_add_ordered_sum(inode, ordered, sums);
+			btrfs_put_ordered_extent(ordered);
+
+			sums = kzalloc(btrfs_ordered_sum_size(root, len),
+				       GFP_NOFS);
+			BUG_ON(!sums);
+			sector_sum = sums->sums;
+			sums->len = len;
+			sums->file_offset = start;
+			ordered = btrfs_lookup_ordered_extent(inode,
+						      sums->file_offset);
+			BUG_ON(!ordered);
+		}
+
+		page = find_get_page(inode->i_mapping,
+				     start >> PAGE_CACHE_SHIFT);
+
+		data = kmap_atomic(page, KM_USER0);
+		sector_sum->sum = ~(u32)0;
+		sector_sum->sum = btrfs_csum_data(root, data, sector_sum->sum,
+						  PAGE_CACHE_SIZE);
+		kunmap_atomic(data, KM_USER0);
+		btrfs_csum_final(sector_sum->sum,
+				 (char *)&sector_sum->sum);
+		sector_sum->offset = page_offset(page);
+		page_cache_release(page);
+
+		sector_sum++;
+		this_sum_bytes += PAGE_CACHE_SIZE;
+		start += PAGE_CACHE_SIZE;
+
+		WARN_ON(len < PAGE_CACHE_SIZE);
+		len -= PAGE_CACHE_SIZE;
+	}
+	btrfs_add_ordered_sum(inode, ordered, sums);
+	btrfs_put_ordered_extent(ordered);
+	return 0;
+}
+
 int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
 		       struct bio *bio)
 {
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 69abbe19add2..0aa15436590e 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -95,153 +95,6 @@ static void noinline btrfs_drop_pages(struct page **pages, size_t num_pages)
 	}
 }
 
-/* this does all the hard work for inserting an inline extent into
- * the btree.  Any existing inline extent is extended as required to make room,
- * otherwise things are inserted as required into the btree
- */
-static int noinline insert_inline_extent(struct btrfs_trans_handle *trans,
-				struct btrfs_root *root, struct inode *inode,
-				u64 offset, size_t size,
-				struct page **pages, size_t page_offset,
-				int num_pages)
-{
-	struct btrfs_key key;
-	struct btrfs_path *path;
-	struct extent_buffer *leaf;
-	char *kaddr;
-	unsigned long ptr;
-	struct btrfs_file_extent_item *ei;
-	struct page *page;
-	u32 datasize;
-	int err = 0;
-	int ret;
-	int i;
-	ssize_t cur_size;
-
-	path = btrfs_alloc_path();
-	if (!path)
-		return -ENOMEM;
-
-	btrfs_set_trans_block_group(trans, inode);
-
-	key.objectid = inode->i_ino;
-	key.offset = offset;
-	btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
-
-	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
-	if (ret < 0) {
-		err = ret;
-		goto fail;
-	}
-	if (ret == 1) {
-		struct btrfs_key found_key;
-
-		if (path->slots[0] == 0)
-			goto insert;
-
-		path->slots[0]--;
-		leaf = path->nodes[0];
-		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
-
-		if (found_key.objectid != inode->i_ino)
-			goto insert;
-
-		if (found_key.type != BTRFS_EXTENT_DATA_KEY)
-			goto insert;
-		ei = btrfs_item_ptr(leaf, path->slots[0],
-				    struct btrfs_file_extent_item);
-
-		if (btrfs_file_extent_type(leaf, ei) !=
-		    BTRFS_FILE_EXTENT_INLINE) {
-			goto insert;
-		}
-		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
-		ret = 0;
-	}
-	if (ret == 0) {
-		u32 found_size;
-		u64 found_end;
-
-		leaf = path->nodes[0];
-		ei = btrfs_item_ptr(leaf, path->slots[0],
-				    struct btrfs_file_extent_item);
-
-		if (btrfs_file_extent_type(leaf, ei) !=
-		    BTRFS_FILE_EXTENT_INLINE) {
-			err = ret;
-			btrfs_print_leaf(root, leaf);
-			printk("found wasn't inline offset %Lu inode %lu\n",
-			       offset, inode->i_ino);
-			goto fail;
-		}
-		found_size = btrfs_file_extent_inline_len(leaf,
-					  btrfs_item_nr(leaf, path->slots[0]));
-		found_end = key.offset + found_size;
-
-		if (found_end < offset + size) {
-			btrfs_release_path(root, path);
-			ret = btrfs_search_slot(trans, root, &key, path,
-						offset + size - found_end, 1);
-			BUG_ON(ret != 0);
-
-			ret = btrfs_extend_item(trans, root, path,
-						offset + size - found_end);
-			if (ret) {
-				err = ret;
-				goto fail;
-			}
-			leaf = path->nodes[0];
-			ei = btrfs_item_ptr(leaf, path->slots[0],
-					    struct btrfs_file_extent_item);
-			inode_add_bytes(inode, offset + size - found_end);
-		}
-		if (found_end < offset) {
-			ptr = btrfs_file_extent_inline_start(ei) + found_size;
-			memset_extent_buffer(leaf, 0, ptr, offset - found_end);
-		}
-	} else {
-insert:
-		btrfs_release_path(root, path);
-		datasize = offset + size - key.offset;
-		inode_add_bytes(inode, datasize);
-		datasize = btrfs_file_extent_calc_inline_size(datasize);
-		ret = btrfs_insert_empty_item(trans, root, path, &key,
-					      datasize);
-		if (ret) {
-			err = ret;
-			printk("got bad ret %d\n", ret);
-			goto fail;
-		}
-		leaf = path->nodes[0];
-		ei = btrfs_item_ptr(leaf, path->slots[0],
-				    struct btrfs_file_extent_item);
-		btrfs_set_file_extent_generation(leaf, ei, trans->transid);
-		btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
-	}
-	ptr = btrfs_file_extent_inline_start(ei) + offset - key.offset;
-
-	cur_size = size;
-	i = 0;
-	while (size > 0) {
-		page = pages[i];
-		kaddr = kmap_atomic(page, KM_USER0);
-		cur_size = min_t(size_t, PAGE_CACHE_SIZE - page_offset, size);
-		write_extent_buffer(leaf, kaddr + page_offset, ptr, cur_size);
-		kunmap_atomic(kaddr, KM_USER0);
-		page_offset = 0;
-		ptr += cur_size;
-		size -= cur_size;
-		if (i >= num_pages) {
-			printk("i %d num_pages %d\n", i, num_pages);
-		}
-		i++;
-	}
-	btrfs_mark_buffer_dirty(leaf);
-fail:
-	btrfs_free_path(path);
-	return err;
-}
-
 /*
  * after copy_from_user, pages need to be dirtied and we need to make
  * sure holes are created between the current EOF and the start of
@@ -267,8 +120,6 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
 	u64 start_pos;
 	u64 end_of_last_block;
 	u64 end_pos = pos + write_bytes;
-	u64 inline_size;
-	int did_inline = 0;
 	loff_t isize = i_size_read(inode);
 
 	start_pos = pos & ~((u64)root->sectorsize - 1);
@@ -314,7 +165,8 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
 			err = btrfs_insert_file_extent(trans, root,
 						       inode->i_ino,
 						       last_pos_in_file,
-						       0, 0, hole_size, 0);
+						       0, 0, hole_size, 0,
+						       hole_size, 0, 0, 0);
 			btrfs_drop_extent_cache(inode, last_pos_in_file,
 					last_pos_in_file + hole_size - 1, 0);
 			mutex_unlock(&BTRFS_I(inode)->extent_mutex);
@@ -324,57 +176,19 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
 			goto failed;
 	}
 
-	/*
-	 * either allocate an extent for the new bytes or setup the key
-	 * to show we are doing inline data in the extent
+	/* check for reserved extents on each page, we don't want
+	 * to reset the delalloc bit on things that already have
+	 * extents reserved.
 	 */
-	inline_size = end_pos;
-	if (isize >= BTRFS_MAX_INLINE_DATA_SIZE(root) ||
-	    inline_size > root->fs_info->max_inline ||
-	    (inline_size & (root->sectorsize -1)) == 0 ||
-	    inline_size >= BTRFS_MAX_INLINE_DATA_SIZE(root)) {
-		/* check for reserved extents on each page, we don't want
-		 * to reset the delalloc bit on things that already have
-		 * extents reserved.
-		 */
-		btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block);
-		for (i = 0; i < num_pages; i++) {
-			struct page *p = pages[i];
-			SetPageUptodate(p);
-			ClearPageChecked(p);
-			set_page_dirty(p);
-		}
-	} else {
-		u64 aligned_end;
-		/* step one, delete the existing extents in this range */
-		aligned_end = (pos + write_bytes + root->sectorsize - 1) &
-			~((u64)root->sectorsize - 1);
-		mutex_lock(&BTRFS_I(inode)->extent_mutex);
-		err = btrfs_drop_extents(trans, root, inode, start_pos,
-					 aligned_end, aligned_end, &hint_byte);
-		if (err)
-			goto failed;
-		if (isize > inline_size)
-			inline_size = min_t(u64, isize, aligned_end);
-		inline_size -= start_pos;
-		err = insert_inline_extent(trans, root, inode, start_pos,
-					   inline_size, pages, 0, num_pages);
-		btrfs_drop_extent_cache(inode, start_pos, aligned_end - 1, 0);
-		BUG_ON(err);
-		mutex_unlock(&BTRFS_I(inode)->extent_mutex);
-
-		/*
-		 * an ugly way to do all the prop accounting around
-		 * the page bits and mapping tags
-		 */
-		set_page_writeback(pages[0]);
-		end_page_writeback(pages[0]);
-		did_inline = 1;
+	btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block);
+	for (i = 0; i < num_pages; i++) {
+		struct page *p = pages[i];
+		SetPageUptodate(p);
+		ClearPageChecked(p);
+		set_page_dirty(p);
 	}
 	if (end_pos > isize) {
 		i_size_write(inode, end_pos);
-		if (did_inline)
-			BTRFS_I(inode)->disk_i_size = end_pos;
 		btrfs_update_inode(trans, root, inode);
 	}
 failed:
@@ -399,6 +213,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 	int ret;
 	int testend = 1;
 	unsigned long flags;
+	int compressed = 0;
 
 	WARN_ON(end < start);
 	if (end == (u64)-1) {
@@ -434,6 +249,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 			free_extent_map(em);
 			continue;
 		}
+		compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
 		clear_bit(EXTENT_FLAG_PINNED, &em->flags);
 		remove_extent_mapping(em_tree, em);
 
@@ -442,6 +258,12 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 			split->start = em->start;
 			split->len = start - em->start;
 			split->block_start = em->block_start;
+
+			if (compressed)
+				split->block_len = em->block_len;
+			else
+				split->block_len = split->len;
+
 			split->bdev = em->bdev;
 			split->flags = flags;
 			ret = add_extent_mapping(em_tree, split);
@@ -459,7 +281,13 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 			split->bdev = em->bdev;
 			split->flags = flags;
 
-			split->block_start = em->block_start + diff;
+			if (compressed) {
+				split->block_len = em->block_len;
+				split->block_start = em->block_start;
+			} else {
+				split->block_len = split->len;
+				split->block_start = em->block_start + diff;
+			}
 
 			ret = add_extent_mapping(em_tree, split);
 			BUG_ON(ret);
@@ -533,7 +361,7 @@ int btrfs_check_file(struct btrfs_root *root, struct inode *inode)
 			struct btrfs_item *item;
 			item = btrfs_item_nr(leaf, slot);
 			extent_end = found_key.offset +
-			     btrfs_file_extent_inline_len(leaf, item);
+			     btrfs_file_extent_inline_len(leaf, extent);
 			extent_end = (extent_end + root->sectorsize - 1) &
 				~((u64)root->sectorsize -1 );
 		}
@@ -573,6 +401,10 @@ int noinline btrfs_drop_extents(struct btrfs_trans_handle *trans,
 	u64 extent_end = 0;
 	u64 search_start = start;
 	u64 leaf_start;
+	u64 ram_bytes = 0;
+	u8 compression = 0;
+	u8 encryption = 0;
+	u16 other_encoding = 0;
 	u64 root_gen;
 	u64 root_owner;
 	struct extent_buffer *leaf;
@@ -589,6 +421,7 @@ int noinline btrfs_drop_extents(struct btrfs_trans_handle *trans,
 	int recow;
 	int ret;
 
+	inline_limit = 0;
 	btrfs_drop_extent_cache(inode, start, end - 1, 0);
 
 	path = btrfs_alloc_path();
@@ -637,6 +470,12 @@ next_slot:
 			extent = btrfs_item_ptr(leaf, slot,
 						struct btrfs_file_extent_item);
 			found_type = btrfs_file_extent_type(leaf, extent);
+			compression = btrfs_file_extent_compression(leaf,
+								    extent);
+			encryption = btrfs_file_extent_encryption(leaf,
+								  extent);
+			other_encoding = btrfs_file_extent_other_encoding(leaf,
+								  extent);
 			if (found_type == BTRFS_FILE_EXTENT_REG) {
 				extent_end =
 				     btrfs_file_extent_disk_bytenr(leaf,
@@ -646,13 +485,13 @@ next_slot:
 
 				extent_end = key.offset +
 				     btrfs_file_extent_num_bytes(leaf, extent);
+				ram_bytes = btrfs_file_extent_ram_bytes(leaf,
+								extent);
 				found_extent = 1;
 			} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
-				struct btrfs_item *item;
-				item = btrfs_item_nr(leaf, slot);
 				found_inline = 1;
 				extent_end = key.offset +
-				     btrfs_file_extent_inline_len(leaf, item);
+				     btrfs_file_extent_inline_len(leaf, extent);
 			}
 		} else {
 			extent_end = search_start;
@@ -680,10 +519,9 @@ next_slot:
 			search_start = (extent_end + mask) & ~mask;
 		} else
 			search_start = extent_end;
-		if (end <= extent_end && start >= key.offset && found_inline) {
+
+		if (end <= extent_end && start >= key.offset && found_inline)
 			*hint_byte = EXTENT_MAP_INLINE;
-			goto out;
-		}
 
 		if (found_extent) {
 			read_extent_buffer(leaf, &old, (unsigned long)extent,
@@ -770,12 +608,27 @@ next_slot:
 			write_extent_buffer(leaf, &old,
 					    (unsigned long)extent, sizeof(old));
 
+			btrfs_set_file_extent_compression(leaf, extent,
+							  compression);
+			btrfs_set_file_extent_encryption(leaf, extent,
+							 encryption);
+			btrfs_set_file_extent_other_encoding(leaf, extent,
+							     other_encoding);
 			btrfs_set_file_extent_offset(leaf, extent,
 				    le64_to_cpu(old.offset) + end - key.offset);
 			WARN_ON(le64_to_cpu(old.num_bytes) <
 				(extent_end - end));
 			btrfs_set_file_extent_num_bytes(leaf, extent,
 							extent_end - end);
+
+			/*
+			 * set the ram bytes to the size of the full extent
+			 * before splitting.  This is a worst case flag,
+			 * but it's the best we can do because we don't know
+			 * how splitting affects compression
+			 */
+			btrfs_set_file_extent_ram_bytes(leaf, extent,
+							ram_bytes);
 			btrfs_set_file_extent_type(leaf, extent,
 						   BTRFS_FILE_EXTENT_REG);
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index bf4bed6ca4d6..9797592dc86b 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -49,6 +49,7 @@
 #include "compat.h"
 #include "tree-log.h"
 #include "ref-cache.h"
+#include "compression.h"
 
 struct btrfs_iget_args {
 	u64 ino;
@@ -83,6 +84,7 @@ static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
 };
 
 static void btrfs_truncate(struct inode *inode);
+static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end);
 
 /*
  * a very lame attempt at stopping writes when the FS is 85% full.  There
@@ -113,58 +115,375 @@ int btrfs_check_free_space(struct btrfs_root *root, u64 num_required,
 	return ret;
 }
 
+/*
+ * insert an inline extent item for 'size' file bytes at 'start', copying
+ * from compressed_pages (stored as zlib) when given, else from the page
+ * cache.  Callers must btrfs_drop_extents first so no inline items overlap
+ */
+static int noinline insert_inline_extent(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root, struct inode *inode,
+				u64 start, size_t size, size_t compressed_size,
+				struct page **compressed_pages)
+{
+	struct btrfs_key key;
+	struct btrfs_path *path;
+	struct extent_buffer *leaf;
+	struct page *page = NULL;
+	char *kaddr;
+	unsigned long ptr;
+	struct btrfs_file_extent_item *ei;
+	int err = 0;
+	int ret;
+	size_t cur_size = size;
+	size_t datasize;
+	unsigned long offset;
+	int use_compress = 0;
+
+	if (compressed_size && compressed_pages) {
+		use_compress = 1;
+		cur_size = compressed_size;
+	}
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	btrfs_set_trans_block_group(trans, inode);
+
+	key.objectid = inode->i_ino;
+	key.offset = start;
+	btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
+	datasize = btrfs_file_extent_calc_inline_size(cur_size);
+	/* charge the file bytes (size, not the compressed size) exactly once */
+	inode_add_bytes(inode, size);
+	ret = btrfs_insert_empty_item(trans, root, path, &key,
+				      datasize);
+	BUG_ON(ret);
+	if (ret) {
+		err = ret;
+		printk("got bad ret %d\n", ret);
+		goto fail;
+	}
+	leaf = path->nodes[0];
+	ei = btrfs_item_ptr(leaf, path->slots[0],
+			    struct btrfs_file_extent_item);
+	btrfs_set_file_extent_generation(leaf, ei, trans->transid);
+	btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
+	btrfs_set_file_extent_encryption(leaf, ei, 0);
+	btrfs_set_file_extent_other_encoding(leaf, ei, 0);
+	btrfs_set_file_extent_ram_bytes(leaf, ei, size);
+	ptr = btrfs_file_extent_inline_start(ei);
+
+	if (use_compress) {
+		struct page *cpage;
+		int i = 0;
+		while(compressed_size > 0) {
+			cpage = compressed_pages[i];
+			cur_size = min(compressed_size,
+				       PAGE_CACHE_SIZE);
+
+			kaddr = kmap(cpage);
+			write_extent_buffer(leaf, kaddr, ptr, cur_size);
+			kunmap(cpage);
+
+			i++;
+			ptr += cur_size;
+			compressed_size -= cur_size;
+		}
+		btrfs_set_file_extent_compression(leaf, ei,
+						  BTRFS_COMPRESS_ZLIB);
+	} else {
+		page = find_get_page(inode->i_mapping,
+				     start >> PAGE_CACHE_SHIFT);
+		btrfs_set_file_extent_compression(leaf, ei, 0);
+		kaddr = kmap_atomic(page, KM_USER0);
+		offset = start & (PAGE_CACHE_SIZE - 1);
+		write_extent_buffer(leaf, kaddr + offset, ptr, size);
+		kunmap_atomic(kaddr, KM_USER0);
+		page_cache_release(page);
+	}
+	btrfs_mark_buffer_dirty(leaf);
+	btrfs_free_path(path);
+
+	BTRFS_I(inode)->disk_i_size = inode->i_size;
+	btrfs_update_inode(trans, root, inode);
+	return 0;
+fail:
+	btrfs_free_path(path);
+	return err;
+}
+
+
+/*
+ * conditionally insert an inline extent into the file.  This does the
+ * checks required to make sure the data is small enough to fit as an
+ * inline extent.  Returns 0 if the extent was inlined, 1 if it was not.
+ */
+static int cow_file_range_inline(struct btrfs_trans_handle *trans,
+				 struct btrfs_root *root,
+				 struct inode *inode, u64 start, u64 end,
+				 size_t compressed_size,
+				 struct page **compressed_pages)
+{
+	u64 isize = i_size_read(inode);
+	u64 actual_end = min(end + 1, isize);
+	u64 inline_len = actual_end - start;
+	u64 aligned_end = (end + root->sectorsize - 1) &
+			~((u64)root->sectorsize - 1);
+	u64 hint_byte;
+	u64 data_len = inline_len;
+	int ret;
+
+	if (compressed_size)
+		data_len = compressed_size;
+
+	if (start > 0 ||
+	    data_len >= BTRFS_MAX_INLINE_DATA_SIZE(root) ||
+	    (!compressed_size &&
+	    (actual_end & (root->sectorsize - 1)) == 0) ||
+	    end + 1 < isize ||
+	    data_len > root->fs_info->max_inline) {
+		return 1;
+	}
+
+	mutex_lock(&BTRFS_I(inode)->extent_mutex);
+	ret = btrfs_drop_extents(trans, root, inode, start,
+				 aligned_end, aligned_end, &hint_byte);
+	BUG_ON(ret);
+
+	/* end + 1 >= isize here, so inline_len already stops at i_size */
+	ret = insert_inline_extent(trans, root, inode, start,
+				   inline_len, compressed_size,
+				   compressed_pages);
+	BUG_ON(ret);
+	/* the cache range end is inclusive, unlike btrfs_drop_extents */
+	btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
+	mutex_unlock(&BTRFS_I(inode)->extent_mutex);
+	return 0;
+}
+
 /*
  * when extent_io.c finds a delayed allocation range in the file,
  * the call backs end up in this code.  The basic idea is to
  * allocate extents on disk for the range, and create ordered data structs
  * in ram to track those extents.
+ *
+ * locked_page is the page that writepage had locked already.  We use
+ * it to make sure we don't do extra locks or unlocks.
+ *
+ * *page_started is set to one if we unlock locked_page and do everything
+ * required to start IO on it.  It may be clean and already done with
+ * IO when we return.
  */
-static int cow_file_range(struct inode *inode, u64 start, u64 end)
+static int cow_file_range(struct inode *inode, struct page *locked_page,
+			  u64 start, u64 end, int *page_started)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_trans_handle *trans;
 	u64 alloc_hint = 0;
 	u64 num_bytes;
+	unsigned long ram_size;
+	u64 orig_start;
+	u64 disk_num_bytes;
 	u64 cur_alloc_size;
 	u64 blocksize = root->sectorsize;
-	u64 orig_num_bytes;
+	u64 actual_end;
 	struct btrfs_key ins;
 	struct extent_map *em;
 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
 	int ret = 0;
+	struct page **pages = NULL;
+	unsigned long nr_pages;
+	unsigned long nr_pages_ret = 0;
+	unsigned long total_compressed = 0;
+	unsigned long total_in = 0;
+	unsigned long max_compressed = 128 * 1024;
+	unsigned long max_uncompressed = 256 * 1024;
+	int i;
+	int will_compress;
 
 	trans = btrfs_join_transaction(root, 1);
 	BUG_ON(!trans);
 	btrfs_set_trans_block_group(trans, inode);
+	orig_start = start;
+
+	/*
+	 * compression made this loop a bit ugly, but the basic idea is to
+	 * compress some pages but keep the total size of the compressed
+	 * extent relatively small.  If compression is off, this goto target
+	 * is never used.
+	 */
+again:
+	will_compress = 0;
+	nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1;
+	nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE);
 
+	actual_end = min_t(u64, i_size_read(inode), end + 1);
+	total_compressed = actual_end - start;
+
+	/* we want to make sure that the amount of ram required to uncompress
+	 * an extent is reasonable, so we limit the total size in ram
+	 * of a compressed extent to 256k
+	 */
+	total_compressed = min(total_compressed, max_uncompressed);
 	num_bytes = (end - start + blocksize) & ~(blocksize - 1);
 	num_bytes = max(blocksize,  num_bytes);
-	orig_num_bytes = num_bytes;
+	disk_num_bytes = num_bytes;
+	total_in = 0;
+	ret = 0;
 
-	if (alloc_hint == EXTENT_MAP_INLINE)
-		goto out;
+	/* we do compression for mount -o compress and when the
+	 * inode has not been flagged as nocompress
+	 */
+	if (!btrfs_test_flag(inode, NOCOMPRESS) &&
+	    btrfs_test_opt(root, COMPRESS)) {
+		WARN_ON(pages);
+		pages = kmalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
+
+		/* we want to make sure the amount of IO required to satisfy
+		 * a random read is reasonably small, so we limit the size
+		 * of a compressed extent to 128k
+		 */
+		ret = btrfs_zlib_compress_pages(inode->i_mapping, start,
+						total_compressed, pages,
+						nr_pages, &nr_pages_ret,
+						&total_in,
+						&total_compressed,
+						max_compressed);
+
+		if (!ret) {
+			unsigned long offset = total_compressed &
+				(PAGE_CACHE_SIZE - 1);
+			struct page *page = pages[nr_pages_ret - 1];
+			char *kaddr;
+
+			/* zero the tail end of the last page, we might be
+			 * sending it down to disk
+			 */
+			if (offset) {
+				kaddr = kmap_atomic(page, KM_USER0);
+				memset(kaddr + offset, 0,
+				       PAGE_CACHE_SIZE - offset);
+				kunmap_atomic(kaddr, KM_USER0);
+			}
+			will_compress = 1;
+		}
+	}
+	if (start == 0) {
+		/* lets try to make an inline extent */
+		if (ret || total_in < (end - start + 1)) {
+			/* we didn't compress the entire range, try
+			 * to make an uncompressed inline extent.  This
+			 * is almost sure to fail, but maybe inline sizes
+			 * will get bigger later
+			 */
+			ret = cow_file_range_inline(trans, root, inode,
+						    start, end, 0, NULL);
+		} else {
+			ret = cow_file_range_inline(trans, root, inode,
+						    start, end,
+						    total_compressed, pages);
+		}
+		if (ret == 0) {
+			extent_clear_unlock_delalloc(inode,
+						     &BTRFS_I(inode)->io_tree,
+						     start, end, NULL,
+						     1, 1, 1);
+			*page_started = 1;
+			ret = 0;
+			goto free_pages_out;
+		}
+	}
+
+	if (will_compress) {
+		/*
+		 * we aren't doing an inline extent, so round the compressed
+		 * size up to a block size boundary so the allocator does
+		 * sane things
+		 */
+		total_compressed = (total_compressed + blocksize - 1) &
+			~(blocksize - 1);
+
+		/*
+		 * one last check to make sure the compression is really a
+		 * win, compare the page count read with the blocks on disk
+		 */
+		total_in = (total_in + PAGE_CACHE_SIZE - 1) &
+			~(PAGE_CACHE_SIZE - 1);
+		if (total_compressed >= total_in) {
+			will_compress = 0;
+		} else {
+			disk_num_bytes = total_compressed;
+			num_bytes = total_in;
+		}
+	}
+	if (!will_compress && pages) {
+		/*
+		 * the compression code ran but failed to make things smaller,
+		 * free any pages it allocated and our page pointer array
+		 */
+		for (i = 0; i < nr_pages_ret; i++) {
+			page_cache_release(pages[i]);
+		}
+		kfree(pages);
+		pages = NULL;
+		total_compressed = 0;
+		nr_pages_ret = 0;
+
+		/* flag the file so we don't compress in the future */
+		btrfs_set_flag(inode, NOCOMPRESS);
+	}
+
+	BUG_ON(disk_num_bytes >
+	       btrfs_super_total_bytes(&root->fs_info->super_copy));
 
-	BUG_ON(num_bytes > btrfs_super_total_bytes(&root->fs_info->super_copy));
 	mutex_lock(&BTRFS_I(inode)->extent_mutex);
 	btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
 	mutex_unlock(&BTRFS_I(inode)->extent_mutex);
 
-	while(num_bytes > 0) {
-		cur_alloc_size = min(num_bytes, root->fs_info->max_extent);
+	while(disk_num_bytes > 0) {
+		unsigned long min_bytes;
+
+		/*
+		 * the max size of a compressed extent is pretty small,
+		 * make the code a little less complex by forcing
+		 * the allocator to find a whole compressed extent at once
+		 */
+		if (will_compress)
+			min_bytes = disk_num_bytes;
+		else
+			min_bytes = root->sectorsize;
+
+		cur_alloc_size = min(disk_num_bytes, root->fs_info->max_extent);
 		ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
-					   root->sectorsize, 0, alloc_hint,
+					   min_bytes, 0, alloc_hint,
 					   (u64)-1, &ins, 1);
 		if (ret) {
 			WARN_ON(1);
-			goto out;
+			goto free_pages_out_fail;
 		}
 		em = alloc_extent_map(GFP_NOFS);
 		em->start = start;
-		em->len = ins.offset;
+
+		if (will_compress) {
+			ram_size = num_bytes;
+			em->len = num_bytes;
+		} else {
+			/* ramsize == disk size */
+			ram_size = ins.offset;
+			em->len = ins.offset;
+		}
+
 		em->block_start = ins.objectid;
+		em->block_len = ins.offset;
 		em->bdev = root->fs_info->fs_devices->latest_bdev;
+
 		mutex_lock(&BTRFS_I(inode)->extent_mutex);
 		set_bit(EXTENT_FLAG_PINNED, &em->flags);
+
+		if (will_compress)
+			set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+
 		while(1) {
 			spin_lock(&em_tree->lock);
 			ret = add_extent_mapping(em_tree, em);
@@ -174,26 +493,95 @@ static int cow_file_range(struct inode *inode, u64 start, u64 end)
 				break;
 			}
 			btrfs_drop_extent_cache(inode, start,
-						start + ins.offset - 1, 0);
+						start + ram_size - 1, 0);
 		}
 		mutex_unlock(&BTRFS_I(inode)->extent_mutex);
 
 		cur_alloc_size = ins.offset;
 		ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
-					       ins.offset, 0);
+					       ram_size, cur_alloc_size, 0,
+					       will_compress);
 		BUG_ON(ret);
-		if (num_bytes < cur_alloc_size) {
-			printk("num_bytes %Lu cur_alloc %Lu\n", num_bytes,
+
+		if (disk_num_bytes < cur_alloc_size) {
+			printk("num_bytes %Lu cur_alloc %Lu\n", disk_num_bytes,
 			       cur_alloc_size);
 			break;
 		}
+
+		if (will_compress) {
+			/*
+			 * we're doing compression, and we need to
+			 * submit the compressed extents down to the device.
+			 *
+			 * We lock down all the file pages, clearing their
+			 * dirty bits and setting them writeback.  Everyone
+			 * that wants to modify the page will wait on the
+			 * ordered extent above.
+			 *
+			 * The writeback bits on the file pages are
+			 * cleared when the compressed pages are on disk
+			 */
+			btrfs_end_transaction(trans, root);
+
+			if (start <= page_offset(locked_page) &&
+			    page_offset(locked_page) < start + ram_size) {
+				*page_started = 1;
+			}
+
+			extent_clear_unlock_delalloc(inode,
+						     &BTRFS_I(inode)->io_tree,
+						     start,
+						     start + ram_size - 1,
+						     NULL, 1, 1, 0);
+
+			ret = btrfs_submit_compressed_write(inode, start,
+						 ram_size, ins.objectid,
+						 cur_alloc_size, pages,
+						 nr_pages_ret);
+
+			BUG_ON(ret);
+			trans = btrfs_join_transaction(root, 1);
+			if (start + ram_size < end) {
+				start += ram_size;
+				alloc_hint = ins.objectid + ins.offset;
+				/* pages will be freed at end_bio time */
+				pages = NULL;
+				goto again;
+			} else {
+				/* we've written everything, time to go */
+				break;
+			}
+		}
+		/* we're not doing compressed IO, don't unlock the first
+		 * page (which the caller expects to stay locked), don't
+		 * clear any dirty bits and don't set any writeback bits
+		 */
+		extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
+					     start, start + ram_size - 1,
+					     locked_page, 0, 0, 0);
+		disk_num_bytes -= cur_alloc_size;
 		num_bytes -= cur_alloc_size;
 		alloc_hint = ins.objectid + ins.offset;
 		start += cur_alloc_size;
 	}
+
+	ret = 0;
 out:
 	btrfs_end_transaction(trans, root);
+
 	return ret;
+
+free_pages_out_fail:
+	extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
+				     start, end, locked_page, 0, 0, 0);
+free_pages_out:
+	for (i = 0; i < nr_pages_ret; i++)
+		page_cache_release(pages[i]);
+	if (pages)
+		kfree(pages);
+
+	goto out;
 }
 
 /*
@@ -203,7 +591,8 @@ out:
  * If no cow copies or snapshots exist, we write directly to the existing
  * blocks on disk
  */
-static int run_delalloc_nocow(struct inode *inode, u64 start, u64 end)
+static int run_delalloc_nocow(struct inode *inode, struct page *locked_page,
+			      u64 start, u64 end, int *page_started)
 {
 	u64 extent_start;
 	u64 extent_end;
@@ -260,6 +649,11 @@ again:
 		extent_end = extent_start + extent_num_bytes;
 		err = 0;
 
+		if (btrfs_file_extent_compression(leaf, item) ||
+		    btrfs_file_extent_encryption(leaf,item) ||
+		    btrfs_file_extent_other_encoding(leaf, item))
+			goto not_found;
+
 		if (loops && start != extent_start)
 			goto not_found;
 
@@ -284,7 +678,8 @@ again:
 		bytenr += btrfs_file_extent_offset(leaf, item);
 		extent_num_bytes = min(end + 1, extent_end) - start;
 		ret = btrfs_add_ordered_extent(inode, start, bytenr,
-						extent_num_bytes, 1);
+						extent_num_bytes,
+						extent_num_bytes, 1, 0);
 		if (ret) {
 			err = ret;
 			goto out;
@@ -300,7 +695,8 @@ again:
 not_found:
 		btrfs_end_transaction(trans, root);
 		btrfs_free_path(path);
-		return cow_file_range(inode, start, end);
+		return cow_file_range(inode, locked_page, start, end,
+				      page_started);
 	}
 out:
 	WARN_ON(err);
@@ -312,16 +708,19 @@ out:
 /*
  * extent_io.c call back to do delayed allocation processing
  */
-static int run_delalloc_range(struct inode *inode, u64 start, u64 end)
+static int run_delalloc_range(struct inode *inode, struct page *locked_page,
+			      u64 start, u64 end, int *page_started)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	int ret;
 
 	if (btrfs_test_opt(root, NODATACOW) ||
 	    btrfs_test_flag(inode, NODATACOW))
-		ret = run_delalloc_nocow(inode, start, end);
+		ret = run_delalloc_nocow(inode, locked_page, start, end,
+					 page_started);
 	else
-		ret = cow_file_range(inode, start, end);
+		ret = cow_file_range(inode, locked_page, start, end,
+				     page_started);
 
 	return ret;
 }
@@ -383,7 +782,8 @@ int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
  * we don't create bios that span stripes or chunks
  */
 int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
-			 size_t size, struct bio *bio)
+			 size_t size, struct bio *bio,
+			 unsigned long bio_flags)
 {
 	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
 	struct btrfs_mapping_tree *map_tree;
@@ -413,7 +813,7 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
  * are inserted into the btree
  */
 int __btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
-			  int mirror_num)
+			  int mirror_num, unsigned long bio_flags)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	int ret = 0;
@@ -429,7 +829,7 @@ int __btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
  * or reading the csums from the tree before a read
  */
 int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
-			  int mirror_num)
+			  int mirror_num, unsigned long bio_flags)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	int ret = 0;
@@ -444,11 +844,17 @@ int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 
 	if (!(rw & (1 << BIO_RW))) {
 		btrfs_lookup_bio_sums(root, inode, bio);
+
+		if (bio_flags & EXTENT_BIO_COMPRESSED) {
+			return btrfs_submit_compressed_read(inode, bio,
+						    mirror_num, bio_flags);
+		}
+
 		goto mapit;
 	}
 	return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
 				   inode, rw, bio, mirror_num,
-				   __btrfs_submit_bio_hook);
+				   bio_flags, __btrfs_submit_bio_hook);
 mapit:
 	return btrfs_map_bio(root, rw, bio, mirror_num, 0);
 }
@@ -539,7 +945,7 @@ out_page:
  * good idea.  This causes problems because we want to make sure COW
  * properly happens and the data=ordered rules are followed.
  *
- * In our case any range that doesn't have the EXTENT_ORDERED bit set
+ * In our case any range that doesn't have the ORDERED bit set
  * hasn't been properly setup for IO.  We kick off an async process
  * to fix it up.  The async helper will wait for ordered extents, set
  * the delalloc bit and make it safe to write the page.
@@ -632,10 +1038,21 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 	btrfs_set_file_extent_disk_bytenr(leaf, extent_item,
 					  ordered_extent->start);
 	btrfs_set_file_extent_disk_num_bytes(leaf, extent_item,
-					     ordered_extent->len);
+					     ordered_extent->disk_len);
 	btrfs_set_file_extent_offset(leaf, extent_item, 0);
+
+	if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
+		btrfs_set_file_extent_compression(leaf, extent_item, 1);
+	else
+		btrfs_set_file_extent_compression(leaf, extent_item, 0);
+	btrfs_set_file_extent_encryption(leaf, extent_item, 0);
+	btrfs_set_file_extent_other_encoding(leaf, extent_item, 0);
+
+	/* ram bytes = extent_num_bytes for now */
 	btrfs_set_file_extent_num_bytes(leaf, extent_item,
 					ordered_extent->len);
+	btrfs_set_file_extent_ram_bytes(leaf, extent_item,
+					ordered_extent->len);
 	btrfs_mark_buffer_dirty(leaf);
 
 	btrfs_drop_extent_cache(inode, ordered_extent->file_offset,
@@ -644,7 +1061,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 	mutex_unlock(&BTRFS_I(inode)->extent_mutex);
 
 	ins.objectid = ordered_extent->start;
-	ins.offset = ordered_extent->len;
+	ins.offset = ordered_extent->disk_len;
 	ins.type = BTRFS_EXTENT_ITEM_KEY;
 	ret = btrfs_alloc_reserved_extent(trans, root, leaf->start,
 					  root->root_key.objectid,
@@ -714,6 +1131,7 @@ int btrfs_io_failed_hook(struct bio *failed_bio,
 	int ret;
 	int rw;
 	u64 logical;
+	unsigned long bio_flags = 0;
 
 	ret = get_state_private(failure_tree, start, &private);
 	if (ret) {
@@ -738,6 +1156,8 @@ int btrfs_io_failed_hook(struct bio *failed_bio,
 		}
 		logical = start - em->start;
 		logical = em->block_start + logical;
+		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
+			bio_flags = EXTENT_BIO_COMPRESSED;
 		failrec->logical = logical;
 		free_extent_map(em);
 		set_extent_bits(failure_tree, start, end, EXTENT_LOCKED |
@@ -781,7 +1201,8 @@ int btrfs_io_failed_hook(struct bio *failed_bio,
 		rw = READ;
 
 	BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio,
-						      failrec->last_mirror);
+						      failrec->last_mirror,
+						      bio_flags);
 	return 0;
 }
 
@@ -1644,10 +2065,8 @@ search_again:
 				item_end +=
 				    btrfs_file_extent_num_bytes(leaf, fi);
 			} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
-				struct btrfs_item *item = btrfs_item_nr(leaf,
-							        path->slots[0]);
 				item_end += btrfs_file_extent_inline_len(leaf,
-									 item);
+									 fi);
 			}
 			item_end--;
 		}
@@ -1715,7 +2134,14 @@ search_again:
 				root_owner = btrfs_header_owner(leaf);
 			}
 		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
-			if (!del_item) {
+			/*
+			 * we can't truncate inline items that have had
+			 * special encodings
+			 */
+			if (!del_item &&
+			    btrfs_file_extent_compression(leaf, fi) == 0 &&
+			    btrfs_file_extent_encryption(leaf, fi) == 0 &&
+			    btrfs_file_extent_other_encoding(leaf, fi) == 0) {
 				u32 size = new_size - found_key.offset;
 
 				if (root->ref_cows) {
@@ -1926,7 +2352,8 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
 			err = btrfs_insert_file_extent(trans, root,
 						       inode->i_ino,
 						       hole_start, 0, 0,
-						       hole_size, 0);
+						       hole_size, 0, hole_size,
+						       0, 0, 0);
 			btrfs_drop_extent_cache(inode, hole_start,
 						(u64)-1, 0);
 			btrfs_check_file(root, inode);
@@ -2894,11 +3321,50 @@ static int merge_extent_mapping(struct extent_map_tree *em_tree,
 	start_diff = map_start - em->start;
 	em->start = map_start;
 	em->len = map_len;
-	if (em->block_start < EXTENT_MAP_LAST_BYTE)
+	if (em->block_start < EXTENT_MAP_LAST_BYTE &&
+	    !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
 		em->block_start += start_diff;
+		em->block_len -= start_diff;
+	}
 	return add_extent_mapping(em_tree, em);
 }
 
+static noinline int uncompress_inline(struct btrfs_path *path,
+				      struct inode *inode, struct page *page,
+				      size_t pg_offset, u64 extent_offset,
+				      struct btrfs_file_extent_item *item)
+{
+	int ret;
+	struct extent_buffer *leaf = path->nodes[0];
+	char *tmp;
+	size_t max_size;
+	unsigned long inline_size, ptr;
+
+	/* decompress the inline extent at 'item' into 'page'; 0 or -ENOMEM */
+	WARN_ON(pg_offset != 0);
+	max_size = btrfs_file_extent_ram_bytes(leaf, item);
+	inline_size = btrfs_file_extent_inline_item_len(leaf,
+					btrfs_item_nr(leaf, path->slots[0]));
+	tmp = kmalloc(inline_size, GFP_NOFS);
+	if (!tmp)
+		return -ENOMEM;
+	ptr = btrfs_file_extent_inline_start(item);
+	read_extent_buffer(leaf, tmp, ptr, inline_size);
+	max_size = min(PAGE_CACHE_SIZE, max_size);
+	ret = btrfs_zlib_decompress(tmp, page, extent_offset,
+				    inline_size, max_size);
+	if (ret) {	/* decompress failed: zero-fill, still return 0 */
+		char *kaddr = kmap_atomic(page, KM_USER0);
+		unsigned long copy_size = min_t(u64,
+				  PAGE_CACHE_SIZE - pg_offset,
+				  max_size - extent_offset);
+		memset(kaddr + pg_offset, 0, copy_size);
+		kunmap_atomic(kaddr, KM_USER0);
+	}
+	kfree(tmp);
+	return 0;
+}
+
 /*
  * a bit scary, this does extent mapping from logical file offset to the disk.
  * the ugly parts come from merging extents from the disk with the
@@ -2927,6 +3393,7 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 	struct btrfs_trans_handle *trans = NULL;
+	int compressed;
 
 again:
 	spin_lock(&em_tree->lock);
@@ -2951,6 +3418,7 @@ again:
 	em->bdev = root->fs_info->fs_devices->latest_bdev;
 	em->start = EXTENT_MAP_HOLE;
 	em->len = (u64)-1;
+	em->block_len = (u64)-1;
 
 	if (!path) {
 		path = btrfs_alloc_path();
@@ -2983,6 +3451,7 @@ again:
 
 	found_type = btrfs_file_extent_type(leaf, item);
 	extent_start = found_key.offset;
+	compressed = btrfs_file_extent_compression(leaf, item);
 	if (found_type == BTRFS_FILE_EXTENT_REG) {
 		extent_end = extent_start +
 		       btrfs_file_extent_num_bytes(leaf, item);
@@ -3005,10 +3474,18 @@ again:
 			em->block_start = EXTENT_MAP_HOLE;
 			goto insert;
 		}
-		bytenr += btrfs_file_extent_offset(leaf, item);
-		em->block_start = bytenr;
 		em->start = extent_start;
 		em->len = extent_end - extent_start;
+		if (compressed) {
+			set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+			em->block_start = bytenr;
+			em->block_len = btrfs_file_extent_disk_num_bytes(leaf,
+									 item);
+		} else {
+			bytenr += btrfs_file_extent_offset(leaf, item);
+			em->block_start = bytenr;
+			em->block_len = em->len;
+		}
 		goto insert;
 	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
 		u64 page_start;
@@ -3018,8 +3495,7 @@ again:
 		size_t extent_offset;
 		size_t copy_size;
 
-		size = btrfs_file_extent_inline_len(leaf, btrfs_item_nr(leaf,
-						    path->slots[0]));
+		size = btrfs_file_extent_inline_len(leaf, item);
 		extent_end = (extent_start + size + root->sectorsize - 1) &
 			~((u64)root->sectorsize - 1);
 		if (start < extent_start || start >= extent_end) {
@@ -3035,9 +3511,10 @@ again:
 		}
 		em->block_start = EXTENT_MAP_INLINE;
 
-		if (!page) {
+		if (!page || create) {
 			em->start = extent_start;
-			em->len = size;
+			em->len = (size + root->sectorsize - 1) &
+			~((u64)root->sectorsize - 1);
 			goto out;
 		}
 
@@ -3048,11 +3525,22 @@ again:
 		em->start = extent_start + extent_offset;
 		em->len = (copy_size + root->sectorsize - 1) &
 			~((u64)root->sectorsize - 1);
-		map = kmap(page);
+		if (compressed)
+			set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
 		ptr = btrfs_file_extent_inline_start(item) + extent_offset;
 		if (create == 0 && !PageUptodate(page)) {
-			read_extent_buffer(leaf, map + pg_offset, ptr,
-					   copy_size);
+			if (btrfs_file_extent_compression(leaf, item) ==
+			    BTRFS_COMPRESS_ZLIB) {
+				ret = uncompress_inline(path, inode, page,
+							pg_offset,
+							extent_offset, item);
+				BUG_ON(ret);
+			} else {
+				map = kmap(page);
+				read_extent_buffer(leaf, map + pg_offset, ptr,
+						   copy_size);
+				kunmap(page);
+			}
 			flush_dcache_page(page);
 		} else if (create && PageUptodate(page)) {
 			if (!trans) {
@@ -3063,11 +3551,12 @@ again:
 				trans = btrfs_join_transaction(root, 1);
 				goto again;
 			}
+			map = kmap(page);
 			write_extent_buffer(leaf, map + pg_offset, ptr,
 					    copy_size);
+			kunmap(page);
 			btrfs_mark_buffer_dirty(leaf);
 		}
-		kunmap(page);
 		set_extent_uptodate(io_tree, em->start,
 				    extent_map_end(em) - 1, GFP_NOFS);
 		goto insert;
@@ -3779,6 +4268,11 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 	btrfs_set_file_extent_generation(leaf, ei, trans->transid);
 	btrfs_set_file_extent_type(leaf, ei,
 				   BTRFS_FILE_EXTENT_INLINE);
+	btrfs_set_file_extent_encryption(leaf, ei, 0);
+	btrfs_set_file_extent_compression(leaf, ei, 0);
+	btrfs_set_file_extent_other_encoding(leaf, ei, 0);
+	btrfs_set_file_extent_ram_bytes(leaf, ei, name_len);
+
 	ptr = btrfs_file_extent_inline_start(ei);
 	write_extent_buffer(leaf, symname, ptr, name_len);
 	btrfs_mark_buffer_dirty(leaf);
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 2eb6caba57c2..b5745bb96d40 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -165,7 +165,8 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
  * inserted.
  */
 int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
-			     u64 start, u64 len, int nocow)
+			     u64 start, u64 len, u64 disk_len, int nocow,
+			     int compressed)
 {
 	struct btrfs_ordered_inode_tree *tree;
 	struct rb_node *node;
@@ -180,9 +181,12 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
 	entry->file_offset = file_offset;
 	entry->start = start;
 	entry->len = len;
+	entry->disk_len = disk_len;
 	entry->inode = inode;
 	if (nocow)
 		set_bit(BTRFS_ORDERED_NOCOW, &entry->flags);
+	if (compressed)
+		set_bit(BTRFS_ORDERED_COMPRESSED, &entry->flags);
 
 	/* one ref for the tree */
 	atomic_set(&entry->refs, 1);
@@ -389,9 +393,10 @@ void btrfs_start_ordered_extent(struct inode *inode,
 	 * for pdflush to find them
 	 */
 	btrfs_fdatawrite_range(inode->i_mapping, start, end, WB_SYNC_NONE);
-	if (wait)
+	if (wait) {
 		wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
 						 &entry->flags));
+	}
 }
 
 /*
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index f50f8870a144..1ef464145d22 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -66,6 +66,8 @@ struct btrfs_ordered_sum {
 
 #define BTRFS_ORDERED_NOCOW 2 /* set when we want to write in place */
 
+#define BTRFS_ORDERED_COMPRESSED 3 /* writing a compressed extent */
+
 struct btrfs_ordered_extent {
 	/* logical offset in the file */
 	u64 file_offset;
@@ -73,9 +75,12 @@ struct btrfs_ordered_extent {
 	/* disk byte number */
 	u64 start;
 
-	/* length of the extent in bytes */
+	/* ram length of the extent in bytes */
 	u64 len;
 
+	/* extent length on disk */
+	u64 disk_len;
+
 	/* flags (described above) */
 	unsigned long flags;
 
@@ -127,7 +132,8 @@ int btrfs_remove_ordered_extent(struct inode *inode,
 int btrfs_dec_test_ordered_pending(struct inode *inode,
 				       u64 file_offset, u64 io_size);
 int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
-			     u64 start, u64 len, int nocow);
+			     u64 start, u64 len, u64 disk_len, int nocow,
+			     int compressed);
 int btrfs_add_ordered_sum(struct inode *inode,
 			  struct btrfs_ordered_extent *entry,
 			  struct btrfs_ordered_sum *sum);
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index bd9ab3e9a7f2..64725c13aa11 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -115,15 +115,16 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
 			if (btrfs_file_extent_type(l, fi) ==
 			    BTRFS_FILE_EXTENT_INLINE) {
 				printk("\t\tinline extent data size %u\n",
-			           btrfs_file_extent_inline_len(l, item));
+			           btrfs_file_extent_inline_len(l, fi));
 				break;
 			}
 			printk("\t\textent data disk bytenr %llu nr %llu\n",
 			       (unsigned long long)btrfs_file_extent_disk_bytenr(l, fi),
 			       (unsigned long long)btrfs_file_extent_disk_num_bytes(l, fi));
-			printk("\t\textent data offset %llu nr %llu\n",
+			printk("\t\textent data offset %llu nr %llu ram %llu\n",
 			  (unsigned long long)btrfs_file_extent_offset(l, fi),
-			  (unsigned long long)btrfs_file_extent_num_bytes(l, fi));
+			  (unsigned long long)btrfs_file_extent_num_bytes(l, fi),
+			  (unsigned long long)btrfs_file_extent_ram_bytes(l, fi));
 			break;
 		case BTRFS_BLOCK_GROUP_ITEM_KEY:
 			bi = btrfs_item_ptr(l, i,
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 2e6039825b7b..431fdf144b58 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -47,6 +47,7 @@
 #include "volumes.h"
 #include "version.h"
 #include "export.h"
+#include "compression.h"
 
 #define BTRFS_SUPER_MAGIC 0x9123683E
 
@@ -69,7 +70,7 @@ static void btrfs_put_super (struct super_block * sb)
 enum {
 	Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow,
 	Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier,
-	Opt_ssd, Opt_thread_pool, Opt_noacl,  Opt_err,
+	Opt_ssd, Opt_thread_pool, Opt_noacl,  Opt_compress, Opt_err,
 };
 
 static match_table_t tokens = {
@@ -83,6 +84,7 @@ static match_table_t tokens = {
 	{Opt_max_inline, "max_inline=%s"},
 	{Opt_alloc_start, "alloc_start=%s"},
 	{Opt_thread_pool, "thread_pool=%d"},
+	{Opt_compress, "compress"},
 	{Opt_ssd, "ssd"},
 	{Opt_noacl, "noacl"},
 	{Opt_err, NULL},
@@ -163,6 +165,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 			btrfs_set_opt(info->mount_opt, NODATACOW);
 			btrfs_set_opt(info->mount_opt, NODATASUM);
 			break;
+		case Opt_compress:
+			printk(KERN_INFO "btrfs: use compression\n");
+			btrfs_set_opt(info->mount_opt, COMPRESS);
+			break;
 		case Opt_ssd:
 			printk(KERN_INFO "btrfs: use ssd allocation scheme\n");
 			btrfs_set_opt(info->mount_opt, SSD);
@@ -622,6 +628,7 @@ static int __init init_btrfs_fs(void)
 	err = btrfs_interface_init();
 	if (err)
 		goto free_extent_map;
+
 	err = register_filesystem(&btrfs_fs_type);
 	if (err)
 		goto unregister_ioctl;
@@ -651,6 +658,7 @@ static void __exit exit_btrfs_fs(void)
 	unregister_filesystem(&btrfs_fs_type);
 	btrfs_exit_sysfs();
 	btrfs_cleanup_fs_uuids();
+	btrfs_zlib_exit();
 }
 
 module_init(init_btrfs_fs)
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index cf618cc8b34a..e6d579053a47 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -540,8 +540,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
 	if (found_type == BTRFS_FILE_EXTENT_REG)
 		extent_end = start + btrfs_file_extent_num_bytes(eb, item);
 	else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
-		size = btrfs_file_extent_inline_len(eb,
-						    btrfs_item_nr(eb, slot));
+		size = btrfs_file_extent_inline_len(eb, item);
 		extent_end = (start + size + mask) & ~mask;
 	} else {
 		ret = 0;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 2eed7f91f51a..7db4cfd03a98 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1816,6 +1816,7 @@ again:
 	em->start = key.offset;
 	em->len = *num_bytes;
 	em->block_start = 0;
+	em->block_len = em->len;
 
 	if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
 		ret = btrfs_add_system_chunk(trans, chunk_root, &key,
@@ -2323,6 +2324,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
 	em->start = logical;
 	em->len = length;
 	em->block_start = 0;
+	em->block_len = em->len;
 
 	map->num_stripes = num_stripes;
 	map->io_width = btrfs_chunk_io_width(leaf, chunk);
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
new file mode 100644
index 000000000000..e99309180a11
--- /dev/null
+++ b/fs/btrfs/zlib.c
@@ -0,0 +1,637 @@
+/*
+ * Copyright (C) 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ *
+ * Based on jffs2 zlib code:
+ * Copyright © 2001-2007 Red Hat, Inc.
+ * Created by David Woodhouse <dwmw2@infradead.org>
+ */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/zlib.h>
+#include <linux/zutil.h>
+#include <linux/vmalloc.h>
+#include <linux/init.h>
+#include <linux/err.h>
+#include <linux/sched.h>
+#include <linux/pagemap.h>
+#include <linux/bio.h>
+
+/* Plan: call deflate() with avail_in == *sourcelen,
+	avail_out = *dstlen - 12 and flush == Z_FINISH.
+	If it doesn't manage to finish,	call it again with
+	avail_in == 0 and avail_out set to the remaining 12
+	bytes for it to clean up.
+   Q: Is 12 bytes sufficient?
+*/
+#define STREAM_END_SPACE 12
+
+struct workspace {
+	z_stream inf_strm;		/* zlib inflate (decompress) stream */
+	z_stream def_strm;		/* zlib deflate (compress) stream */
+	char *buf;			/* PAGE_CACHE_SIZE buffer for inflate output */
+	struct list_head list;		/* entry on idle_workspace */
+};
+
+static LIST_HEAD(idle_workspace);	/* workspaces not currently in use */
+static DEFINE_SPINLOCK(workspace_lock);	/* held around idle list add/remove */
+static unsigned long num_workspace;	/* count of entries on idle_workspace */
+static atomic_t alloc_workspace = ATOMIC_INIT(0); /* workspaces allocated */
+static DECLARE_WAIT_QUEUE_HEAD(workspace_wait);	/* waiters for a workspace */
+
+/*
+ * this returns an idle zlib workspace or allocates a new one, waiting while
+ * more than num_online_cpus() are allocated.  Failure is an ERR_PTR, not NULL.
+ */
+static struct workspace *find_zlib_workspace(void)
+{
+	struct workspace *workspace;
+	int ret;
+	int cpus = num_online_cpus();
+
+again:
+	spin_lock(&workspace_lock);
+	if (!list_empty(&idle_workspace)) {
+		workspace = list_entry(idle_workspace.next, struct workspace,
+				       list);
+		list_del(&workspace->list);
+		num_workspace--;
+		spin_unlock(&workspace_lock);
+		return workspace;
+
+	}
+	spin_unlock(&workspace_lock);
+	if (atomic_read(&alloc_workspace) > cpus) {
+		DEFINE_WAIT(wait);
+		prepare_to_wait(&workspace_wait, &wait, TASK_UNINTERRUPTIBLE);
+		if (atomic_read(&alloc_workspace) > cpus)
+			schedule();
+		finish_wait(&workspace_wait, &wait);
+		goto again;
+	}
+	atomic_inc(&alloc_workspace);	/* undone at the fail label */
+	workspace = kzalloc(sizeof(*workspace), GFP_NOFS);
+	if (!workspace) {
+		ret = -ENOMEM;
+		goto fail;
+	}
+
+	workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize());
+	if (!workspace->def_strm.workspace) {
+		ret = -ENOMEM;
+		goto fail;
+	}
+	workspace->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize());
+	if (!workspace->inf_strm.workspace) {
+		ret = -ENOMEM;
+		goto fail_inflate;
+	}
+	workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS);
+	if (!workspace->buf) {
+		ret = -ENOMEM;
+		goto fail_kmalloc;
+	}
+	return workspace;
+
+fail_kmalloc:
+	vfree(workspace->inf_strm.workspace);
+fail_inflate:
+	vfree(workspace->def_strm.workspace);
+fail:
+	kfree(workspace);
+	atomic_dec(&alloc_workspace);
+	wake_up(&workspace_wait);
+	return ERR_PTR(ret);
+}
+
+/*
+ * put a workspace back on the idle list, or free it if the list already holds
+ * num_online_cpus() entries.  Waiters are woken either way; always returns 0.
+ */
+static int free_workspace(struct workspace *workspace)
+{
+	spin_lock(&workspace_lock);
+	if (num_workspace < num_online_cpus()) {
+		list_add_tail(&workspace->list, &idle_workspace);
+		num_workspace++;
+		spin_unlock(&workspace_lock);
+		if (waitqueue_active(&workspace_wait))
+			wake_up(&workspace_wait);
+		return 0;
+	}
+	spin_unlock(&workspace_lock);
+	vfree(workspace->def_strm.workspace);
+	vfree(workspace->inf_strm.workspace);
+	kfree(workspace->buf);
+	kfree(workspace);
+
+	atomic_dec(&alloc_workspace);
+	if (waitqueue_active(&workspace_wait))
+		wake_up(&workspace_wait);
+	return 0;
+}
+
+/*
+ * cleanup function for module exit: frees every workspace on the idle list
+ */
+static void free_workspaces(void)
+{
+	struct workspace *workspace;
+	while(!list_empty(&idle_workspace)) {
+		workspace = list_entry(idle_workspace.next, struct workspace,
+				       list);
+		list_del(&workspace->list);
+		vfree(workspace->def_strm.workspace);
+		vfree(workspace->inf_strm.workspace);
+		kfree(workspace->buf);
+		kfree(workspace);
+		atomic_dec(&alloc_workspace);
+	}
+}
+
+/*
+ * given an address space and start/len, compress the bytes.
+ * returns 0 on success, -1 on failure or if the data did not get smaller.
+ *
+ * pages are allocated to hold the compressed result and stored
+ * in 'pages'
+ *
+ * out_pages is used to return the number of pages allocated.  There
+ * may be pages allocated even if we return an error
+ *
+ * total_in is used to return the number of bytes actually read.  It
+ * may be smaller than len if we had to exit early because we
+ * ran out of room in the pages array or because we cross the
+ * max_out threshold.
+ *
+ * total_out is used to return the total number of compressed bytes
+ * max_out tells us the max number of bytes that we're allowed to
+ * stuff into pages
+ */
+int btrfs_zlib_compress_pages(struct address_space *mapping,
+			      u64 start, unsigned long len,
+			      struct page **pages,
+			      unsigned long nr_dest_pages,
+			      unsigned long *out_pages,
+			      unsigned long *total_in,
+			      unsigned long *total_out,
+			      unsigned long max_out)
+{
+	int ret;
+	struct workspace *workspace;
+	char *data_in;
+	char *cpage_out;
+	int nr_pages = 0;
+	struct page *in_page = NULL;
+	struct page *out_page = NULL;
+	int out_written = 0;
+	int in_read = 0;
+	unsigned long bytes_left;
+
+	*out_pages = 0;
+	*total_out = 0;
+	*total_in = 0;
+
+	workspace = find_zlib_workspace();
+	if (IS_ERR(workspace))	/* failure comes back as an ERR_PTR */
+		return -1;
+
+	if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) {
+		printk(KERN_WARNING "deflateInit failed\n");
+		ret = -1;
+		goto out;
+	}
+
+	workspace->def_strm.total_in = 0;
+	workspace->def_strm.total_out = 0;
+
+	in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
+	data_in = kmap(in_page);
+
+	out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+	cpage_out = kmap(out_page);
+	pages[0] = out_page;
+	nr_pages = 1;
+
+	workspace->def_strm.next_in = data_in;
+	workspace->def_strm.next_out = cpage_out;
+	workspace->def_strm.avail_out = PAGE_CACHE_SIZE;
+	workspace->def_strm.avail_in = min(len, PAGE_CACHE_SIZE);
+
+	out_written = 0;
+	in_read = 0;
+
+	while (workspace->def_strm.total_in < len) {
+		ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH);
+		if (ret != Z_OK) {
+			printk(KERN_DEBUG "btrfs deflate in loop returned %d\n",
+			       ret);
+			zlib_deflateEnd(&workspace->def_strm);
+			ret = -1;
+			goto out;
+		}
+
+		/* we're making it bigger, give up */
+		if (workspace->def_strm.total_in > 8192 &&
+		    workspace->def_strm.total_in <
+		    workspace->def_strm.total_out) {
+			ret = -1;
+			goto out;
+		}
+		/* we need another page for writing out.  Test this
+		 * before the total_in so we will pull in a new page for
+		 * the stream end if required
+		 */
+		if (workspace->def_strm.avail_out == 0) {
+			kunmap(out_page);
+			if (nr_pages == nr_dest_pages) {
+				out_page = NULL;
+				ret = -1;
+				goto out;
+			}
+			out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+			cpage_out = kmap(out_page);
+			pages[nr_pages] = out_page;
+			nr_pages++;
+			workspace->def_strm.avail_out = PAGE_CACHE_SIZE;
+			workspace->def_strm.next_out = cpage_out;
+		}
+		/* we're all done */
+		if (workspace->def_strm.total_in >= len)
+			break;
+
+		/* we've read in a full page, get a new one */
+		if (workspace->def_strm.avail_in == 0) {
+			if (workspace->def_strm.total_out > max_out)
+				break;
+
+			bytes_left = len - workspace->def_strm.total_in;
+			kunmap(in_page);
+			page_cache_release(in_page);
+
+			start += PAGE_CACHE_SIZE;
+			in_page = find_get_page(mapping,
+						start >> PAGE_CACHE_SHIFT);
+			data_in = kmap(in_page);
+			workspace->def_strm.avail_in = min(bytes_left,
+							   PAGE_CACHE_SIZE);
+			workspace->def_strm.next_in = data_in;
+		}
+	}
+	workspace->def_strm.avail_in = 0;
+	ret = zlib_deflate(&workspace->def_strm, Z_FINISH);
+	zlib_deflateEnd(&workspace->def_strm);
+
+	if (ret != Z_STREAM_END) {
+		ret = -1;
+		goto out;
+	}
+
+	if (workspace->def_strm.total_out >= workspace->def_strm.total_in) {
+		ret = -1;
+		goto out;
+	}
+
+	ret = 0;
+	*total_out = workspace->def_strm.total_out;
+	*total_in = workspace->def_strm.total_in;
+out:
+	*out_pages = nr_pages;
+	if (out_page)
+		kunmap(out_page);
+
+	if (in_page) {
+		kunmap(in_page);
+		page_cache_release(in_page);
+	}
+	free_workspace(workspace);
+	return ret;
+}
+
+/*
+ * Inflate a multi-page zlib stream into the pages of a read bio.
+ *
+ * pages_in is an array of pages with compressed data.
+ *
+ * disk_start is the starting logical offset of this array in the file
+ *
+ * bvec is a bio_vec of pages from the file that we want to decompress into
+ *
+ * vcnt is the count of pages in the biovec
+ *
+ * srclen is the number of bytes in pages_in
+ *
+ * The basic idea is that we have a bio that was created by readpages.
+ * The pages in the bio are for the uncompressed data, and they may not
+ * be contiguous.  They all correspond to the range of bytes covered by
+ * the compressed extent.
+ *
+ * Returns 0 once every bvec page is filled or the stream ends cleanly,
+ * -ENOMEM if no workspace is available, and -1 on any other failure.
+ */
+int btrfs_zlib_decompress_biovec(struct page **pages_in,
+			      u64 disk_start,
+			      struct bio_vec *bvec,
+			      int vcnt,
+			      size_t srclen)
+{
+	int ret = 0;
+	int wbits = MAX_WBITS;
+	struct workspace *workspace;
+	char *data_in;
+	size_t total_out = 0;
+	unsigned long page_bytes_left;
+	unsigned long page_in_index = 0;
+	unsigned long page_out_index = 0;
+	struct page *page_out;
+	unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) /
+					PAGE_CACHE_SIZE;
+	unsigned long buf_start;
+	unsigned long buf_offset;
+	unsigned long bytes;
+	unsigned long working_bytes;
+	unsigned long pg_offset;
+	unsigned long start_byte;
+	unsigned long current_buf_start;
+	char *kaddr;
+
+	workspace = find_zlib_workspace();
+	if (!workspace)
+		return -ENOMEM;
+
+	data_in = kmap(pages_in[page_in_index]);
+	workspace->inf_strm.next_in = data_in;
+	workspace->inf_strm.avail_in = min(srclen, PAGE_CACHE_SIZE);
+	workspace->inf_strm.total_in = 0;
+
+	workspace->inf_strm.total_out = 0;
+	workspace->inf_strm.next_out = workspace->buf;
+	workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
+	page_out = bvec[page_out_index].bv_page;
+	page_bytes_left = PAGE_CACHE_SIZE;
+	pg_offset = 0;
+
+	/* If it's deflate, and it's got no preset dictionary, then
+	   we can tell zlib to skip the adler32 check. */
+	if (srclen > 2 && !(data_in[1] & PRESET_DICT) &&
+	    ((data_in[0] & 0x0f) == Z_DEFLATED) &&
+	    !(((data_in[0]<<8) + data_in[1]) % 31)) {
+
+		wbits = -((data_in[0] >> 4) + 8);
+		workspace->inf_strm.next_in += 2;
+		workspace->inf_strm.avail_in -= 2;
+	}
+
+	if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
+		printk(KERN_WARNING "inflateInit failed\n");
+		ret = -1;
+		/* the first input page is still kmapped, release it */
+		goto unmap_in;
+	}
+	while (workspace->inf_strm.total_in < srclen) {
+		ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH);
+		if (ret != Z_OK && ret != Z_STREAM_END)
+			break;
+
+		/*
+		 * buf_start is the offset in the decompressed stream of
+		 * the first byte held in our workspace buffer
+		 */
+		buf_start = total_out;
+
+		/* total_out is the stream offset just past the buffer */
+		total_out = workspace->inf_strm.total_out;
+
+		working_bytes = total_out - buf_start;
+
+		/*
+		 * start byte is the first byte of the page we're currently
+		 * copying into relative to the start of the compressed data.
+		 */
+		start_byte = page_offset(page_out) - disk_start;
+
+		if (working_bytes == 0) {
+			/* we didn't make progress in this inflate
+			 * call, we're done
+			 */
+			if (ret != Z_STREAM_END)
+				ret = -1;
+			break;
+		}
+
+		/* we haven't yet hit data corresponding to this page */
+		if (total_out <= start_byte)
+			goto next;
+
+		/*
+		 * the start of the data we care about is offset into
+		 * the middle of our working buffer
+		 */
+		if (total_out > start_byte && buf_start < start_byte) {
+			buf_offset = start_byte - buf_start;
+			working_bytes -= buf_offset;
+		} else {
+			buf_offset = 0;
+		}
+		current_buf_start = buf_start;
+
+		/* copy bytes from the working buffer into the pages */
+		while (working_bytes > 0) {
+			bytes = min(PAGE_CACHE_SIZE - pg_offset,
+				    PAGE_CACHE_SIZE - buf_offset);
+			bytes = min(bytes, working_bytes);
+			kaddr = kmap_atomic(page_out, KM_USER0);
+			memcpy(kaddr + pg_offset, workspace->buf + buf_offset,
+			       bytes);
+			kunmap_atomic(kaddr, KM_USER0);
+			flush_dcache_page(page_out);
+
+			pg_offset += bytes;
+			page_bytes_left -= bytes;
+			buf_offset += bytes;
+			working_bytes -= bytes;
+			current_buf_start += bytes;
+
+			/* check if we need to pick another page */
+			if (page_bytes_left == 0) {
+				page_out_index++;
+				if (page_out_index >= vcnt) {
+					/* every page in the bvec is filled */
+					ret = 0;
+					goto done;
+				}
+				page_out = bvec[page_out_index].bv_page;
+				pg_offset = 0;
+				page_bytes_left = PAGE_CACHE_SIZE;
+				start_byte = page_offset(page_out) - disk_start;
+
+				/*
+				 * make sure our new page is covered by this
+				 * working buffer
+				 */
+				if (total_out <= start_byte)
+					goto next;
+
+				/* the next page in the biovec might not
+				 * be adjacent to the last page, but it
+				 * might still be found inside this working
+				 * buffer.  bump our offset pointer
+				 */
+				if (total_out > start_byte &&
+				    current_buf_start < start_byte) {
+					buf_offset = start_byte - buf_start;
+					working_bytes = total_out - start_byte;
+					current_buf_start = buf_start +
+						buf_offset;
+				}
+			}
+		}
+next:
+		workspace->inf_strm.next_out = workspace->buf;
+		workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
+
+		if (workspace->inf_strm.avail_in == 0) {
+			unsigned long tmp;
+			kunmap(pages_in[page_in_index]);
+			page_in_index++;
+			if (page_in_index >= total_pages_in) {
+				data_in = NULL;
+				break;
+			}
+			data_in = kmap(pages_in[page_in_index]);
+			workspace->inf_strm.next_in = data_in;
+			tmp = srclen - workspace->inf_strm.total_in;
+			workspace->inf_strm.avail_in = min(tmp,
+							   PAGE_CACHE_SIZE);
+		}
+	}
+	ret = (ret == Z_STREAM_END) ? 0 : -1;
+done:
+	zlib_inflateEnd(&workspace->inf_strm);
+unmap_in:
+	if (data_in)
+		kunmap(pages_in[page_in_index]);
+	free_workspace(workspace);
+	return ret;
+}
+
+/*
+ * a less complex decompression routine.  Our compressed data fits in a
+ * single page, and we want to read a single page out of it.
+ *
+ * start_byte is the offset into the decompressed stream of the first byte
+ * we want; destlen (at most PAGE_CACHE_SIZE) bytes from there are copied
+ * to the start of dest_page.
+ *
+ * Returns 0 on success, -ENOMEM if destlen is too large or no workspace is
+ * available, and -1 if inflate fails before destlen bytes are copied.
+ */
+int btrfs_zlib_decompress(unsigned char *data_in,
+			  struct page *dest_page,
+			  unsigned long start_byte,
+			  size_t srclen, size_t destlen)
+{
+	int ret = 0;
+	int wbits = MAX_WBITS;
+	struct workspace *workspace;
+	unsigned long bytes_left = destlen;
+	unsigned long total_out = 0;
+	/* write position in dest_page, carried across inflate calls */
+	unsigned long pg_offset = 0;
+	char *kaddr;
+
+	if (destlen > PAGE_CACHE_SIZE)
+		return -ENOMEM;
+
+	workspace = find_zlib_workspace();
+	if (!workspace)
+		return -ENOMEM;
+
+	workspace->inf_strm.next_in = data_in;
+	workspace->inf_strm.avail_in = srclen;
+	workspace->inf_strm.total_in = 0;
+
+	workspace->inf_strm.next_out = workspace->buf;
+	workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
+	workspace->inf_strm.total_out = 0;
+	/* If it's deflate, and it's got no preset dictionary, then
+	   we can tell zlib to skip the adler32 check. */
+	if (srclen > 2 && !(data_in[1] & PRESET_DICT) &&
+	    ((data_in[0] & 0x0f) == Z_DEFLATED) &&
+	    !(((data_in[0]<<8) + data_in[1]) % 31)) {
+
+		wbits = -((data_in[0] >> 4) + 8);
+		workspace->inf_strm.next_in += 2;
+		workspace->inf_strm.avail_in -= 2;
+	}
+
+	if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
+		printk(KERN_WARNING "inflateInit failed\n");
+		ret = -1;
+		goto out;
+	}
+
+	while (bytes_left > 0) {
+		unsigned long buf_start;
+		unsigned long buf_offset;
+		unsigned long bytes;
+
+		ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH);
+		if (ret != Z_OK && ret != Z_STREAM_END)
+			break;
+
+		buf_start = total_out;
+		total_out = workspace->inf_strm.total_out;
+
+		if (total_out == buf_start) {
+			ret = -1;
+			break;
+		}
+
+		if (total_out <= start_byte)
+			goto next;
+
+		if (total_out > start_byte && buf_start < start_byte)
+			buf_offset = start_byte - buf_start;
+		else
+			buf_offset = 0;
+
+		bytes = min(PAGE_CACHE_SIZE - pg_offset,
+			    PAGE_CACHE_SIZE - buf_offset);
+		bytes = min(bytes, bytes_left);
+
+		kaddr = kmap_atomic(dest_page, KM_USER0);
+		memcpy(kaddr + pg_offset, workspace->buf + buf_offset, bytes);
+		kunmap_atomic(kaddr, KM_USER0);
+
+		pg_offset += bytes;
+		bytes_left -= bytes;
+next:
+		workspace->inf_strm.next_out = workspace->buf;
+		workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
+	}
+	ret = (ret != Z_STREAM_END && bytes_left != 0) ? -1 : 0;
+	zlib_inflateEnd(&workspace->inf_strm);
+out:
+	free_workspace(workspace);
+	return ret;
+}
+
+void btrfs_zlib_exit(void)
+{
+	free_workspaces();	/* the only cleanup done here */
+}
-- 
cgit v1.2.3


From 2517920135b0d29e70453e5b03d70d7b94207df3 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@redhat.com>
Date: Wed, 29 Oct 2008 14:49:05 -0400
Subject: Btrfs: nuke fs wide allocation mutex V2

This patch removes the giant fs_info->alloc_mutex and replaces it with a bunch
of little locks.

There is now a pinned_mutex, which is used when messing with the pinned_extents
extent io tree, and the extent_ins_mutex which is used with the pending_del and
extent_ins extent io trees.

The locking for the extent tree stuff was inspired by a patch that Yan Zheng
wrote to fix a race condition, I cleaned it up some and changed the locking
around a little bit, but the idea remains the same.  Basically instead of
holding the extent_ins_mutex throughout the processing of an extent on the
extent_ins or pending_del trees, we just hold it while we're searching and when
we clear the bits on those trees, and lock the extent for the duration of the
operations on the extent.

Also to keep from getting hung up waiting to lock an extent, I've added a
try_lock_extent so if we cannot lock the extent, move on to the next one in the
tree and we'll come back to that one.  I have tested this heavily and it does
not appear to break anything.  This has to be applied on top of my
find_free_extent redo patch.

I tested this patch on top of Yan's space rebalancing code and it worked fine.
The only thing that has changed since the last version is I pulled out all my
debugging stuff, apparently I forgot to run guilt refresh before I sent the
last patch out.  Thank you,

Signed-off-by: Josef Bacik <jbacik@redhat.com>
---
 fs/btrfs/ctree.c            |   3 +-
 fs/btrfs/ctree.h            |   8 +-
 fs/btrfs/disk-io.c          |   3 +-
 fs/btrfs/extent-tree.c      | 333 +++++++++++++++++++++-----------------------
 fs/btrfs/extent_io.c        |  14 ++
 fs/btrfs/extent_io.h        |   2 +
 fs/btrfs/free-space-cache.c |  92 ++++++++----
 fs/btrfs/transaction.c      |   2 -
 fs/btrfs/tree-defrag.c      |   3 -
 fs/btrfs/tree-log.c         |   4 +-
 fs/btrfs/volumes.c          |   2 -
 11 files changed, 250 insertions(+), 216 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 73899d0f9d8f..f82f8db02275 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1387,8 +1387,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
 	lowest_level = p->lowest_level;
 	WARN_ON(lowest_level && ins_len > 0);
 	WARN_ON(p->nodes[0] != NULL);
-	WARN_ON(cow && root == root->fs_info->extent_root &&
-		!mutex_is_locked(&root->fs_info->alloc_mutex));
+
 	if (ins_len < 0)
 		lowest_unlock = 2;
 
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index caa860a1c3e5..fdba4f1b634e 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -558,6 +558,7 @@ struct btrfs_block_group_cache {
 	struct btrfs_key key;
 	struct btrfs_block_group_item item;
 	spinlock_t lock;
+	struct mutex alloc_mutex;
 	u64 pinned;
 	u64 reserved;
 	u64 flags;
@@ -635,7 +636,8 @@ struct btrfs_fs_info {
 	struct mutex tree_log_mutex;
 	struct mutex transaction_kthread_mutex;
 	struct mutex cleaner_mutex;
-	struct mutex alloc_mutex;
+	struct mutex extent_ins_mutex;
+	struct mutex pinned_mutex;
 	struct mutex chunk_mutex;
 	struct mutex drop_mutex;
 	struct mutex volume_mutex;
@@ -1941,8 +1943,12 @@ int btrfs_acl_chmod(struct inode *inode);
 /* free-space-cache.c */
 int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
 			 u64 bytenr, u64 size);
+int btrfs_add_free_space_lock(struct btrfs_block_group_cache *block_group,
+			      u64 offset, u64 bytes);
 int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
 			    u64 bytenr, u64 size);
+int btrfs_remove_free_space_lock(struct btrfs_block_group_cache *block_group,
+				 u64 offset, u64 bytes);
 void btrfs_remove_free_space_cache(struct btrfs_block_group_cache
 				   *block_group);
 struct btrfs_free_space *btrfs_find_free_space(struct btrfs_block_group_cache
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 796256440dfa..d1137d7ea8d4 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1460,7 +1460,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	mutex_init(&fs_info->trans_mutex);
 	mutex_init(&fs_info->tree_log_mutex);
 	mutex_init(&fs_info->drop_mutex);
-	mutex_init(&fs_info->alloc_mutex);
+	mutex_init(&fs_info->extent_ins_mutex);
+	mutex_init(&fs_info->pinned_mutex);
 	mutex_init(&fs_info->chunk_mutex);
 	mutex_init(&fs_info->transaction_kthread_mutex);
 	mutex_init(&fs_info->cleaner_mutex);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index e3b3e13a4817..564260872c7e 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -53,24 +53,6 @@ __btrfs_find_block_group(struct btrfs_root *root,
 			 struct btrfs_block_group_cache *hint,
 			 u64 search_start, int data, int owner);
 
-void maybe_lock_mutex(struct btrfs_root *root)
-{
-	if (root != root->fs_info->extent_root &&
-	    root != root->fs_info->chunk_root &&
-	    root != root->fs_info->dev_root) {
-		mutex_lock(&root->fs_info->alloc_mutex);
-	}
-}
-
-void maybe_unlock_mutex(struct btrfs_root *root)
-{
-	if (root != root->fs_info->extent_root &&
-	    root != root->fs_info->chunk_root &&
-	    root != root->fs_info->dev_root) {
-		mutex_unlock(&root->fs_info->alloc_mutex);
-	}
-}
-
 static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
 {
 	return (cache->flags & bits) == bits;
@@ -164,6 +146,7 @@ static int add_new_free_space(struct btrfs_block_group_cache *block_group,
 	u64 extent_start, extent_end, size;
 	int ret;
 
+	mutex_lock(&info->pinned_mutex);
 	while (start < end) {
 		ret = find_first_extent_bit(&info->pinned_extents, start,
 					    &extent_start, &extent_end,
@@ -175,7 +158,8 @@ static int add_new_free_space(struct btrfs_block_group_cache *block_group,
 			start = extent_end + 1;
 		} else if (extent_start > start && extent_start < end) {
 			size = extent_start - start;
-			ret = btrfs_add_free_space(block_group, start, size);
+			ret = btrfs_add_free_space_lock(block_group, start,
+							size);
 			BUG_ON(ret);
 			start = extent_end + 1;
 		} else {
@@ -185,9 +169,10 @@ static int add_new_free_space(struct btrfs_block_group_cache *block_group,
 
 	if (start < end) {
 		size = end - start;
-		ret = btrfs_add_free_space(block_group, start, size);
+		ret = btrfs_add_free_space_lock(block_group, start, size);
 		BUG_ON(ret);
 	}
+	mutex_unlock(&info->pinned_mutex);
 
 	return 0;
 }
@@ -445,13 +430,11 @@ int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
 
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
-	maybe_lock_mutex(root);
 	key.objectid = start;
 	key.offset = len;
 	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
 	ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
 				0, 0);
-	maybe_unlock_mutex(root);
 	btrfs_free_path(path);
 	return ret;
 }
@@ -676,8 +659,9 @@ static int __btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
 
 		BUG_ON(owner_objectid >= BTRFS_MAX_LEVEL);
 		num_bytes = btrfs_level_size(root, (int)owner_objectid);
+		mutex_lock(&root->fs_info->extent_ins_mutex);
 		if (test_range_bit(&root->fs_info->extent_ins, bytenr,
-				bytenr + num_bytes - 1, EXTENT_LOCKED, 0)) {
+				bytenr + num_bytes - 1, EXTENT_WRITEBACK, 0)) {
 			u64 priv;
 			ret = get_state_private(&root->fs_info->extent_ins,
 						bytenr, &priv);
@@ -686,6 +670,7 @@ static int __btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
 							(unsigned long)priv;
 			BUG_ON(extent_op->parent != orig_parent);
 			BUG_ON(extent_op->generation != orig_generation);
+
 			extent_op->parent = parent;
 			extent_op->generation = ref_generation;
 		} else {
@@ -703,10 +688,11 @@ static int __btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
 
 			set_extent_bits(&root->fs_info->extent_ins,
 					bytenr, bytenr + num_bytes - 1,
-					EXTENT_LOCKED, GFP_NOFS);
+					EXTENT_WRITEBACK, GFP_NOFS);
 			set_state_private(&root->fs_info->extent_ins,
 					  bytenr, (unsigned long)extent_op);
 		}
+		mutex_unlock(&root->fs_info->extent_ins_mutex);
 		return 0;
 	}
 
@@ -742,12 +728,10 @@ int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
 	if (ref_root == BTRFS_TREE_LOG_OBJECTID &&
 	    owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
 		return 0;
-	maybe_lock_mutex(root);
 	ret = __btrfs_update_extent_ref(trans, root, bytenr, orig_parent,
 					parent, ref_root, ref_root,
 					ref_generation, ref_generation,
 					owner_objectid);
-	maybe_unlock_mutex(root);
 	return ret;
 }
 
@@ -817,11 +801,9 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 	if (ref_root == BTRFS_TREE_LOG_OBJECTID &&
 	    owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
 		return 0;
-	maybe_lock_mutex(root);
 	ret = __btrfs_inc_extent_ref(trans, root, bytenr, 0, parent,
 				     0, ref_root, 0, ref_generation,
 				     owner_objectid);
-	maybe_unlock_mutex(root);
 	return ret;
 }
 
@@ -886,7 +868,6 @@ static int get_reference_status(struct btrfs_root *root, u64 bytenr,
 	key.type = BTRFS_EXTENT_ITEM_KEY;
 
 	path = btrfs_alloc_path();
-	mutex_lock(&root->fs_info->alloc_mutex);
 	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
 	if (ret < 0)
 		goto out;
@@ -953,7 +934,6 @@ static int get_reference_status(struct btrfs_root *root, u64 bytenr,
 	}
 	ret = 0;
 out:
-	mutex_unlock(&root->fs_info->alloc_mutex);
 	btrfs_free_path(path);
 	return ret;
 }
@@ -1179,13 +1159,11 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 
 			nr_file_extents++;
 
-			maybe_lock_mutex(root);
 			ret = process_func(trans, root, bytenr,
 					   orig_buf->start, buf->start,
 					   orig_root, ref_root,
 					   orig_generation, ref_generation,
 					   key.objectid);
-			maybe_unlock_mutex(root);
 
 			if (ret) {
 				faili = i;
@@ -1194,13 +1172,11 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 			}
 		} else {
 			bytenr = btrfs_node_blockptr(buf, i);
-			maybe_lock_mutex(root);
 			ret = process_func(trans, root, bytenr,
 					   orig_buf->start, buf->start,
 					   orig_root, ref_root,
 					   orig_generation, ref_generation,
 					   level - 1);
-			maybe_unlock_mutex(root);
 			if (ret) {
 				faili = i;
 				WARN_ON(1);
@@ -1270,24 +1246,20 @@ int btrfs_update_ref(struct btrfs_trans_handle *trans,
 			bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
 			if (bytenr == 0)
 				continue;
-			maybe_lock_mutex(root);
 			ret = __btrfs_update_extent_ref(trans, root, bytenr,
 					    orig_buf->start, buf->start,
 					    orig_root, ref_root,
 					    orig_generation, ref_generation,
 					    key.objectid);
-			maybe_unlock_mutex(root);
 			if (ret)
 				goto fail;
 		} else {
 			bytenr = btrfs_node_blockptr(buf, slot);
-			maybe_lock_mutex(root);
 			ret = __btrfs_update_extent_ref(trans, root, bytenr,
 					    orig_buf->start, buf->start,
 					    orig_root, ref_root,
 					    orig_generation, ref_generation,
 					    level - 1);
-			maybe_unlock_mutex(root);
 			if (ret)
 				goto fail;
 		}
@@ -1344,7 +1316,6 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 	if (!path)
 		return -ENOMEM;
 
-	mutex_lock(&root->fs_info->alloc_mutex);
 	while(1) {
 		cache = NULL;
 		spin_lock(&root->fs_info->block_group_cache_lock);
@@ -1378,7 +1349,6 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 		}
 	}
 	btrfs_free_path(path);
-	mutex_unlock(&root->fs_info->alloc_mutex);
 	return werr;
 }
 
@@ -1390,9 +1360,11 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
 
 	found = __find_space_info(info, flags);
 	if (found) {
+		spin_lock(&found->lock);
 		found->total_bytes += total_bytes;
 		found->bytes_used += bytes_used;
 		found->full = 0;
+		spin_unlock(&found->lock);
 		*space_info = found;
 		return 0;
 	}
@@ -1479,43 +1451,53 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
 	}
 	BUG_ON(!space_info);
 
+	spin_lock(&space_info->lock);
 	if (space_info->force_alloc) {
 		force = 1;
 		space_info->force_alloc = 0;
 	}
-	if (space_info->full)
+	if (space_info->full) {
+		spin_unlock(&space_info->lock);
 		goto out;
+	}
 
 	thresh = div_factor(space_info->total_bytes, 6);
 	if (!force &&
 	   (space_info->bytes_used + space_info->bytes_pinned +
-	    space_info->bytes_reserved + alloc_bytes) < thresh)
+	    space_info->bytes_reserved + alloc_bytes) < thresh) {
+		spin_unlock(&space_info->lock);
 		goto out;
+	}
 
-	while (!mutex_trylock(&extent_root->fs_info->chunk_mutex)) {
-		if (!force)
-			goto out;
-		mutex_unlock(&extent_root->fs_info->alloc_mutex);
-		cond_resched();
-		mutex_lock(&extent_root->fs_info->alloc_mutex);
+	spin_unlock(&space_info->lock);
+
+	ret = mutex_trylock(&extent_root->fs_info->chunk_mutex);
+	if (!ret && !force) {
+		goto out;
+	} else if (!ret) {
+		mutex_lock(&extent_root->fs_info->chunk_mutex);
 		waited = 1;
 	}
 
-	if (waited && space_info->full)
-		goto out_unlock;
+	if (waited) {
+		spin_lock(&space_info->lock);
+		if (space_info->full) {
+			spin_unlock(&space_info->lock);
+			goto out_unlock;
+		}
+		spin_unlock(&space_info->lock);
+	}
 
 	ret = btrfs_alloc_chunk(trans, extent_root, &start, &num_bytes, flags);
-	if (ret == -ENOSPC) {
+	if (ret) {
 printk("space info full %Lu\n", flags);
 		space_info->full = 1;
 		goto out_unlock;
 	}
-	BUG_ON(ret);
 
 	ret = btrfs_make_block_group(trans, extent_root, 0, flags,
 		     BTRFS_FIRST_CHUNK_TREE_OBJECTID, start, num_bytes);
 	BUG_ON(ret);
-
 out_unlock:
 	mutex_unlock(&extent_root->fs_info->chunk_mutex);
 out:
@@ -1533,7 +1515,6 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 	u64 old_val;
 	u64 byte_in_group;
 
-	WARN_ON(!mutex_is_locked(&root->fs_info->alloc_mutex));
 	while(total) {
 		cache = btrfs_lookup_block_group(info, bytenr);
 		if (!cache) {
@@ -1542,6 +1523,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 		byte_in_group = bytenr - cache->key.objectid;
 		WARN_ON(byte_in_group > cache->key.offset);
 
+		spin_lock(&cache->space_info->lock);
 		spin_lock(&cache->lock);
 		cache->dirty = 1;
 		old_val = btrfs_block_group_used(&cache->item);
@@ -1551,11 +1533,13 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 			cache->space_info->bytes_used += num_bytes;
 			btrfs_set_block_group_used(&cache->item, old_val);
 			spin_unlock(&cache->lock);
+			spin_unlock(&cache->space_info->lock);
 		} else {
 			old_val -= num_bytes;
 			cache->space_info->bytes_used -= num_bytes;
 			btrfs_set_block_group_used(&cache->item, old_val);
 			spin_unlock(&cache->lock);
+			spin_unlock(&cache->space_info->lock);
 			if (mark_free) {
 				int ret;
 				ret = btrfs_add_free_space(cache, bytenr,
@@ -1588,7 +1572,7 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
 	struct btrfs_block_group_cache *cache;
 	struct btrfs_fs_info *fs_info = root->fs_info;
 
-	WARN_ON(!mutex_is_locked(&root->fs_info->alloc_mutex));
+	WARN_ON(!mutex_is_locked(&root->fs_info->pinned_mutex));
 	if (pin) {
 		set_extent_dirty(&fs_info->pinned_extents,
 				bytenr, bytenr + num - 1, GFP_NOFS);
@@ -1602,16 +1586,20 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
 		len = min(num, cache->key.offset -
 			  (bytenr - cache->key.objectid));
 		if (pin) {
+			spin_lock(&cache->space_info->lock);
 			spin_lock(&cache->lock);
 			cache->pinned += len;
 			cache->space_info->bytes_pinned += len;
 			spin_unlock(&cache->lock);
+			spin_unlock(&cache->space_info->lock);
 			fs_info->total_pinned += len;
 		} else {
+			spin_lock(&cache->space_info->lock);
 			spin_lock(&cache->lock);
 			cache->pinned -= len;
 			cache->space_info->bytes_pinned -= len;
 			spin_unlock(&cache->lock);
+			spin_unlock(&cache->space_info->lock);
 			fs_info->total_pinned -= len;
 		}
 		bytenr += len;
@@ -1627,23 +1615,23 @@ static int update_reserved_extents(struct btrfs_root *root,
 	struct btrfs_block_group_cache *cache;
 	struct btrfs_fs_info *fs_info = root->fs_info;
 
-	WARN_ON(!mutex_is_locked(&root->fs_info->alloc_mutex));
 	while (num > 0) {
 		cache = btrfs_lookup_block_group(fs_info, bytenr);
 		BUG_ON(!cache);
 		len = min(num, cache->key.offset -
 			  (bytenr - cache->key.objectid));
+
+		spin_lock(&cache->space_info->lock);
+		spin_lock(&cache->lock);
 		if (reserve) {
-			spin_lock(&cache->lock);
 			cache->reserved += len;
 			cache->space_info->bytes_reserved += len;
-			spin_unlock(&cache->lock);
 		} else {
-			spin_lock(&cache->lock);
 			cache->reserved -= len;
 			cache->space_info->bytes_reserved -= len;
-			spin_unlock(&cache->lock);
 		}
+		spin_unlock(&cache->lock);
+		spin_unlock(&cache->space_info->lock);
 		bytenr += len;
 		num -= len;
 	}
@@ -1658,6 +1646,7 @@ int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy)
 	struct extent_io_tree *pinned_extents = &root->fs_info->pinned_extents;
 	int ret;
 
+	mutex_lock(&root->fs_info->pinned_mutex);
 	while(1) {
 		ret = find_first_extent_bit(pinned_extents, last,
 					    &start, &end, EXTENT_DIRTY);
@@ -1666,6 +1655,7 @@ int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy)
 		set_extent_dirty(copy, start, end, GFP_NOFS);
 		last = end + 1;
 	}
+	mutex_unlock(&root->fs_info->pinned_mutex);
 	return 0;
 }
 
@@ -1678,7 +1668,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 	int ret;
 	struct btrfs_block_group_cache *cache;
 
-	mutex_lock(&root->fs_info->alloc_mutex);
+	mutex_lock(&root->fs_info->pinned_mutex);
 	while(1) {
 		ret = find_first_extent_bit(unpin, 0, &start, &end,
 					    EXTENT_DIRTY);
@@ -1690,12 +1680,12 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 		if (cache->cached)
 			btrfs_add_free_space(cache, start, end - start + 1);
 		if (need_resched()) {
-			mutex_unlock(&root->fs_info->alloc_mutex);
+			mutex_unlock(&root->fs_info->pinned_mutex);
 			cond_resched();
-			mutex_lock(&root->fs_info->alloc_mutex);
+			mutex_lock(&root->fs_info->pinned_mutex);
 		}
 	}
-	mutex_unlock(&root->fs_info->alloc_mutex);
+	mutex_unlock(&root->fs_info->pinned_mutex);
 	return 0;
 }
 
@@ -1705,6 +1695,7 @@ static int finish_current_insert(struct btrfs_trans_handle *trans,
 	u64 start;
 	u64 end;
 	u64 priv;
+	u64 search = 0;
 	struct btrfs_fs_info *info = extent_root->fs_info;
 	struct btrfs_path *path;
 	struct btrfs_extent_ref *ref;
@@ -1714,20 +1705,37 @@ static int finish_current_insert(struct btrfs_trans_handle *trans,
 	int ret;
 	int err = 0;
 
-	WARN_ON(!mutex_is_locked(&extent_root->fs_info->alloc_mutex));
 	btrfs_set_stack_extent_refs(&extent_item, 1);
 	path = btrfs_alloc_path();
 
 	while(1) {
-		ret = find_first_extent_bit(&info->extent_ins, 0, &start,
-					    &end, EXTENT_LOCKED);
-		if (ret)
+		mutex_lock(&info->extent_ins_mutex);
+		ret = find_first_extent_bit(&info->extent_ins, search, &start,
+					    &end, EXTENT_WRITEBACK);
+		if (ret) {
+			mutex_unlock(&info->extent_ins_mutex);
+			if (search) {
+				search = 0;
+				continue;
+			}
 			break;
+		}
+
+		ret = try_lock_extent(&info->extent_ins, start, end, GFP_NOFS);
+		if (!ret) {
+			search = end+1;
+			mutex_unlock(&info->extent_ins_mutex);
+			cond_resched();
+			continue;
+		}
+		BUG_ON(ret < 0);
 
 		ret = get_state_private(&info->extent_ins, start, &priv);
 		BUG_ON(ret);
 		extent_op = (struct pending_extent_op *)(unsigned long)priv;
 
+		mutex_unlock(&info->extent_ins_mutex);
+
 		if (extent_op->type == PENDING_EXTENT_INSERT) {
 			key.objectid = start;
 			key.offset = end + 1 - start;
@@ -1736,8 +1744,10 @@ static int finish_current_insert(struct btrfs_trans_handle *trans,
 					&extent_item, sizeof(extent_item));
 			BUG_ON(err);
 
+			mutex_lock(&info->extent_ins_mutex);
 			clear_extent_bits(&info->extent_ins, start, end,
-					  EXTENT_LOCKED, GFP_NOFS);
+					  EXTENT_WRITEBACK, GFP_NOFS);
+			mutex_unlock(&info->extent_ins_mutex);
 
 			err = insert_extent_backref(trans, extent_root, path,
 						start, extent_op->parent,
@@ -1753,8 +1763,10 @@ static int finish_current_insert(struct btrfs_trans_handle *trans,
 						extent_op->level, 0);
 			BUG_ON(err);
 
+			mutex_lock(&info->extent_ins_mutex);
 			clear_extent_bits(&info->extent_ins, start, end,
-					  EXTENT_LOCKED, GFP_NOFS);
+					  EXTENT_WRITEBACK, GFP_NOFS);
+			mutex_unlock(&info->extent_ins_mutex);
 
 			key.objectid = start;
 			key.offset = extent_op->parent;
@@ -1772,12 +1784,10 @@ static int finish_current_insert(struct btrfs_trans_handle *trans,
 			BUG_ON(1);
 		}
 		kfree(extent_op);
+		unlock_extent(&info->extent_ins, start, end, GFP_NOFS);
+		search = 0;
 
-		if (need_resched()) {
-			mutex_unlock(&extent_root->fs_info->alloc_mutex);
-			cond_resched();
-			mutex_lock(&extent_root->fs_info->alloc_mutex);
-		}
+		cond_resched();
 	}
 	btrfs_free_path(path);
 	return 0;
@@ -1790,7 +1800,6 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans,
 	int err = 0;
 	struct extent_buffer *buf;
 
-	WARN_ON(!mutex_is_locked(&root->fs_info->alloc_mutex));
 	if (is_data)
 		goto pinit;
 
@@ -1847,7 +1856,6 @@ static int __free_extent(struct btrfs_trans_handle *trans,
 	struct btrfs_extent_item *ei;
 	u32 refs;
 
-	WARN_ON(!mutex_is_locked(&root->fs_info->alloc_mutex));
 	key.objectid = bytenr;
 	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
 	key.offset = num_bytes;
@@ -1935,8 +1943,10 @@ static int __free_extent(struct btrfs_trans_handle *trans,
 #endif
 
 		if (pin) {
+			mutex_lock(&root->fs_info->pinned_mutex);
 			ret = pin_down_bytes(trans, root, bytenr, num_bytes,
 				owner_objectid >= BTRFS_FIRST_FREE_OBJECTID);
+			mutex_unlock(&root->fs_info->pinned_mutex);
 			if (ret > 0)
 				mark_free = 1;
 			BUG_ON(ret < 0);
@@ -1956,6 +1966,7 @@ static int __free_extent(struct btrfs_trans_handle *trans,
 		ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
 				      num_to_del);
 		BUG_ON(ret);
+		btrfs_release_path(extent_root, path);
 		ret = update_block_group(trans, root, bytenr, num_bytes, 0,
 					 mark_free);
 		BUG_ON(ret);
@@ -1994,70 +2005,91 @@ static int del_pending_extents(struct btrfs_trans_handle *trans, struct
 {
 	int ret;
 	int err = 0;
-	int mark_free = 0;
 	u64 start;
 	u64 end;
 	u64 priv;
+	u64 search = 0;
 	struct extent_io_tree *pending_del;
 	struct extent_io_tree *extent_ins;
 	struct pending_extent_op *extent_op;
+	struct btrfs_fs_info *info = extent_root->fs_info;
 
-	WARN_ON(!mutex_is_locked(&extent_root->fs_info->alloc_mutex));
 	extent_ins = &extent_root->fs_info->extent_ins;
 	pending_del = &extent_root->fs_info->pending_del;
 
 	while(1) {
-		ret = find_first_extent_bit(pending_del, 0, &start, &end,
-					    EXTENT_LOCKED);
-		if (ret)
+		mutex_lock(&info->extent_ins_mutex);
+		ret = find_first_extent_bit(pending_del, search, &start, &end,
+					    EXTENT_WRITEBACK);
+		if (ret) {
+			mutex_unlock(&info->extent_ins_mutex);
+			if (search) {
+				search = 0;
+				continue;
+			}
 			break;
+		}
+
+		ret = try_lock_extent(extent_ins, start, end, GFP_NOFS);
+		if (!ret) {
+			search = end+1;
+			mutex_unlock(&info->extent_ins_mutex);
+			cond_resched();
+			continue;
+		}
+		BUG_ON(ret < 0);
 
 		ret = get_state_private(pending_del, start, &priv);
 		BUG_ON(ret);
 		extent_op = (struct pending_extent_op *)(unsigned long)priv;
 
-		clear_extent_bits(pending_del, start, end, EXTENT_LOCKED,
+		clear_extent_bits(pending_del, start, end, EXTENT_WRITEBACK,
 				  GFP_NOFS);
-
-		ret = pin_down_bytes(trans, extent_root, start,
-				     end + 1 - start, 0);
-		mark_free = ret > 0;
 		if (!test_range_bit(extent_ins, start, end,
-				    EXTENT_LOCKED, 0)) {
+				    EXTENT_WRITEBACK, 0)) {
+			mutex_unlock(&info->extent_ins_mutex);
 free_extent:
 			ret = __free_extent(trans, extent_root,
 					    start, end + 1 - start,
 					    extent_op->orig_parent,
 					    extent_root->root_key.objectid,
 					    extent_op->orig_generation,
-					    extent_op->level, 0, mark_free);
+					    extent_op->level, 1, 0);
 			kfree(extent_op);
 		} else {
 			kfree(extent_op);
-			ret = get_state_private(extent_ins, start, &priv);
+
+			ret = get_state_private(&info->extent_ins, start,
+						&priv);
 			BUG_ON(ret);
 			extent_op = (struct pending_extent_op *)
-							(unsigned long)priv;
+						(unsigned long)priv;
+
+			clear_extent_bits(&info->extent_ins, start, end,
+					  EXTENT_WRITEBACK, GFP_NOFS);
 
-			clear_extent_bits(extent_ins, start, end,
-					  EXTENT_LOCKED, GFP_NOFS);
+			mutex_unlock(&info->extent_ins_mutex);
 
 			if (extent_op->type == PENDING_BACKREF_UPDATE)
 				goto free_extent;
 
+			mutex_lock(&extent_root->fs_info->pinned_mutex);
+			ret = pin_down_bytes(trans, extent_root, start,
+					     end + 1 - start, 0);
+			mutex_unlock(&extent_root->fs_info->pinned_mutex);
+
 			ret = update_block_group(trans, extent_root, start,
-						end + 1 - start, 0, mark_free);
+						end + 1 - start, 0, ret > 0);
+
 			BUG_ON(ret);
 			kfree(extent_op);
 		}
 		if (ret)
 			err = ret;
+		unlock_extent(extent_ins, start, end, GFP_NOFS);
 
-		if (need_resched()) {
-			mutex_unlock(&extent_root->fs_info->alloc_mutex);
-			cond_resched();
-			mutex_lock(&extent_root->fs_info->alloc_mutex);
-		}
+		search = 0;
+		cond_resched();
 	}
 	return err;
 }
@@ -2091,11 +2123,13 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 		extent_op->orig_generation = ref_generation;
 		extent_op->level = (int)owner_objectid;
 
+		mutex_lock(&root->fs_info->extent_ins_mutex);
 		set_extent_bits(&root->fs_info->pending_del,
 				bytenr, bytenr + num_bytes - 1,
-				EXTENT_LOCKED, GFP_NOFS);
+				EXTENT_WRITEBACK, GFP_NOFS);
 		set_state_private(&root->fs_info->pending_del,
 				  bytenr, (unsigned long)extent_op);
+		mutex_unlock(&root->fs_info->extent_ins_mutex);
 		return 0;
 	}
 	/* if metadata always pin */
@@ -2134,11 +2168,9 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
 {
 	int ret;
 
-	maybe_lock_mutex(root);
 	ret = __btrfs_free_extent(trans, root, bytenr, num_bytes, parent,
 				  root_objectid, ref_generation,
 				  owner_objectid, pin);
-	maybe_unlock_mutex(root);
 	return ret;
 }
 
@@ -2214,12 +2246,16 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans,
 		 * group thats not of the proper type, while looping this
 		 * should never happen
 		 */
+		WARN_ON(!block_group);
+		mutex_lock(&block_group->alloc_mutex);
 		if (unlikely(!block_group_bits(block_group, data)))
 			goto new_group;
 
 		ret = cache_block_group(root, block_group);
-		if (ret)
+		if (ret) {
+			mutex_unlock(&block_group->alloc_mutex);
 			break;
+		}
 
 		if (block_group->ro)
 			goto new_group;
@@ -2250,8 +2286,10 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans,
 				 * then we just re-search this block group
 				 */
 				if (search_start >= start &&
-				    search_start < end)
+				    search_start < end) {
+					mutex_unlock(&block_group->alloc_mutex);
 					continue;
+				}
 
 				/* else we go to the next block group */
 				goto new_group;
@@ -2259,10 +2297,15 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans,
 
 			ins->objectid = search_start;
 			ins->offset = num_bytes;
+
+			btrfs_remove_free_space_lock(block_group, search_start,
+						     num_bytes);
 			/* we are all good, lets return */
+			mutex_unlock(&block_group->alloc_mutex);
 			break;
 		}
 new_group:
+		mutex_unlock(&block_group->alloc_mutex);
 		/*
 		 * Here's how this works.
 		 * loop == 0: we were searching a block group via a hint
@@ -2363,7 +2406,6 @@ static int __btrfs_reserve_extent(struct btrfs_trans_handle *trans,
 	u64 search_start = 0;
 	u64 alloc_profile;
 	struct btrfs_fs_info *info = root->fs_info;
-	struct btrfs_block_group_cache *cache;
 
 	if (data) {
 		alloc_profile = info->avail_data_alloc_bits &
@@ -2419,13 +2461,6 @@ again:
 		dump_space_info(sinfo, num_bytes);
 		BUG();
 	}
-	cache = btrfs_lookup_block_group(root->fs_info, ins->objectid);
-	if (!cache) {
-		printk(KERN_ERR "Unable to find block group for %Lu\n", ins->objectid);
-		return -ENOSPC;
-	}
-
-	ret = btrfs_remove_free_space(cache, ins->objectid, ins->offset);
 
 	return ret;
 }
@@ -2434,16 +2469,13 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
 {
 	struct btrfs_block_group_cache *cache;
 
-	maybe_lock_mutex(root);
 	cache = btrfs_lookup_block_group(root->fs_info, start);
 	if (!cache) {
 		printk(KERN_ERR "Unable to find block group for %Lu\n", start);
-		maybe_unlock_mutex(root);
 		return -ENOSPC;
 	}
 	btrfs_add_free_space(cache, start, len);
 	update_reserved_extents(root, start, len, 0);
-	maybe_unlock_mutex(root);
 	return 0;
 }
 
@@ -2455,12 +2487,10 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
 				  u64 data)
 {
 	int ret;
-	maybe_lock_mutex(root);
 	ret = __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size,
 				     empty_size, hint_byte, search_end, ins,
 				     data);
 	update_reserved_extents(root, ins->objectid, ins->offset, 1);
-	maybe_unlock_mutex(root);
 	return ret;
 }
 
@@ -2510,11 +2540,13 @@ static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
 		extent_op->orig_generation = 0;
 		extent_op->level = (int)owner;
 
+		mutex_lock(&root->fs_info->extent_ins_mutex);
 		set_extent_bits(&root->fs_info->extent_ins, ins->objectid,
 				ins->objectid + ins->offset - 1,
-				EXTENT_LOCKED, GFP_NOFS);
+				EXTENT_WRITEBACK, GFP_NOFS);
 		set_state_private(&root->fs_info->extent_ins,
 				  ins->objectid, (unsigned long)extent_op);
+		mutex_unlock(&root->fs_info->extent_ins_mutex);
 		goto update_block;
 	}
 
@@ -2578,11 +2610,9 @@ int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
 
 	if (root_objectid == BTRFS_TREE_LOG_OBJECTID)
 		return 0;
-	maybe_lock_mutex(root);
 	ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid,
 					    ref_generation, owner, ins);
 	update_reserved_extents(root, ins->objectid, ins->offset, 0);
-	maybe_unlock_mutex(root);
 	return ret;
 }
 
@@ -2599,15 +2629,16 @@ int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
 	int ret;
 	struct btrfs_block_group_cache *block_group;
 
-	maybe_lock_mutex(root);
 	block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
+	mutex_lock(&block_group->alloc_mutex);
 	cache_block_group(root, block_group);
 
-	ret = btrfs_remove_free_space(block_group, ins->objectid, ins->offset);
+	ret = btrfs_remove_free_space_lock(block_group, ins->objectid,
+					   ins->offset);
+	mutex_unlock(&block_group->alloc_mutex);
 	BUG_ON(ret);
 	ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid,
 					    ref_generation, owner, ins);
-	maybe_unlock_mutex(root);
 	return ret;
 }
 
@@ -2627,8 +2658,6 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 {
 	int ret;
 
-	maybe_lock_mutex(root);
-
 	ret = __btrfs_reserve_extent(trans, root, num_bytes,
 				     min_alloc_size, empty_size, hint_byte,
 				     search_end, ins, data);
@@ -2642,7 +2671,6 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 	} else {
 		update_reserved_extents(root, ins->objectid, ins->offset, 1);
 	}
-	maybe_unlock_mutex(root);
 	return ret;
 }
 
@@ -2734,12 +2762,10 @@ int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
 		if (disk_bytenr == 0)
 			continue;
 
-		mutex_lock(&root->fs_info->alloc_mutex);
 		ret = __btrfs_free_extent(trans, root, disk_bytenr,
 				btrfs_file_extent_disk_num_bytes(leaf, fi),
 				leaf->start, leaf_owner, leaf_generation,
 				key.objectid, 0);
-		mutex_unlock(&root->fs_info->alloc_mutex);
 		BUG_ON(ret);
 
 		atomic_inc(&root->fs_info->throttle_gen);
@@ -2758,12 +2784,10 @@ static int noinline cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
 	struct btrfs_extent_info *info = ref->extents;
 
 	for (i = 0; i < ref->nritems; i++) {
-		mutex_lock(&root->fs_info->alloc_mutex);
 		ret = __btrfs_free_extent(trans, root, info->bytenr,
 					  info->num_bytes, ref->bytenr,
 					  ref->owner, ref->generation,
 					  info->objectid, 0);
-		mutex_unlock(&root->fs_info->alloc_mutex);
 
 		atomic_inc(&root->fs_info->throttle_gen);
 		wake_up(&root->fs_info->transaction_throttle);
@@ -2875,13 +2899,11 @@ static int noinline walk_down_tree(struct btrfs_trans_handle *trans,
 			root_gen = btrfs_header_generation(parent);
 			path->slots[*level]++;
 
-			mutex_lock(&root->fs_info->alloc_mutex);
 			ret = __btrfs_free_extent(trans, root, bytenr,
 						blocksize, parent->start,
 						root_owner, root_gen,
 						*level - 1, 1);
 			BUG_ON(ret);
-			mutex_unlock(&root->fs_info->alloc_mutex);
 
 			atomic_inc(&root->fs_info->throttle_gen);
 			wake_up(&root->fs_info->transaction_throttle);
@@ -2957,11 +2979,9 @@ out:
 	root_owner = btrfs_header_owner(parent);
 	root_gen = btrfs_header_generation(parent);
 
-	mutex_lock(&root->fs_info->alloc_mutex);
 	ret = __btrfs_free_extent(trans, root, bytenr, blocksize,
 				  parent->start, root_owner, root_gen,
 				  *level, 1);
-	mutex_unlock(&root->fs_info->alloc_mutex);
 	free_extent_buffer(path->nodes[*level]);
 	path->nodes[*level] = NULL;
 	*level += 1;
@@ -3440,8 +3460,6 @@ static int noinline __next_ref_path(struct btrfs_trans_handle *trans,
 	if (!path)
 		return -ENOMEM;
 
-	mutex_lock(&extent_root->fs_info->alloc_mutex);
-
 	if (first_time) {
 		ref_path->lowest_level = -1;
 		ref_path->current_level = -1;
@@ -3498,9 +3516,7 @@ next:
 		level--;
 		btrfs_release_path(extent_root, path);
 		if (need_resched()) {
-			mutex_unlock(&extent_root->fs_info->alloc_mutex);
 			cond_resched();
-			mutex_lock(&extent_root->fs_info->alloc_mutex);
 		}
 	}
 	/* reached lowest level */
@@ -3613,15 +3629,12 @@ found:
 
 		btrfs_release_path(extent_root, path);
 		if (need_resched()) {
-			mutex_unlock(&extent_root->fs_info->alloc_mutex);
 			cond_resched();
-			mutex_lock(&extent_root->fs_info->alloc_mutex);
 		}
 	}
 	/* reached max tree level, but no tree root found. */
 	BUG();
 out:
-	mutex_unlock(&extent_root->fs_info->alloc_mutex);
 	btrfs_free_path(path);
 	return ret;
 }
@@ -4556,14 +4569,6 @@ static int noinline relocate_tree_block(struct btrfs_trans_handle *trans,
 					struct btrfs_ref_path *ref_path)
 {
 	int ret;
-	int needs_lock = 0;
-
-	if (root == root->fs_info->extent_root ||
-	    root == root->fs_info->chunk_root ||
-	    root == root->fs_info->dev_root) {
-		needs_lock = 1;
-		mutex_lock(&root->fs_info->alloc_mutex);
-	}
 
 	ret = relocate_one_path(trans, root, path, first_key,
 				ref_path, NULL, NULL);
@@ -4571,8 +4576,6 @@ static int noinline relocate_tree_block(struct btrfs_trans_handle *trans,
 
 	if (root == root->fs_info->extent_root)
 		btrfs_extent_post_op(trans, root);
-	if (needs_lock)
-		mutex_unlock(&root->fs_info->alloc_mutex);
 
 	return 0;
 }
@@ -4584,14 +4587,12 @@ static int noinline del_extent_zero(struct btrfs_trans_handle *trans,
 {
 	int ret;
 
-	mutex_lock(&extent_root->fs_info->alloc_mutex);
 	ret = btrfs_search_slot(trans, extent_root, extent_key, path, -1, 1);
 	if (ret)
 		goto out;
 	ret = btrfs_del_item(trans, extent_root, path);
 out:
 	btrfs_release_path(extent_root, path);
-	mutex_unlock(&extent_root->fs_info->alloc_mutex);
 	return ret;
 }
 
@@ -4627,7 +4628,6 @@ static int noinline relocate_one_extent(struct btrfs_root *extent_root,
 	struct btrfs_key first_key;
 	u64 prev_block = 0;
 
-	mutex_unlock(&extent_root->fs_info->alloc_mutex);
 
 	trans = btrfs_start_transaction(extent_root, 1);
 	BUG_ON(!trans);
@@ -4754,7 +4754,6 @@ out:
 	btrfs_end_transaction(trans, extent_root);
 	kfree(new_extents);
 	kfree(ref_path);
-	mutex_lock(&extent_root->fs_info->alloc_mutex);
 	return ret;
 }
 
@@ -4807,10 +4806,8 @@ int __alloc_chunk_for_shrink(struct btrfs_root *root,
 	spin_lock(&shrink_block_group->lock);
 	if (btrfs_block_group_used(&shrink_block_group->item) > 0) {
 		spin_unlock(&shrink_block_group->lock);
-		mutex_unlock(&root->fs_info->alloc_mutex);
 
 		trans = btrfs_start_transaction(root, 1);
-		mutex_lock(&root->fs_info->alloc_mutex);
 		spin_lock(&shrink_block_group->lock);
 
 		new_alloc_flags = update_block_group_flags(root,
@@ -4826,9 +4823,7 @@ int __alloc_chunk_for_shrink(struct btrfs_root *root,
 		do_chunk_alloc(trans, root->fs_info->extent_root,
 			       calc + 2 * 1024 * 1024, new_alloc_flags, force);
 
-		mutex_unlock(&root->fs_info->alloc_mutex);
 		btrfs_end_transaction(trans, root);
-		mutex_lock(&root->fs_info->alloc_mutex);
 	} else
 		spin_unlock(&shrink_block_group->lock);
 	return 0;
@@ -4952,14 +4947,10 @@ int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start)
 	reloc_inode = create_reloc_inode(info, block_group);
 	BUG_ON(IS_ERR(reloc_inode));
 
-	mutex_lock(&root->fs_info->alloc_mutex);
-
 	__alloc_chunk_for_shrink(root, block_group, 1);
 	block_group->ro = 1;
 	block_group->space_info->total_bytes -= block_group->key.offset;
 
-	mutex_unlock(&root->fs_info->alloc_mutex);
-
 	btrfs_start_delalloc_inodes(info->tree_root);
 	btrfs_wait_ordered_extents(info->tree_root, 0);
 again:
@@ -4978,8 +4969,6 @@ again:
 	btrfs_remove_leaf_refs(info->tree_root, (u64)-1, 1);
 	mutex_unlock(&root->fs_info->cleaner_mutex);
 
-	mutex_lock(&root->fs_info->alloc_mutex);
-
 	while(1) {
 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 		if (ret < 0)
@@ -5007,9 +4996,7 @@ next:
 
 		if (progress && need_resched()) {
 			btrfs_release_path(root, path);
-			mutex_unlock(&root->fs_info->alloc_mutex);
 			cond_resched();
-			mutex_lock(&root->fs_info->alloc_mutex);
 			progress = 0;
 			continue;
 		}
@@ -5036,7 +5023,6 @@ next:
 	}
 
 	btrfs_release_path(root, path);
-	mutex_unlock(&root->fs_info->alloc_mutex);
 
 	if (pass == 0) {
 		btrfs_wait_ordered_range(reloc_inode, 0, (u64)-1);
@@ -5058,8 +5044,6 @@ next:
 	trans = btrfs_start_transaction(info->tree_root, 1);
 	btrfs_commit_transaction(trans, info->tree_root);
 
-	mutex_lock(&root->fs_info->alloc_mutex);
-
 	spin_lock(&block_group->lock);
 	WARN_ON(block_group->pinned > 0);
 	WARN_ON(block_group->reserved > 0);
@@ -5067,7 +5051,6 @@ next:
 	spin_unlock(&block_group->lock);
 	ret = 0;
 out:
-	mutex_unlock(&root->fs_info->alloc_mutex);
 	btrfs_free_path(path);
 	return ret;
 }
@@ -5114,7 +5097,6 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
 	struct btrfs_block_group_cache *block_group;
 	struct rb_node *n;
 
-	mutex_lock(&info->alloc_mutex);
 	spin_lock(&info->block_group_cache_lock);
 	while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
 		block_group = rb_entry(n, struct btrfs_block_group_cache,
@@ -5132,7 +5114,6 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
 		kfree(block_group);
 	}
 	spin_unlock(&info->block_group_cache_lock);
-	mutex_unlock(&info->alloc_mutex);
 	return 0;
 }
 
@@ -5155,7 +5136,6 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 	if (!path)
 		return -ENOMEM;
 
-	mutex_lock(&root->fs_info->alloc_mutex);
 	while(1) {
 		ret = find_first_block_group(root, path, &key);
 		if (ret > 0) {
@@ -5174,6 +5154,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 		}
 
 		spin_lock_init(&cache->lock);
+		mutex_init(&cache->alloc_mutex);
 		INIT_LIST_HEAD(&cache->list);
 		read_extent_buffer(leaf, &cache->item,
 				   btrfs_item_ptr_offset(leaf, path->slots[0]),
@@ -5201,7 +5182,6 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 	ret = 0;
 error:
 	btrfs_free_path(path);
-	mutex_unlock(&root->fs_info->alloc_mutex);
 	return ret;
 }
 
@@ -5214,7 +5194,6 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 	struct btrfs_root *extent_root;
 	struct btrfs_block_group_cache *cache;
 
-	WARN_ON(!mutex_is_locked(&root->fs_info->alloc_mutex));
 	extent_root = root->fs_info->extent_root;
 
 	root->fs_info->last_trans_new_blockgroup = trans->transid;
@@ -5226,6 +5205,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 	cache->key.objectid = chunk_offset;
 	cache->key.offset = size;
 	spin_lock_init(&cache->lock);
+	mutex_init(&cache->alloc_mutex);
 	INIT_LIST_HEAD(&cache->list);
 	btrfs_set_key_type(&cache->key, BTRFS_BLOCK_GROUP_ITEM_KEY);
 
@@ -5264,7 +5244,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 	struct btrfs_key key;
 	int ret;
 
-	BUG_ON(!mutex_is_locked(&root->fs_info->alloc_mutex));
 	root = root->fs_info->extent_root;
 
 	block_group = btrfs_lookup_block_group(root->fs_info, group_start);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 314041fdfa43..7503bd46819b 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -938,6 +938,20 @@ int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
 }
 EXPORT_SYMBOL(lock_extent);
 
+int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
+		    gfp_t mask)
+{
+	int err;
+	u64 failed_start;
+
+	err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1,
+			     &failed_start, mask);
+	if (err == -EEXIST)
+		return 0;
+	return 1;
+}
+EXPORT_SYMBOL(try_lock_extent);
+
 int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end,
 		  gfp_t mask)
 {
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 86f859b87a6e..283110ec4ee0 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -128,6 +128,8 @@ int try_release_extent_state(struct extent_map_tree *map,
 			     gfp_t mask);
 int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
 int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
+int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
+		    gfp_t mask);
 int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
 			  get_extent_t *get_extent);
 int __init extent_io_init(void);
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 96241f01fa0a..f4926c0f3c8c 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -184,8 +184,8 @@ static int link_free_space(struct btrfs_block_group_cache *block_group,
 	return ret;
 }
 
-int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
-			 u64 offset, u64 bytes)
+static int __btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
+				  u64 offset, u64 bytes)
 {
 	struct btrfs_free_space *right_info;
 	struct btrfs_free_space *left_info;
@@ -202,8 +202,6 @@ int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
 	 * are adding, if there is remove that struct and add a new one to
 	 * cover the entire range
 	 */
-	spin_lock(&block_group->lock);
-
 	right_info = tree_search_offset(&block_group->free_space_offset,
 					offset+bytes, 0, 1);
 	left_info = tree_search_offset(&block_group->free_space_offset,
@@ -261,7 +259,6 @@ int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
 	if (ret)
 		kfree(info);
 out:
-	spin_unlock(&block_group->lock);
 	if (ret) {
 		printk(KERN_ERR "btrfs: unable to add free space :%d\n", ret);
 		if (ret == -EEXIST)
@@ -274,13 +271,13 @@ out:
 	return ret;
 }
 
-int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
-			    u64 offset, u64 bytes)
+static int
+__btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
+			  u64 offset, u64 bytes)
 {
 	struct btrfs_free_space *info;
 	int ret = 0;
 
-	spin_lock(&block_group->lock);
 	info = tree_search_offset(&block_group->free_space_offset, offset, 0,
 				  1);
 
@@ -334,17 +331,63 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
 		/* step two, insert a new info struct to cover anything
 		 * before the hole
 		 */
-		spin_unlock(&block_group->lock);
-		ret = btrfs_add_free_space(block_group, old_start,
-					   offset - old_start);
+		ret = __btrfs_add_free_space(block_group, old_start,
+					     offset - old_start);
 		BUG_ON(ret);
-		goto out_nolock;
 	} else {
 		WARN_ON(1);
 	}
 out:
-	spin_unlock(&block_group->lock);
-out_nolock:
+	return ret;
+}
+
+int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
+			 u64 offset, u64 bytes)
+{
+	int ret;
+	struct btrfs_free_space *sp;
+
+	mutex_lock(&block_group->alloc_mutex);
+	ret = __btrfs_add_free_space(block_group, offset, bytes);
+	sp = tree_search_offset(&block_group->free_space_offset, offset, 0, 1);
+	BUG_ON(!sp);
+	mutex_unlock(&block_group->alloc_mutex);
+
+	return ret;
+}
+
+int btrfs_add_free_space_lock(struct btrfs_block_group_cache *block_group,
+			      u64 offset, u64 bytes)
+{
+	int ret;
+	struct btrfs_free_space *sp;
+
+	ret = __btrfs_add_free_space(block_group, offset, bytes);
+	sp = tree_search_offset(&block_group->free_space_offset, offset, 0, 1);
+	BUG_ON(!sp);
+
+	return ret;
+}
+
+int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
+			    u64 offset, u64 bytes)
+{
+	int ret = 0;
+
+	mutex_lock(&block_group->alloc_mutex);
+	ret = __btrfs_remove_free_space(block_group, offset, bytes);
+	mutex_unlock(&block_group->alloc_mutex);
+
+	return ret;
+}
+
+int btrfs_remove_free_space_lock(struct btrfs_block_group_cache *block_group,
+				 u64 offset, u64 bytes)
+{
+	int ret;
+
+	ret = __btrfs_remove_free_space(block_group, offset, bytes);
+
 	return ret;
 }
 
@@ -386,18 +429,18 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
 	struct btrfs_free_space *info;
 	struct rb_node *node;
 
-	spin_lock(&block_group->lock);
+	mutex_lock(&block_group->alloc_mutex);
 	while ((node = rb_last(&block_group->free_space_bytes)) != NULL) {
 		info = rb_entry(node, struct btrfs_free_space, bytes_index);
 		unlink_free_space(block_group, info);
 		kfree(info);
 		if (need_resched()) {
-			spin_unlock(&block_group->lock);
+			mutex_unlock(&block_group->alloc_mutex);
 			cond_resched();
-			spin_lock(&block_group->lock);
+			mutex_lock(&block_group->alloc_mutex);
 		}
 	}
-	spin_unlock(&block_group->lock);
+	mutex_unlock(&block_group->alloc_mutex);
 }
 
 struct btrfs_free_space *btrfs_find_free_space_offset(struct
@@ -407,10 +450,10 @@ struct btrfs_free_space *btrfs_find_free_space_offset(struct
 {
 	struct btrfs_free_space *ret;
 
-	spin_lock(&block_group->lock);
+	mutex_lock(&block_group->alloc_mutex);
 	ret = tree_search_offset(&block_group->free_space_offset, offset,
 				 bytes, 0);
-	spin_unlock(&block_group->lock);
+	mutex_unlock(&block_group->alloc_mutex);
 
 	return ret;
 }
@@ -422,10 +465,10 @@ struct btrfs_free_space *btrfs_find_free_space_bytes(struct
 {
 	struct btrfs_free_space *ret;
 
-	spin_lock(&block_group->lock);
+	mutex_lock(&block_group->alloc_mutex);
 
 	ret = tree_search_bytes(&block_group->free_space_bytes, offset, bytes);
-	spin_unlock(&block_group->lock);
+	mutex_unlock(&block_group->alloc_mutex);
 
 	return ret;
 }
@@ -434,16 +477,13 @@ struct btrfs_free_space *btrfs_find_free_space(struct btrfs_block_group_cache
 					       *block_group, u64 offset,
 					       u64 bytes)
 {
-	struct btrfs_free_space *ret;
+	struct btrfs_free_space *ret = NULL;
 
-	spin_lock(&block_group->lock);
 	ret = tree_search_offset(&block_group->free_space_offset, offset,
 				 bytes, 0);
 	if (!ret)
 		ret = tree_search_bytes(&block_group->free_space_bytes,
 					offset, bytes);
 
-	spin_unlock(&block_group->lock);
-
 	return ret;
 }
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 1df67129cc3d..48b455fdaac5 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -670,7 +670,6 @@ static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
 		atomic_dec(&root->fs_info->throttles);
 		wake_up(&root->fs_info->transaction_throttle);
 
-		mutex_lock(&root->fs_info->alloc_mutex);
 		num_bytes -= btrfs_root_used(&dirty->root->root_item);
 		bytes_used = btrfs_root_used(&root->root_item);
 		if (num_bytes) {
@@ -678,7 +677,6 @@ static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
 			btrfs_set_root_used(&root->root_item,
 					    bytes_used - num_bytes);
 		}
-		mutex_unlock(&root->fs_info->alloc_mutex);
 
 		ret = btrfs_del_root(trans, tree_root, &dirty->root->root_key);
 		if (ret) {
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index 6f57d0889b1e..a6a3956cedfb 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -125,9 +125,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 	if (is_extent)
 		btrfs_extent_post_op(trans, root);
 out:
-	if (is_extent)
-		mutex_unlock(&root->fs_info->alloc_mutex);
-
 	if (path)
 		btrfs_free_path(path);
 	if (ret == -EAGAIN) {
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index e6d579053a47..835daed5561f 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -271,10 +271,10 @@ static int process_one_buffer(struct btrfs_root *log,
 			      struct walk_control *wc, u64 gen)
 {
 	if (wc->pin) {
-		mutex_lock(&log->fs_info->alloc_mutex);
+		mutex_lock(&log->fs_info->pinned_mutex);
 		btrfs_update_pinned_extents(log->fs_info->extent_root,
 					    eb->start, eb->len, 1);
-		mutex_unlock(&log->fs_info->alloc_mutex);
+		mutex_unlock(&log->fs_info->pinned_mutex);
 	}
 
 	if (btrfs_buffer_uptodate(eb, gen)) {
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 7db4cfd03a98..cbb9bb31431d 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -58,14 +58,12 @@ void btrfs_unlock_volumes(void)
 
 static void lock_chunks(struct btrfs_root *root)
 {
-	mutex_lock(&root->fs_info->alloc_mutex);
 	mutex_lock(&root->fs_info->chunk_mutex);
 }
 
 static void unlock_chunks(struct btrfs_root *root)
 {
 	mutex_unlock(&root->fs_info->chunk_mutex);
-	mutex_unlock(&root->fs_info->alloc_mutex);
 }
 
 int btrfs_cleanup_fs_uuids(void)
-- 
cgit v1.2.3


From 5f2cc086ccab27ac5252b3883ac004347860b4c7 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 7 Nov 2008 18:22:45 -0500
Subject: Btrfs: Avoid unplug storms during commit

While doing a commit, btrfs makes sure all the metadata blocks
were properly written to disk, calling wait_on_page_writeback for
each page.  This writeback happens after allowing another transaction
to start, so it competes for the disk with other processes in the FS.

If the page writeback bit is still set, each wait_on_page_writeback might
trigger an unplug, even though the page might be waiting for checksumming
to finish or might be waiting for the async work queue to submit the
bio.

This trades wait_on_page_writeback for waiting on the extent writeback
bits.  It won't trigger any unplugs and substantially improves performance
in a number of workloads.

This also changes the async bio submission to avoid requeueing if there
is only one device.  The requeue just wastes CPU time because there are
no other devices to service.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/transaction.c | 30 +++++++++++++++++++++++++++++-
 fs/btrfs/volumes.c     |  3 ++-
 2 files changed, 31 insertions(+), 2 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index e72a013d24bf..202c1b6df4a4 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -20,6 +20,7 @@
 #include <linux/sched.h>
 #include <linux/writeback.h>
 #include <linux/pagemap.h>
+#include <linux/blkdev.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
@@ -331,6 +332,7 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
 	int werr = 0;
 	struct page *page;
 	struct inode *btree_inode = root->fs_info->btree_inode;
+	struct extent_io_tree *io_tree = &BTRFS_I(btree_inode)->io_tree;
 	u64 start = 0;
 	u64 end;
 	unsigned long index;
@@ -371,6 +373,11 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
 			page_cache_release(page);
 		}
 	}
+	/*
+	 * we unplug once and then use the wait_on_extent_bit for
+	 * everything else
+	 */
+	blk_run_address_space(btree_inode->i_mapping);
 	while(1) {
 		ret = find_first_extent_bit(dirty_pages, 0, &start, &end,
 					    EXTENT_DIRTY);
@@ -391,7 +398,28 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
 				if (err)
 					werr = err;
 			}
-			wait_on_page_writeback(page);
+			if (PageWriteback(page)) {
+				/*
+				 * we don't wait on the page writeback bit
+				 * because that triggers a lot of unplugs.
+				 * The extent bits are much nicer to
+				 * the disks, but come with a slightly
+				 * higher latency because we aren't forcing
+				 * unplugs.
+				 */
+				wait_on_extent_writeback(io_tree,
+					 page_offset(page),
+					 page_offset(page) +
+					 PAGE_CACHE_SIZE - 1);
+			}
+			if (PageWriteback(page)) {
+				/*
+				 * the state bits get cleared before the
+				 * page bits, lets add some extra
+				 * paranoia here
+				 */
+				wait_on_page_writeback(page);
+			}
 			page_cache_release(page);
 			cond_resched();
 		}
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index cbb9bb31431d..80a27284dbf1 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -200,7 +200,8 @@ loop:
 		 * is now congested.  Back off and let other work structs
 		 * run instead
 		 */
-		if (pending && bdi_write_congested(bdi)) {
+		if (pending && bdi_write_congested(bdi) &&
+		    fs_info->fs_devices->open_devices > 1) {
 			struct bio *old_head;
 
 			spin_lock(&device->io_lock);
-- 
cgit v1.2.3


From 2b82032c34ec40515d3c45c36cd1961f37977de8 Mon Sep 17 00:00:00 2001
From: Yan Zheng <zheng.yan@oracle.com>
Date: Mon, 17 Nov 2008 21:11:30 -0500
Subject: Btrfs: Seed device support

A seed device is a special btrfs with the SEEDING super flag
set and can only be mounted in read-only mode. Seed
devices allow people to create a new btrfs on top of them.

The new FS contains the same contents as the seed device,
but it can be mounted in read-write mode.

This patch does the following:

1) split code in btrfs_alloc_chunk into two parts. The first part makes
the newly allocated chunk usable, but does not do any operation that modifies
the chunk tree. The second part does the chunk tree modifications. This
division is for the bootstrap step of adding storage to the seed device.

2) Update device management code to handle seed device.
The basic idea is: For an FS grown from seed devices, its
seed devices are put into a list. Seed devices are
opened on demand at mounting time. If any seed device is
missing or has been changed, btrfs kernel module will
refuse to mount the FS.

3) make btrfs_find_block_group not return NULL when all
block groups are read-only.

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
---
 fs/btrfs/ctree.c       |    8 +
 fs/btrfs/ctree.h       |   18 +-
 fs/btrfs/disk-io.c     |   56 ++-
 fs/btrfs/extent-tree.c |   31 +-
 fs/btrfs/ioctl.c       |    2 +-
 fs/btrfs/super.c       |    9 +
 fs/btrfs/volumes.c     | 1131 ++++++++++++++++++++++++++++++++++++------------
 fs/btrfs/volumes.h     |   20 +-
 8 files changed, 946 insertions(+), 329 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 8bb452456d90..dd1c03aea2df 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -185,6 +185,10 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
 	btrfs_set_header_owner(cow, new_root_objectid);
 	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN);
 
+	write_extent_buffer(cow, root->fs_info->fsid,
+			    (unsigned long)btrfs_header_fsid(cow),
+			    BTRFS_FSID_SIZE);
+
 	WARN_ON(btrfs_header_generation(buf) > trans->transid);
 	ret = btrfs_inc_ref(trans, new_root, buf, cow, NULL);
 	kfree(new_root);
@@ -274,6 +278,10 @@ int noinline __btrfs_cow_block(struct btrfs_trans_handle *trans,
 	btrfs_set_header_owner(cow, root->root_key.objectid);
 	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN);
 
+	write_extent_buffer(cow, root->fs_info->fsid,
+			    (unsigned long)btrfs_header_fsid(cow),
+			    BTRFS_FSID_SIZE);
+
 	WARN_ON(btrfs_header_generation(buf) > trans->transid);
 	if (btrfs_header_generation(buf) != trans->transid) {
 		u32 nr_extents;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index c4c6c127323b..5ff74282a620 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -177,6 +177,9 @@ struct btrfs_dev_item {
 	/* type and info about this device */
 	__le64 type;
 
+	/* expected generation for this device */
+	__le64 generation;
+
 	/* grouping information for allocation decisions */
 	__le32 dev_group;
 
@@ -188,6 +191,9 @@ struct btrfs_dev_item {
 
 	/* btrfs generated uuid for this device */
 	u8 uuid[BTRFS_UUID_SIZE];
+
+	/* uuid of FS who owns this device */
+	u8 fsid[BTRFS_UUID_SIZE];
 } __attribute__ ((__packed__));
 
 struct btrfs_stripe {
@@ -263,6 +269,7 @@ struct btrfs_header {
 					sizeof(struct btrfs_item) - \
 					sizeof(struct btrfs_file_extent_item))
 
+#define BTRFS_SUPER_FLAG_SEEDING (1ULL << 32)
 
 /*
  * this is a very generous portion of the super block, giving us
@@ -278,7 +285,7 @@ struct btrfs_header {
 struct btrfs_super_block {
 	u8 csum[BTRFS_CSUM_SIZE];
 	/* the first 4 fields must match struct btrfs_header */
-	u8 fsid[16];    /* FS specific uuid */
+	u8 fsid[BTRFS_FSID_SIZE];    /* FS specific uuid */
 	__le64 bytenr; /* this block number */
 	__le64 flags;
 
@@ -941,6 +948,7 @@ BTRFS_SETGET_FUNCS(device_id, struct btrfs_dev_item, devid, 64);
 BTRFS_SETGET_FUNCS(device_group, struct btrfs_dev_item, dev_group, 32);
 BTRFS_SETGET_FUNCS(device_seek_speed, struct btrfs_dev_item, seek_speed, 8);
 BTRFS_SETGET_FUNCS(device_bandwidth, struct btrfs_dev_item, bandwidth, 8);
+BTRFS_SETGET_FUNCS(device_generation, struct btrfs_dev_item, generation, 64);
 
 BTRFS_SETGET_STACK_FUNCS(stack_device_type, struct btrfs_dev_item, type, 64);
 BTRFS_SETGET_STACK_FUNCS(stack_device_total_bytes, struct btrfs_dev_item,
@@ -960,12 +968,19 @@ BTRFS_SETGET_STACK_FUNCS(stack_device_seek_speed, struct btrfs_dev_item,
 			 seek_speed, 8);
 BTRFS_SETGET_STACK_FUNCS(stack_device_bandwidth, struct btrfs_dev_item,
 			 bandwidth, 8);
+BTRFS_SETGET_STACK_FUNCS(stack_device_generation, struct btrfs_dev_item,
+			 generation, 64);
 
 static inline char *btrfs_device_uuid(struct btrfs_dev_item *d)
 {
 	return (char *)d + offsetof(struct btrfs_dev_item, uuid);
 }
 
+static inline char *btrfs_device_fsid(struct btrfs_dev_item *d)
+{
+	return (char *)d + offsetof(struct btrfs_dev_item, fsid);
+}
+
 BTRFS_SETGET_FUNCS(chunk_length, struct btrfs_chunk, length, 64);
 BTRFS_SETGET_FUNCS(chunk_owner, struct btrfs_chunk, owner, 64);
 BTRFS_SETGET_FUNCS(chunk_stripe_len, struct btrfs_chunk, stripe_len, 64);
@@ -1661,6 +1676,7 @@ int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans,
 			       struct extent_buffer *buf, u64 orig_start);
 int btrfs_add_dead_reloc_root(struct btrfs_root *root);
 int btrfs_cleanup_reloc_trees(struct btrfs_root *root);
+u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
 /* ctree.c */
 int btrfs_previous_item(struct btrfs_root *root,
 			struct btrfs_path *path, u64 min_objectid,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index c599f0ee997a..82833e5d84b6 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -345,6 +345,25 @@ out:
 	return 0;
 }
 
+static int check_tree_block_fsid(struct btrfs_root *root,
+				 struct extent_buffer *eb)
+{
+	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
+	u8 fsid[BTRFS_UUID_SIZE];
+	int ret = 1;
+
+	read_extent_buffer(eb, fsid, (unsigned long)btrfs_header_fsid(eb),
+			   BTRFS_FSID_SIZE);
+	while (fs_devices) {
+		if (!memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE)) {
+			ret = 0;
+			break;
+		}
+		fs_devices = fs_devices->seed;
+	}
+	return ret;
+}
+
 int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 			       struct extent_state *state)
 {
@@ -382,9 +401,7 @@ int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 		ret = -EIO;
 		goto err;
 	}
-	if (memcmp_extent_buffer(eb, root->fs_info->fsid,
-				 (unsigned long)btrfs_header_fsid(eb),
-				 BTRFS_FSID_SIZE)) {
+	if (check_tree_block_fsid(root, eb)) {
 		printk("bad fsid on block %Lu\n", eb->start);
 		ret = -EIO;
 		goto err;
@@ -1558,9 +1575,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	if (!btrfs_super_root(disk_super))
 		goto fail_sb_buffer;
 
-	err = btrfs_parse_options(tree_root, options);
-	if (err)
+	ret = btrfs_parse_options(tree_root, options);
+	if (ret) {
+		err = ret;
 		goto fail_sb_buffer;
+	}
 
 	/*
 	 * we need to start all the end_io workers up front because the
@@ -1610,18 +1629,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	btrfs_start_workers(&fs_info->endio_write_workers,
 			    fs_info->thread_pool_size);
 
-	err = -EINVAL;
-	if (btrfs_super_num_devices(disk_super) > fs_devices->open_devices) {
-		printk("Btrfs: wanted %llu devices, but found %llu\n",
-		       (unsigned long long)btrfs_super_num_devices(disk_super),
-		       (unsigned long long)fs_devices->open_devices);
-		if (btrfs_test_opt(tree_root, DEGRADED))
-			printk("continuing in degraded mode\n");
-		else {
-			goto fail_sb_buffer;
-		}
-	}
-
 	fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
 	fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
 				    4 * 1024 * 1024 / PAGE_CACHE_SIZE);
@@ -1672,7 +1679,10 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	mutex_lock(&fs_info->chunk_mutex);
 	ret = btrfs_read_chunk_tree(chunk_root);
 	mutex_unlock(&fs_info->chunk_mutex);
-	BUG_ON(ret);
+	if (ret) {
+		printk("btrfs: failed to read chunk tree on %s\n", sb->s_id);
+		goto fail_chunk_root;
+	}
 
 	btrfs_close_extra_devices(fs_devices);
 
@@ -1684,7 +1694,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 					  btrfs_super_root(disk_super),
 					  blocksize, generation);
 	if (!tree_root->node)
-		goto fail_sb_buffer;
+		goto fail_chunk_root;
 
 
 	ret = find_and_setup_root(tree_root, fs_info,
@@ -1753,6 +1763,8 @@ fail_extent_root:
 	free_extent_buffer(extent_root->node);
 fail_tree_root:
 	free_extent_buffer(tree_root->node);
+fail_chunk_root:
+	free_extent_buffer(chunk_root->node);
 fail_sys_array:
 fail_sb_buffer:
 	btrfs_stop_workers(&fs_info->fixup_workers);
@@ -1823,9 +1835,10 @@ int write_all_supers(struct btrfs_root *root)
 			total_errors++;
 			continue;
 		}
-		if (!dev->in_fs_metadata)
+		if (!dev->in_fs_metadata || !dev->writeable)
 			continue;
 
+		btrfs_set_stack_device_generation(dev_item, 0);
 		btrfs_set_stack_device_type(dev_item, dev->type);
 		btrfs_set_stack_device_id(dev_item, dev->devid);
 		btrfs_set_stack_device_total_bytes(dev_item, dev->total_bytes);
@@ -1834,6 +1847,7 @@ int write_all_supers(struct btrfs_root *root)
 		btrfs_set_stack_device_io_width(dev_item, dev->io_width);
 		btrfs_set_stack_device_sector_size(dev_item, dev->sector_size);
 		memcpy(dev_item->uuid, dev->uuid, BTRFS_UUID_SIZE);
+		memcpy(dev_item->fsid, dev->fs_devices->fsid, BTRFS_UUID_SIZE);
 		flags = btrfs_super_flags(sb);
 		btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN);
 
@@ -1881,7 +1895,7 @@ int write_all_supers(struct btrfs_root *root)
 		dev = list_entry(cur, struct btrfs_device, dev_list);
 		if (!dev->bdev)
 			continue;
-		if (!dev->in_fs_metadata)
+		if (!dev->in_fs_metadata || !dev->writeable)
 			continue;
 
 		BUG_ON(!dev->pending_io);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index af2de30dbeac..197422c1dc4b 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -355,7 +355,7 @@ __btrfs_find_block_group(struct btrfs_root *root,
 	if (search_start) {
 		struct btrfs_block_group_cache *shint;
 		shint = btrfs_lookup_first_block_group(info, search_start);
-		if (shint && block_group_bits(shint, data) && !shint->ro) {
+		if (shint && block_group_bits(shint, data)) {
 			spin_lock(&shint->lock);
 			used = btrfs_block_group_used(&shint->item);
 			if (used + shint->pinned + shint->reserved <
@@ -366,7 +366,7 @@ __btrfs_find_block_group(struct btrfs_root *root,
 			spin_unlock(&shint->lock);
 		}
 	}
-	if (hint && !hint->ro && block_group_bits(hint, data)) {
+	if (hint && block_group_bits(hint, data)) {
 		spin_lock(&hint->lock);
 		used = btrfs_block_group_used(&hint->item);
 		if (used + hint->pinned + hint->reserved <
@@ -392,7 +392,7 @@ again:
 		last = cache->key.objectid + cache->key.offset;
 		used = btrfs_block_group_used(&cache->item);
 
-		if (!cache->ro && block_group_bits(cache, data)) {
+		if (block_group_bits(cache, data)) {
 			free_check = div_factor(cache->key.offset, factor);
 			if (used + cache->pinned + cache->reserved <
 			    free_check) {
@@ -1843,9 +1843,9 @@ static void set_block_group_readonly(struct btrfs_block_group_cache *cache)
 	spin_unlock(&cache->space_info->lock);
 }
 
-static u64 reduce_alloc_profile(struct btrfs_root *root, u64 flags)
+u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
 {
-	u64 num_devices = root->fs_info->fs_devices->num_devices;
+	u64 num_devices = root->fs_info->fs_devices->rw_devices;
 
 	if (num_devices == 1)
 		flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
@@ -1877,13 +1877,11 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
 {
 	struct btrfs_space_info *space_info;
 	u64 thresh;
-	u64 start;
-	u64 num_bytes;
 	int ret = 0;
 
 	mutex_lock(&extent_root->fs_info->chunk_mutex);
 
-	flags = reduce_alloc_profile(extent_root, flags);
+	flags = btrfs_reduce_alloc_profile(extent_root, flags);
 
 	space_info = __find_space_info(extent_root->fs_info, flags);
 	if (!space_info) {
@@ -1913,16 +1911,11 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
 	}
 	spin_unlock(&space_info->lock);
 
-	ret = btrfs_alloc_chunk(trans, extent_root, &start, &num_bytes, flags);
+	ret = btrfs_alloc_chunk(trans, extent_root, flags);
 	if (ret) {
 printk("space info full %Lu\n", flags);
 		space_info->full = 1;
-		goto out;
 	}
-
-	ret = btrfs_make_block_group(trans, extent_root, 0, flags,
-		     BTRFS_FIRST_CHUNK_TREE_OBJECTID, start, num_bytes);
-	BUG_ON(ret);
 out:
 	mutex_unlock(&extent_root->fs_info->chunk_mutex);
 	return ret;
@@ -3040,7 +3033,7 @@ static int __btrfs_reserve_extent(struct btrfs_trans_handle *trans,
 		data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
 	}
 again:
-	data = reduce_alloc_profile(root, data);
+	data = btrfs_reduce_alloc_profile(root, data);
 	/*
 	 * the only place that sets empty_size is btrfs_realloc_node, which
 	 * is not called recursively on allocations
@@ -5136,7 +5129,8 @@ static int noinline relocate_one_path(struct btrfs_trans_handle *trans,
 			else
 				btrfs_node_key_to_cpu(eb, &keys[level], 0);
 		}
-		if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
+		if (nodes[0] &&
+		    ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
 			eb = path->nodes[0];
 			ret = replace_extents_in_leaf(trans, reloc_root, eb,
 						      group, reloc_inode);
@@ -5377,7 +5371,7 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
 	u64 stripped = BTRFS_BLOCK_GROUP_RAID0 |
 		BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
 
-	num_devices = root->fs_info->fs_devices->num_devices;
+	num_devices = root->fs_info->fs_devices->rw_devices;
 	if (num_devices == 1) {
 		stripped |= BTRFS_BLOCK_GROUP_DUP;
 		stripped = flags & ~stripped;
@@ -5801,6 +5795,8 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 		BUG_ON(ret);
 
 		set_avail_alloc_bits(root->fs_info, cache->flags);
+		if (btrfs_chunk_readonly(root, cache->key.objectid))
+			set_block_group_readonly(cache);
 	}
 	ret = 0;
 error:
@@ -5889,6 +5885,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 	block_group->space_info->total_bytes -= block_group->key.offset;
 	block_group->space_info->bytes_readonly -= block_group->key.offset;
 	spin_unlock(&block_group->space_info->lock);
+	block_group->space_info->full = 0;
 
 	/*
 	memset(shrink_block_group, 0, sizeof(*shrink_block_group));
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 52863cebd594..f43df72b0e17 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -405,7 +405,7 @@ static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg)
 		devid = simple_strtoull(devstr, &end, 10);
 		printk(KERN_INFO "resizing devid %llu\n", devid);
 	}
-	device = btrfs_find_device(root, devid, NULL);
+	device = btrfs_find_device(root, devid, NULL, NULL);
 	if (!device) {
 		printk(KERN_INFO "resizer unable to find device %llu\n", devid);
 		ret = -EINVAL;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 04a3bf816509..92393cc60d08 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -431,6 +431,11 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
 	if (error)
 		goto error_free_subvol_name;
 
+	if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) {
+		error = -EACCES;
+		goto error_close_devices;
+	}
+
 	bdev = fs_devices->latest_bdev;
 	s = sget(fs_type, btrfs_test_super, set_anon_super, fs_devices);
 	if (IS_ERR(s))
@@ -444,6 +449,7 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
 			goto error_close_devices;
 		}
 
+		btrfs_close_devices(fs_devices);
 	} else {
 		char b[BDEVNAME_SIZE];
 
@@ -512,6 +518,9 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
 		ret =  btrfs_commit_super(root);
 		WARN_ON(ret);
 	} else {
+		if (root->fs_info->fs_devices->rw_devices == 0)
+			return -EACCES;
+
 		if (btrfs_super_log_root(&root->fs_info->super_copy) != 0)
 			return -EINVAL;
 
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 80a27284dbf1..d6f1996de629 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -40,6 +40,12 @@ struct map_lookup {
 	struct btrfs_bio_stripe stripes[];
 };
 
+static int init_first_rw_device(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root,
+				struct btrfs_device *device);
+static int btrfs_relocate_sys_chunks(struct btrfs_root *root);
+
+
 #define map_lookup_size(n) (sizeof(struct map_lookup) + \
 			    (sizeof(struct btrfs_bio_stripe) * (n)))
 
@@ -69,25 +75,31 @@ static void unlock_chunks(struct btrfs_root *root)
 int btrfs_cleanup_fs_uuids(void)
 {
 	struct btrfs_fs_devices *fs_devices;
-	struct list_head *uuid_cur;
-	struct list_head *devices_cur;
 	struct btrfs_device *dev;
 
-	list_for_each(uuid_cur, &fs_uuids) {
-		fs_devices = list_entry(uuid_cur, struct btrfs_fs_devices,
-					list);
+	while (!list_empty(&fs_uuids)) {
+		fs_devices = list_entry(fs_uuids.next,
+					struct btrfs_fs_devices, list);
+		list_del(&fs_devices->list);
 		while(!list_empty(&fs_devices->devices)) {
-			devices_cur = fs_devices->devices.next;
-			dev = list_entry(devices_cur, struct btrfs_device,
-					 dev_list);
+			dev = list_entry(fs_devices->devices.next,
+					 struct btrfs_device, dev_list);
 			if (dev->bdev) {
 				close_bdev_excl(dev->bdev);
 				fs_devices->open_devices--;
 			}
+			fs_devices->num_devices--;
+			if (dev->writeable)
+				fs_devices->rw_devices--;
 			list_del(&dev->dev_list);
+			list_del(&dev->dev_alloc_list);
 			kfree(dev->name);
 			kfree(dev);
 		}
+		WARN_ON(fs_devices->num_devices);
+		WARN_ON(fs_devices->open_devices);
+		WARN_ON(fs_devices->rw_devices);
+		kfree(fs_devices);
 	}
 	return 0;
 }
@@ -257,6 +269,9 @@ static noinline int device_list_add(const char *path,
 				       disk_super->dev_item.uuid);
 	}
 	if (!device) {
+		if (fs_devices->opened)
+			return -EBUSY;
+
 		device = kzalloc(sizeof(*device), GFP_NOFS);
 		if (!device) {
 			/* we can safely leave the fs_devices entry around */
@@ -273,8 +288,9 @@ static noinline int device_list_add(const char *path,
 			kfree(device);
 			return -ENOMEM;
 		}
+		INIT_LIST_HEAD(&device->dev_alloc_list);
 		list_add(&device->dev_list, &fs_devices->devices);
-		list_add(&device->dev_alloc_list, &fs_devices->alloc_list);
+		device->fs_devices = fs_devices;
 		fs_devices->num_devices++;
 	}
 
@@ -288,58 +304,94 @@ static noinline int device_list_add(const char *path,
 
 int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices)
 {
-	struct list_head *head = &fs_devices->devices;
+	struct list_head *tmp;
 	struct list_head *cur;
 	struct btrfs_device *device;
+	int seed_devices = 0;
 
 	mutex_lock(&uuid_mutex);
 again:
-	list_for_each(cur, head) {
+	list_for_each_safe(cur, tmp, &fs_devices->devices) {
 		device = list_entry(cur, struct btrfs_device, dev_list);
-		if (!device->in_fs_metadata) {
-			struct block_device *bdev;
-			list_del(&device->dev_list);
-			list_del(&device->dev_alloc_list);
+		if (device->in_fs_metadata)
+			continue;
+
+		if (device->bdev) {
+			close_bdev_excl(device->bdev);
+			device->bdev = NULL;
+			fs_devices->open_devices--;
+		}
+		if (device->writeable) {
+			list_del_init(&device->dev_alloc_list);
+			device->writeable = 0;
+			fs_devices->rw_devices--;
+		}
+		if (!seed_devices) {
+			list_del_init(&device->dev_list);
 			fs_devices->num_devices--;
-			if (device->bdev) {
-				bdev = device->bdev;
-				fs_devices->open_devices--;
-				mutex_unlock(&uuid_mutex);
-				close_bdev_excl(bdev);
-				mutex_lock(&uuid_mutex);
-			}
 			kfree(device->name);
 			kfree(device);
-			goto again;
 		}
 	}
+
+	if (fs_devices->seed) {
+		fs_devices = fs_devices->seed;
+		seed_devices = 1;
+		goto again;
+	}
+
 	mutex_unlock(&uuid_mutex);
 	return 0;
 }
 
-int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
+static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
 {
-	struct list_head *head = &fs_devices->devices;
+	struct btrfs_fs_devices *seed_devices;
 	struct list_head *cur;
 	struct btrfs_device *device;
+again:
+	if (--fs_devices->opened > 0)
+		return 0;
 
-	mutex_lock(&uuid_mutex);
-	list_for_each(cur, head) {
+	list_for_each(cur, &fs_devices->devices) {
 		device = list_entry(cur, struct btrfs_device, dev_list);
 		if (device->bdev) {
 			close_bdev_excl(device->bdev);
 			fs_devices->open_devices--;
 		}
+		if (device->writeable) {
+			list_del_init(&device->dev_alloc_list);
+			fs_devices->rw_devices--;
+		}
+
 		device->bdev = NULL;
+		device->writeable = 0;
 		device->in_fs_metadata = 0;
 	}
-	fs_devices->mounted = 0;
-	mutex_unlock(&uuid_mutex);
+	fs_devices->opened = 0;
+	fs_devices->seeding = 0;
+	fs_devices->sprouted = 0;
+
+	seed_devices = fs_devices->seed;
+	fs_devices->seed = NULL;
+	if (seed_devices) {
+		fs_devices = seed_devices;
+		goto again;
+	}
 	return 0;
 }
 
-int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
-		       int flags, void *holder)
+int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
+{
+	int ret;
+
+	mutex_lock(&uuid_mutex);
+	ret = __btrfs_close_devices(fs_devices);
+	mutex_unlock(&uuid_mutex);
+	return ret;
+}
+
+int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, void *holder)
 {
 	struct block_device *bdev;
 	struct list_head *head = &fs_devices->devices;
@@ -350,24 +402,18 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 	struct btrfs_super_block *disk_super;
 	u64 latest_devid = 0;
 	u64 latest_transid = 0;
-	u64 transid;
 	u64 devid;
+	int seeding = 1;
 	int ret = 0;
 
-	mutex_lock(&uuid_mutex);
-	if (fs_devices->mounted)
-		goto out;
-
 	list_for_each(cur, head) {
 		device = list_entry(cur, struct btrfs_device, dev_list);
 		if (device->bdev)
 			continue;
-
 		if (!device->name)
 			continue;
 
-		bdev = open_bdev_excl(device->name, flags, holder);
-
+		bdev = open_bdev_excl(device->name, MS_RDONLY, holder);
 		if (IS_ERR(bdev)) {
 			printk("open %s failed\n", device->name);
 			goto error;
@@ -387,16 +433,32 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 		if (devid != device->devid)
 			goto error_brelse;
 
-		transid = btrfs_super_generation(disk_super);
-		if (!latest_transid || transid > latest_transid) {
+		if (memcmp(device->uuid, disk_super->dev_item.uuid,
+			   BTRFS_UUID_SIZE))
+			goto error_brelse;
+
+		device->generation = btrfs_super_generation(disk_super);
+		if (!latest_transid || device->generation > latest_transid) {
 			latest_devid = devid;
-			latest_transid = transid;
+			latest_transid = device->generation;
 			latest_bdev = bdev;
 		}
 
+		if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
+			device->writeable = 0;
+		} else {
+			device->writeable = !bdev_read_only(bdev);
+			seeding = 0;
+		}
+
 		device->bdev = bdev;
 		device->in_fs_metadata = 0;
 		fs_devices->open_devices++;
+		if (device->writeable) {
+			fs_devices->rw_devices++;
+			list_add(&device->dev_alloc_list,
+				 &fs_devices->alloc_list);
+		}
 		continue;
 
 error_brelse:
@@ -410,11 +472,32 @@ error:
 		ret = -EIO;
 		goto out;
 	}
-	fs_devices->mounted = 1;
+	fs_devices->seeding = seeding;
+	fs_devices->opened = 1;
 	fs_devices->latest_bdev = latest_bdev;
 	fs_devices->latest_devid = latest_devid;
 	fs_devices->latest_trans = latest_transid;
+	fs_devices->total_rw_bytes = 0;
 out:
+	return ret;
+}
+
+int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
+		       int flags, void *holder)
+{
+	int ret;
+
+	mutex_lock(&uuid_mutex);
+	if (fs_devices->opened) {
+		if (fs_devices->sprouted) {
+			ret = -EBUSY;
+		} else {
+			fs_devices->opened++;
+			ret = 0;
+		}
+	} else {
+		ret = __btrfs_open_devices(fs_devices, holder);
+	}
 	mutex_unlock(&uuid_mutex);
 	return ret;
 }
@@ -481,12 +564,12 @@ error:
  */
 static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans,
 					 struct btrfs_device *device,
-					 struct btrfs_path *path,
 					 u64 num_bytes, u64 *start)
 {
 	struct btrfs_key key;
 	struct btrfs_root *root = device->dev_root;
 	struct btrfs_dev_extent *dev_extent = NULL;
+	struct btrfs_path *path;
 	u64 hole_size = 0;
 	u64 last_byte = 0;
 	u64 search_start = 0;
@@ -496,8 +579,11 @@ static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans,
 	int start_found;
 	struct extent_buffer *l;
 
-	start_found = 0;
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
 	path->reada = 2;
+	start_found = 0;
 
 	/* FIXME use last free of some kind */
 
@@ -581,7 +667,6 @@ check_pending:
 	/* we have to make sure we didn't find an extent that has already
 	 * been allocated by the map tree or the original allocation
 	 */
-	btrfs_release_path(root, path);
 	BUG_ON(*start < search_start);
 
 	if (*start + num_bytes > search_end) {
@@ -589,10 +674,10 @@ check_pending:
 		goto error;
 	}
 	/* check for pending inserts here */
-	return 0;
+	ret = 0;
 
 error:
-	btrfs_release_path(root, path);
+	btrfs_free_path(path);
 	return ret;
 }
 
@@ -644,11 +729,10 @@ int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
-int noinline btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
+int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
 			   struct btrfs_device *device,
 			   u64 chunk_tree, u64 chunk_objectid,
-			   u64 chunk_offset,
-			   u64 num_bytes, u64 *start)
+			   u64 chunk_offset, u64 start, u64 num_bytes)
 {
 	int ret;
 	struct btrfs_path *path;
@@ -662,13 +746,8 @@ int noinline btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
 	if (!path)
 		return -ENOMEM;
 
-	ret = find_free_dev_extent(trans, device, path, num_bytes, start);
-	if (ret) {
-		goto err;
-	}
-
 	key.objectid = device->devid;
-	key.offset = *start;
+	key.offset = start;
 	key.type = BTRFS_DEV_EXTENT_KEY;
 	ret = btrfs_insert_empty_item(trans, root, path, &key,
 				      sizeof(*extent));
@@ -687,7 +766,6 @@ int noinline btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
 
 	btrfs_set_dev_extent_length(leaf, extent, num_bytes);
 	btrfs_mark_buffer_dirty(leaf);
-err:
 	btrfs_free_path(path);
 	return ret;
 }
@@ -735,12 +813,18 @@ error:
 	return ret;
 }
 
-static noinline int find_next_devid(struct btrfs_root *root,
-				    struct btrfs_path *path, u64 *objectid)
+static noinline int find_next_devid(struct btrfs_root *root, u64 *objectid)
 {
 	int ret;
 	struct btrfs_key key;
 	struct btrfs_key found_key;
+	struct btrfs_path *path;
+
+	root = root->fs_info->chunk_root;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
 
 	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
 	key.type = BTRFS_DEV_ITEM_KEY;
@@ -763,7 +847,7 @@ static noinline int find_next_devid(struct btrfs_root *root,
 	}
 	ret = 0;
 error:
-	btrfs_release_path(root, path);
+	btrfs_free_path(path);
 	return ret;
 }
 
@@ -781,7 +865,6 @@ int btrfs_add_device(struct btrfs_trans_handle *trans,
 	struct extent_buffer *leaf;
 	struct btrfs_key key;
 	unsigned long ptr;
-	u64 free_devid = 0;
 
 	root = root->fs_info->chunk_root;
 
@@ -789,13 +872,9 @@ int btrfs_add_device(struct btrfs_trans_handle *trans,
 	if (!path)
 		return -ENOMEM;
 
-	ret = find_next_devid(root, path, &free_devid);
-	if (ret)
-		goto out;
-
 	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
 	key.type = BTRFS_DEV_ITEM_KEY;
-	key.offset = free_devid;
+	key.offset = device->devid;
 
 	ret = btrfs_insert_empty_item(trans, root, path, &key,
 				      sizeof(*dev_item));
@@ -805,8 +884,8 @@ int btrfs_add_device(struct btrfs_trans_handle *trans,
 	leaf = path->nodes[0];
 	dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
 
-	device->devid = free_devid;
 	btrfs_set_device_id(leaf, dev_item, device->devid);
+	btrfs_set_device_generation(leaf, dev_item, 0);
 	btrfs_set_device_type(leaf, dev_item, device->type);
 	btrfs_set_device_io_align(leaf, dev_item, device->io_align);
 	btrfs_set_device_io_width(leaf, dev_item, device->io_width);
@@ -819,9 +898,11 @@ int btrfs_add_device(struct btrfs_trans_handle *trans,
 
 	ptr = (unsigned long)btrfs_device_uuid(dev_item);
 	write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
+	ptr = (unsigned long)btrfs_device_fsid(dev_item);
+	write_extent_buffer(leaf, root->fs_info->fsid, ptr, BTRFS_UUID_SIZE);
 	btrfs_mark_buffer_dirty(leaf);
-	ret = 0;
 
+	ret = 0;
 out:
 	btrfs_free_path(path);
 	return ret;
@@ -832,11 +913,7 @@ static int btrfs_rm_dev_item(struct btrfs_root *root,
 {
 	int ret;
 	struct btrfs_path *path;
-	struct block_device *bdev = device->bdev;
-	struct btrfs_device *next_dev;
 	struct btrfs_key key;
-	u64 total_bytes;
-	struct btrfs_fs_devices *fs_devices;
 	struct btrfs_trans_handle *trans;
 
 	root = root->fs_info->chunk_root;
@@ -863,25 +940,6 @@ static int btrfs_rm_dev_item(struct btrfs_root *root,
 	ret = btrfs_del_item(trans, root, path);
 	if (ret)
 		goto out;
-
-	/*
-	 * at this point, the device is zero sized.  We want to
-	 * remove it from the devices list and zero out the old super
-	 */
-	list_del_init(&device->dev_list);
-	list_del_init(&device->dev_alloc_list);
-	fs_devices = root->fs_info->fs_devices;
-
-	next_dev = list_entry(fs_devices->devices.next, struct btrfs_device,
-			      dev_list);
-	if (bdev == root->fs_info->sb->s_bdev)
-		root->fs_info->sb->s_bdev = next_dev->bdev;
-	if (bdev == fs_devices->latest_bdev)
-		fs_devices->latest_bdev = next_dev->bdev;
-
-	total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy);
-	btrfs_set_super_num_devices(&root->fs_info->super_copy,
-				    total_bytes - 1);
 out:
 	btrfs_free_path(path);
 	unlock_chunks(root);
@@ -892,11 +950,14 @@ out:
 int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 {
 	struct btrfs_device *device;
+	struct btrfs_device *next_device;
 	struct block_device *bdev;
 	struct buffer_head *bh = NULL;
 	struct btrfs_super_block *disk_super;
 	u64 all_avail;
 	u64 devid;
+	u64 num_devices;
+	u8 *dev_uuid;
 	int ret = 0;
 
 	mutex_lock(&uuid_mutex);
@@ -907,14 +968,14 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 		root->fs_info->avail_metadata_alloc_bits;
 
 	if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) &&
-	    btrfs_super_num_devices(&root->fs_info->super_copy) <= 4) {
+	    root->fs_info->fs_devices->rw_devices <= 4) {
 		printk("btrfs: unable to go below four devices on raid10\n");
 		ret = -EINVAL;
 		goto out;
 	}
 
 	if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) &&
-	    btrfs_super_num_devices(&root->fs_info->super_copy) <= 2) {
+	    root->fs_info->fs_devices->rw_devices <= 2) {
 		printk("btrfs: unable to go below two devices on raid1\n");
 		ret = -EINVAL;
 		goto out;
@@ -941,15 +1002,15 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 			printk("btrfs: no missing devices found to remove\n");
 			goto out;
 		}
-
 	} else {
-		bdev = open_bdev_excl(device_path, 0,
+		bdev = open_bdev_excl(device_path, MS_RDONLY,
 				      root->fs_info->bdev_holder);
 		if (IS_ERR(bdev)) {
 			ret = PTR_ERR(bdev);
 			goto out;
 		}
 
+		set_blocksize(bdev, 4096);
 		bh = __bread(bdev, BTRFS_SUPER_INFO_OFFSET / 4096, 4096);
 		if (!bh) {
 			ret = -EIO;
@@ -957,45 +1018,97 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 		}
 		disk_super = (struct btrfs_super_block *)bh->b_data;
 		if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC,
-		    sizeof(disk_super->magic))) {
-			ret = -ENOENT;
-			goto error_brelse;
-		}
-		if (memcmp(disk_super->fsid, root->fs_info->fsid,
-			   BTRFS_FSID_SIZE)) {
+			    sizeof(disk_super->magic))) {
 			ret = -ENOENT;
 			goto error_brelse;
 		}
 		devid = le64_to_cpu(disk_super->dev_item.devid);
-		device = btrfs_find_device(root, devid, NULL);
+		dev_uuid = disk_super->dev_item.uuid;
+		device = btrfs_find_device(root, devid, dev_uuid,
+					   disk_super->fsid);
 		if (!device) {
 			ret = -ENOENT;
 			goto error_brelse;
 		}
+	}
 
+	if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) {
+		printk("btrfs: unable to remove the only writeable device\n");
+		ret = -EINVAL;
+		goto error_brelse;
+	}
+
+	if (device->writeable) {
+		list_del_init(&device->dev_alloc_list);
+		root->fs_info->fs_devices->rw_devices--;
 	}
-	root->fs_info->fs_devices->num_devices--;
-	root->fs_info->fs_devices->open_devices--;
 
 	ret = btrfs_shrink_device(device, 0);
 	if (ret)
 		goto error_brelse;
 
-
 	ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device);
 	if (ret)
 		goto error_brelse;
 
-	if (bh) {
+	device->in_fs_metadata = 0;
+	if (device->fs_devices == root->fs_info->fs_devices) {
+		list_del_init(&device->dev_list);
+		root->fs_info->fs_devices->num_devices--;
+		if (device->bdev)
+			device->fs_devices->open_devices--;
+	}
+
+	next_device = list_entry(root->fs_info->fs_devices->devices.next,
+				 struct btrfs_device, dev_list);
+	if (device->bdev == root->fs_info->sb->s_bdev)
+		root->fs_info->sb->s_bdev = next_device->bdev;
+	if (device->bdev == root->fs_info->fs_devices->latest_bdev)
+		root->fs_info->fs_devices->latest_bdev = next_device->bdev;
+
+	num_devices = btrfs_super_num_devices(&root->fs_info->super_copy) - 1;
+	btrfs_set_super_num_devices(&root->fs_info->super_copy, num_devices);
+
+	if (device->fs_devices != root->fs_info->fs_devices) {
+		BUG_ON(device->writeable);
+		brelse(bh);
+		if (bdev)
+			close_bdev_excl(bdev);
+
+		if (device->bdev) {
+			close_bdev_excl(device->bdev);
+			device->bdev = NULL;
+			device->fs_devices->open_devices--;
+		}
+		if (device->fs_devices->open_devices == 0) {
+			struct btrfs_fs_devices *fs_devices;
+			fs_devices = root->fs_info->fs_devices;
+			while (fs_devices) {
+				if (fs_devices->seed == device->fs_devices)
+					break;
+				fs_devices = fs_devices->seed;
+			}
+			fs_devices->seed = device->fs_devices->seed;
+			device->fs_devices->seed = NULL;
+			__btrfs_close_devices(device->fs_devices);
+		}
+		ret = 0;
+		goto out;
+	}
+
+	/*
+	 * at this point, the device is zero sized.  We want to
+	 * remove it from the devices list and zero out the old super
+	 */
+	if (device->writeable) {
 		/* make sure this device isn't detected as part of
 		 * the FS anymore
 		 */
 		memset(&disk_super->magic, 0, sizeof(disk_super->magic));
 		set_buffer_dirty(bh);
 		sync_dirty_buffer(bh);
-
-		brelse(bh);
 	}
+	brelse(bh);
 
 	if (device->bdev) {
 		/* one close for the device struct or super_block */
@@ -1021,6 +1134,129 @@ out:
 	return ret;
 }
 
+/*
+ * does all the dirty work required for changing the file system's UUID.
+ */
+static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root)
+{
+	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
+	struct btrfs_fs_devices *old_devices;
+	struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
+	struct btrfs_device *device;
+	u64 super_flags;
+
+	BUG_ON(!mutex_is_locked(&uuid_mutex));
+	if (!fs_devices->seeding || fs_devices->opened != 1)
+		return -EINVAL;
+
+	old_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
+	if (!old_devices)
+		return -ENOMEM;
+
+	memcpy(old_devices, fs_devices, sizeof(*old_devices));
+	old_devices->opened = 1;
+	old_devices->sprouted = 1;
+	INIT_LIST_HEAD(&old_devices->devices);
+	INIT_LIST_HEAD(&old_devices->alloc_list);
+	list_splice_init(&fs_devices->devices, &old_devices->devices);
+	list_splice_init(&fs_devices->alloc_list, &old_devices->alloc_list);
+	list_for_each_entry(device, &old_devices->devices, dev_list) {
+		device->fs_devices = old_devices;
+	}
+	list_add(&old_devices->list, &fs_uuids);
+
+	fs_devices->seeding = 0;
+	fs_devices->num_devices = 0;
+	fs_devices->open_devices = 0;
+	fs_devices->seed = old_devices;
+
+	generate_random_uuid(fs_devices->fsid);
+	memcpy(root->fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
+	memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
+	super_flags = btrfs_super_flags(disk_super) &
+		      ~BTRFS_SUPER_FLAG_SEEDING;
+	btrfs_set_super_flags(disk_super, super_flags);
+
+	return 0;
+}
+
+/*
+ * store the expected generation for seed devices in device items.
+ */
+static int btrfs_finish_sprout(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *root)
+{
+	struct btrfs_path *path;
+	struct extent_buffer *leaf;
+	struct btrfs_dev_item *dev_item;
+	struct btrfs_device *device;
+	struct btrfs_key key;
+	u8 fs_uuid[BTRFS_UUID_SIZE];
+	u8 dev_uuid[BTRFS_UUID_SIZE];
+	u64 devid;
+	int ret;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	root = root->fs_info->chunk_root;
+	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
+	key.offset = 0;
+	key.type = BTRFS_DEV_ITEM_KEY;
+
+	while (1) {
+		ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+		if (ret < 0)
+			goto error;
+
+		leaf = path->nodes[0];
+next_slot:
+		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
+			ret = btrfs_next_leaf(root, path);
+			if (ret > 0)
+				break;
+			if (ret < 0)
+				goto error;
+			leaf = path->nodes[0];
+			btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+			btrfs_release_path(root, path);
+			continue;
+		}
+
+		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+		if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
+		    key.type != BTRFS_DEV_ITEM_KEY)
+			break;
+
+		dev_item = btrfs_item_ptr(leaf, path->slots[0],
+					  struct btrfs_dev_item);
+		devid = btrfs_device_id(leaf, dev_item);
+		read_extent_buffer(leaf, dev_uuid,
+				   (unsigned long)btrfs_device_uuid(dev_item),
+				   BTRFS_UUID_SIZE);
+		read_extent_buffer(leaf, fs_uuid,
+				   (unsigned long)btrfs_device_fsid(dev_item),
+				   BTRFS_UUID_SIZE);
+		device = btrfs_find_device(root, devid, dev_uuid, fs_uuid);
+		BUG_ON(!device);
+
+		if (device->fs_devices->seeding) {
+			btrfs_set_device_generation(leaf, dev_item,
+						    device->generation);
+			btrfs_mark_buffer_dirty(leaf);
+		}
+
+		path->slots[0]++;
+		goto next_slot;
+	}
+	ret = 0;
+error:
+	btrfs_free_path(path);
+	return ret;
+}
+
 int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 {
 	struct btrfs_trans_handle *trans;
@@ -1028,26 +1264,34 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 	struct block_device *bdev;
 	struct list_head *cur;
 	struct list_head *devices;
+	struct super_block *sb = root->fs_info->sb;
 	u64 total_bytes;
+	int seeding_dev = 0;
 	int ret = 0;
 
+	if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
+		return -EINVAL;
 
 	bdev = open_bdev_excl(device_path, 0, root->fs_info->bdev_holder);
 	if (!bdev) {
 		return -EIO;
 	}
 
+	if (root->fs_info->fs_devices->seeding) {
+		seeding_dev = 1;
+		down_write(&sb->s_umount);
+		mutex_lock(&uuid_mutex);
+	}
+
 	filemap_write_and_wait(bdev->bd_inode->i_mapping);
 	mutex_lock(&root->fs_info->volume_mutex);
 
-	trans = btrfs_start_transaction(root, 1);
-	lock_chunks(root);
 	devices = &root->fs_info->fs_devices->devices;
 	list_for_each(cur, devices) {
 		device = list_entry(cur, struct btrfs_device, dev_list);
 		if (device->bdev == bdev) {
 			ret = -EEXIST;
-			goto out;
+			goto error;
 		}
 	}
 
@@ -1055,18 +1299,31 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 	if (!device) {
 		/* we can safely leave the fs_devices entry around */
 		ret = -ENOMEM;
-		goto out_close_bdev;
+		goto error;
 	}
 
-	device->barriers = 1;
-	device->work.func = pending_bios_fn;
-	generate_random_uuid(device->uuid);
-	spin_lock_init(&device->io_lock);
 	device->name = kstrdup(device_path, GFP_NOFS);
 	if (!device->name) {
 		kfree(device);
-		goto out_close_bdev;
+		ret = -ENOMEM;
+		goto error;
 	}
+
+	ret = find_next_devid(root, &device->devid);
+	if (ret) {
+		kfree(device);
+		goto error;
+	}
+
+	trans = btrfs_start_transaction(root, 1);
+	lock_chunks(root);
+
+	device->barriers = 1;
+	device->writeable = 1;
+	device->work.func = pending_bios_fn;
+	generate_random_uuid(device->uuid);
+	spin_lock_init(&device->io_lock);
+	device->generation = trans->transid;
 	device->io_width = root->sectorsize;
 	device->io_align = root->sectorsize;
 	device->sector_size = root->sectorsize;
@@ -1074,12 +1331,22 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 	device->dev_root = root->fs_info->dev_root;
 	device->bdev = bdev;
 	device->in_fs_metadata = 1;
+	set_blocksize(device->bdev, 4096);
 
-	ret = btrfs_add_device(trans, root, device);
-	if (ret)
-		goto out_close_bdev;
+	if (seeding_dev) {
+		sb->s_flags &= ~MS_RDONLY;
+		ret = btrfs_prepare_sprout(trans, root);
+		BUG_ON(ret);
+	}
 
-	set_blocksize(device->bdev, 4096);
+	device->fs_devices = root->fs_info->fs_devices;
+	list_add(&device->dev_list, &root->fs_info->fs_devices->devices);
+	list_add(&device->dev_alloc_list,
+		 &root->fs_info->fs_devices->alloc_list);
+	root->fs_info->fs_devices->num_devices++;
+	root->fs_info->fs_devices->open_devices++;
+	root->fs_info->fs_devices->rw_devices++;
+	root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;
 
 	total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
 	btrfs_set_super_total_bytes(&root->fs_info->super_copy,
@@ -1089,20 +1356,34 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 	btrfs_set_super_num_devices(&root->fs_info->super_copy,
 				    total_bytes + 1);
 
-	list_add(&device->dev_list, &root->fs_info->fs_devices->devices);
-	list_add(&device->dev_alloc_list,
-		 &root->fs_info->fs_devices->alloc_list);
-	root->fs_info->fs_devices->num_devices++;
-	root->fs_info->fs_devices->open_devices++;
-out:
+	if (seeding_dev) {
+		ret = init_first_rw_device(trans, root, device);
+		BUG_ON(ret);
+		ret = btrfs_finish_sprout(trans, root);
+		BUG_ON(ret);
+	} else {
+		ret = btrfs_add_device(trans, root, device);
+	}
+
 	unlock_chunks(root);
-	btrfs_end_transaction(trans, root);
-	mutex_unlock(&root->fs_info->volume_mutex);
+	btrfs_commit_transaction(trans, root);
 
-	return ret;
+	if (seeding_dev) {
+		mutex_unlock(&uuid_mutex);
+		up_write(&sb->s_umount);
 
-out_close_bdev:
+		ret = btrfs_relocate_sys_chunks(root);
+		BUG_ON(ret);
+	}
+out:
+	mutex_unlock(&root->fs_info->volume_mutex);
+	return ret;
+error:
 	close_bdev_excl(bdev);
+	if (seeding_dev) {
+		mutex_unlock(&uuid_mutex);
+		up_write(&sb->s_umount);
+	}
 	goto out;
 }
 
@@ -1160,7 +1441,15 @@ static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
 	u64 old_total = btrfs_super_total_bytes(super_copy);
 	u64 diff = new_size - device->total_bytes;
 
+	if (!device->writeable)
+		return -EACCES;
+	if (new_size <= device->total_bytes)
+		return -EINVAL;
+
 	btrfs_set_super_total_bytes(super_copy, old_total + diff);
+	device->fs_devices->total_rw_bytes += diff;
+
+	device->total_bytes = new_size;
 	return btrfs_update_device(trans, device);
 }
 
@@ -1248,7 +1537,6 @@ int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
 	return ret;
 }
 
-
 int btrfs_relocate_chunk(struct btrfs_root *root,
 			 u64 chunk_tree, u64 chunk_objectid,
 			 u64 chunk_offset)
@@ -1308,24 +1596,82 @@ int btrfs_relocate_chunk(struct btrfs_root *root,
 		BUG_ON(ret);
 	}
 
-	ret = btrfs_remove_block_group(trans, extent_root, chunk_offset);
-	BUG_ON(ret);
+	ret = btrfs_remove_block_group(trans, extent_root, chunk_offset);
+	BUG_ON(ret);
+
+	spin_lock(&em_tree->lock);
+	remove_extent_mapping(em_tree, em);
+	spin_unlock(&em_tree->lock);
+
+	kfree(map);
+	em->bdev = NULL;
+
+	/* once for the tree */
+	free_extent_map(em);
+	/* once for us */
+	free_extent_map(em);
+
+	unlock_chunks(root);
+	btrfs_end_transaction(trans, root);
+	return 0;
+}
+
+static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
+{
+	struct btrfs_root *chunk_root = root->fs_info->chunk_root;
+	struct btrfs_path *path;
+	struct extent_buffer *leaf;
+	struct btrfs_chunk *chunk;
+	struct btrfs_key key;
+	struct btrfs_key found_key;
+	u64 chunk_tree = chunk_root->root_key.objectid;
+	u64 chunk_type;
+	int ret;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
+	key.offset = (u64)-1;
+	key.type = BTRFS_CHUNK_ITEM_KEY;
+
+	while (1) {
+		ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
+		if (ret < 0)
+			goto error;
+		BUG_ON(ret == 0);
+
+		ret = btrfs_previous_item(chunk_root, path, key.objectid,
+					  key.type);
+		if (ret < 0)
+			goto error;
+		if (ret > 0)
+			break;
 
-	spin_lock(&em_tree->lock);
-	remove_extent_mapping(em_tree, em);
-	spin_unlock(&em_tree->lock);
+		leaf = path->nodes[0];
+		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
 
-	kfree(map);
-	em->bdev = NULL;
+		chunk = btrfs_item_ptr(leaf, path->slots[0],
+				       struct btrfs_chunk);
+		chunk_type = btrfs_chunk_type(leaf, chunk);
+		btrfs_release_path(chunk_root, path);
 
-	/* once for the tree */
-	free_extent_map(em);
-	/* once for us */
-	free_extent_map(em);
+		if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
+			ret = btrfs_relocate_chunk(chunk_root, chunk_tree,
+						   found_key.objectid,
+						   found_key.offset);
+			BUG_ON(ret);
+		}
 
-	unlock_chunks(root);
-	btrfs_end_transaction(trans, root);
-	return 0;
+		if (found_key.offset == 0)
+			break;
+		key.offset = found_key.offset - 1;
+	}
+	ret = 0;
+error:
+	btrfs_free_path(path);
+	return ret;
 }
 
 static u64 div_factor(u64 num, int factor)
@@ -1337,7 +1683,6 @@ static u64 div_factor(u64 num, int factor)
 	return num;
 }
 
-
 int btrfs_balance(struct btrfs_root *dev_root)
 {
 	int ret;
@@ -1353,6 +1698,8 @@ int btrfs_balance(struct btrfs_root *dev_root)
 	struct btrfs_trans_handle *trans;
 	struct btrfs_key found_key;
 
+	if (dev_root->fs_info->sb->s_flags & MS_RDONLY)
+		return -EROFS;
 
 	mutex_lock(&dev_root->fs_info->volume_mutex);
 	dev_root = dev_root->fs_info->dev_root;
@@ -1363,7 +1710,8 @@ int btrfs_balance(struct btrfs_root *dev_root)
 		old_size = device->total_bytes;
 		size_to_free = div_factor(old_size, 1);
 		size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
-		if (device->total_bytes - device->bytes_used > size_to_free)
+		if (!device->writeable ||
+		    device->total_bytes - device->bytes_used > size_to_free)
 			continue;
 
 		ret = btrfs_shrink_device(device, old_size - size_to_free);
@@ -1453,6 +1801,8 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
 	u64 old_total = btrfs_super_total_bytes(super_copy);
 	u64 diff = device->total_bytes - new_size;
 
+	if (new_size >= device->total_bytes)
+		return -EINVAL;
 
 	path = btrfs_alloc_path();
 	if (!path)
@@ -1469,6 +1819,8 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
 	lock_chunks(root);
 
 	device->total_bytes = new_size;
+	if (device->writeable)
+		device->fs_devices->total_rw_bytes -= diff;
 	ret = btrfs_update_device(trans, device);
 	if (ret) {
 		unlock_chunks(root);
@@ -1561,32 +1913,27 @@ static u64 noinline chunk_bytes_by_type(u64 type, u64 calc_size,
 		return calc_size * num_stripes;
 }
 
-
-int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
-		      struct btrfs_root *extent_root, u64 *start,
-		      u64 *num_bytes, u64 type)
+static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *extent_root,
+			       struct map_lookup **map_ret,
+			       u64 *num_bytes, u64 *stripe_size,
+			       u64 start, u64 type)
 {
-	u64 dev_offset;
 	struct btrfs_fs_info *info = extent_root->fs_info;
-	struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
-	struct btrfs_path *path;
-	struct btrfs_stripe *stripes;
 	struct btrfs_device *device = NULL;
-	struct btrfs_chunk *chunk;
-	struct list_head private_devs;
-	struct list_head *dev_list;
+	struct btrfs_fs_devices *fs_devices = info->fs_devices;
 	struct list_head *cur;
+	struct map_lookup *map = NULL;
 	struct extent_map_tree *em_tree;
-	struct map_lookup *map;
 	struct extent_map *em;
+	struct list_head private_devs;
 	int min_stripe_size = 1 * 1024 * 1024;
-	u64 physical;
 	u64 calc_size = 1024 * 1024 * 1024;
 	u64 max_chunk_size = calc_size;
 	u64 min_free;
 	u64 avail;
 	u64 max_avail = 0;
-	u64 percent_max;
+	u64 dev_offset;
 	int num_stripes = 1;
 	int min_stripes = 1;
 	int sub_stripes = 0;
@@ -1594,19 +1941,17 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	int ret;
 	int index;
 	int stripe_len = 64 * 1024;
-	struct btrfs_key key;
 
 	if ((type & BTRFS_BLOCK_GROUP_RAID1) &&
 	    (type & BTRFS_BLOCK_GROUP_DUP)) {
 		WARN_ON(1);
 		type &= ~BTRFS_BLOCK_GROUP_DUP;
 	}
-	dev_list = &extent_root->fs_info->fs_devices->alloc_list;
-	if (list_empty(dev_list))
+	if (list_empty(&fs_devices->alloc_list))
 		return -ENOSPC;
 
 	if (type & (BTRFS_BLOCK_GROUP_RAID0)) {
-		num_stripes = extent_root->fs_info->fs_devices->open_devices;
+		num_stripes = fs_devices->rw_devices;
 		min_stripes = 2;
 	}
 	if (type & (BTRFS_BLOCK_GROUP_DUP)) {
@@ -1614,14 +1959,13 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 		min_stripes = 2;
 	}
 	if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
-		num_stripes = min_t(u64, 2,
-			    extent_root->fs_info->fs_devices->open_devices);
+		num_stripes = min_t(u64, 2, fs_devices->rw_devices);
 		if (num_stripes < 2)
 			return -ENOSPC;
 		min_stripes = 2;
 	}
 	if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
-		num_stripes = extent_root->fs_info->fs_devices->open_devices;
+		num_stripes = fs_devices->rw_devices;
 		if (num_stripes < 4)
 			return -ENOSPC;
 		num_stripes &= ~(u32)1;
@@ -1641,15 +1985,19 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 		min_stripe_size = 1 * 1024 * 1024;
 	}
 
-	path = btrfs_alloc_path();
-	if (!path)
-		return -ENOMEM;
-
-	/* we don't want a chunk larger than 10% of the FS */
-	percent_max = div_factor(btrfs_super_total_bytes(&info->super_copy), 1);
-	max_chunk_size = min(percent_max, max_chunk_size);
+	/* we don't want a chunk larger than 10% of writeable space */
+	max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
+			     max_chunk_size);
 
 again:
+	if (!map || map->num_stripes != num_stripes) {
+		kfree(map);
+		map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
+		if (!map)
+			return -ENOMEM;
+		map->num_stripes = num_stripes;
+	}
+
 	if (calc_size * num_stripes > max_chunk_size) {
 		calc_size = max_chunk_size;
 		do_div(calc_size, num_stripes);
@@ -1662,8 +2010,7 @@ again:
 	do_div(calc_size, stripe_len);
 	calc_size *= stripe_len;
 
-	INIT_LIST_HEAD(&private_devs);
-	cur = dev_list->next;
+	cur = fs_devices->alloc_list.next;
 	index = 0;
 
 	if (type & BTRFS_BLOCK_GROUP_DUP)
@@ -1679,10 +2026,10 @@ again:
 	if (!looped)
 		min_free += 1024 * 1024;
 
-	/* build a private list of devices we will allocate from */
+	INIT_LIST_HEAD(&private_devs);
 	while(index < num_stripes) {
 		device = list_entry(cur, struct btrfs_device, dev_alloc_list);
-
+		BUG_ON(!device->writeable);
 		if (device->total_bytes > device->bytes_used)
 			avail = device->total_bytes - device->bytes_used;
 		else
@@ -1690,24 +2037,28 @@ again:
 		cur = cur->next;
 
 		if (device->in_fs_metadata && avail >= min_free) {
-			u64 ignored_start = 0;
-			ret = find_free_dev_extent(trans, device, path,
-						   min_free,
-						   &ignored_start);
+			ret = find_free_dev_extent(trans, device,
+						   min_free, &dev_offset);
 			if (ret == 0) {
 				list_move_tail(&device->dev_alloc_list,
 					       &private_devs);
+				map->stripes[index].dev = device;
+				map->stripes[index].physical = dev_offset;
 				index++;
-				if (type & BTRFS_BLOCK_GROUP_DUP)
+				if (type & BTRFS_BLOCK_GROUP_DUP) {
+					map->stripes[index].dev = device;
+					map->stripes[index].physical =
+						dev_offset + calc_size;
 					index++;
+				}
 			}
 		} else if (device->in_fs_metadata && avail > max_avail)
 			max_avail = avail;
-		if (cur == dev_list)
+		if (cur == &fs_devices->alloc_list)
 			break;
 	}
+	list_splice(&private_devs, &fs_devices->alloc_list);
 	if (index < num_stripes) {
-		list_splice(&private_devs, dev_list);
 		if (index >= min_stripes) {
 			num_stripes = index;
 			if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
@@ -1722,115 +2073,246 @@ again:
 			calc_size = max_avail;
 			goto again;
 		}
-		btrfs_free_path(path);
+		kfree(map);
 		return -ENOSPC;
 	}
-	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
-	key.type = BTRFS_CHUNK_ITEM_KEY;
-	ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID,
-			      &key.offset);
-	if (ret) {
-		btrfs_free_path(path);
-		return ret;
-	}
+	map->sector_size = extent_root->sectorsize;
+	map->stripe_len = stripe_len;
+	map->io_align = stripe_len;
+	map->io_width = stripe_len;
+	map->type = type;
+	map->num_stripes = num_stripes;
+	map->sub_stripes = sub_stripes;
 
-	chunk = kmalloc(btrfs_chunk_item_size(num_stripes), GFP_NOFS);
-	if (!chunk) {
-		btrfs_free_path(path);
-		return -ENOMEM;
-	}
+	*map_ret = map;
+	*stripe_size = calc_size;
+	*num_bytes = chunk_bytes_by_type(type, calc_size,
+					 num_stripes, sub_stripes);
 
-	map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
-	if (!map) {
-		kfree(chunk);
-		btrfs_free_path(path);
+	em = alloc_extent_map(GFP_NOFS);
+	if (!em) {
+		kfree(map);
 		return -ENOMEM;
 	}
-	btrfs_free_path(path);
-	path = NULL;
+	em->bdev = (struct block_device *)map;
+	em->start = start;
+	em->len = *num_bytes;
+	em->block_start = 0;
+	em->block_len = em->len;
 
-	stripes = &chunk->stripe;
-	*num_bytes = chunk_bytes_by_type(type, calc_size,
-					 num_stripes, sub_stripes);
+	em_tree = &extent_root->fs_info->mapping_tree.map_tree;
+	spin_lock(&em_tree->lock);
+	ret = add_extent_mapping(em_tree, em);
+	spin_unlock(&em_tree->lock);
+	BUG_ON(ret);
+	free_extent_map(em);
 
-	index = 0;
-	while(index < num_stripes) {
-		struct btrfs_stripe *stripe;
-		BUG_ON(list_empty(&private_devs));
-		cur = private_devs.next;
-		device = list_entry(cur, struct btrfs_device, dev_alloc_list);
+	ret = btrfs_make_block_group(trans, extent_root, 0, type,
+				     BTRFS_FIRST_CHUNK_TREE_OBJECTID,
+				     start, *num_bytes);
+	BUG_ON(ret);
 
-		/* loop over this device again if we're doing a dup group */
-		if (!(type & BTRFS_BLOCK_GROUP_DUP) ||
-		    (index == num_stripes - 1))
-			list_move_tail(&device->dev_alloc_list, dev_list);
+	index = 0;
+	while (index < map->num_stripes) {
+		device = map->stripes[index].dev;
+		dev_offset = map->stripes[index].physical;
 
 		ret = btrfs_alloc_dev_extent(trans, device,
-			     info->chunk_root->root_key.objectid,
-			     BTRFS_FIRST_CHUNK_TREE_OBJECTID, key.offset,
-			     calc_size, &dev_offset);
+				info->chunk_root->root_key.objectid,
+				BTRFS_FIRST_CHUNK_TREE_OBJECTID,
+				start, dev_offset, calc_size);
 		BUG_ON(ret);
-		device->bytes_used += calc_size;
+		index++;
+	}
+
+	return 0;
+}
+
+static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
+				struct btrfs_root *extent_root,
+				struct map_lookup *map, u64 chunk_offset,
+				u64 chunk_size, u64 stripe_size)
+{
+	u64 dev_offset;
+	struct btrfs_key key;
+	struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
+	struct btrfs_device *device;
+	struct btrfs_chunk *chunk;
+	struct btrfs_stripe *stripe;
+	size_t item_size = btrfs_chunk_item_size(map->num_stripes);
+	int index = 0;
+	int ret;
+
+	chunk = kzalloc(item_size, GFP_NOFS);
+	if (!chunk)
+		return -ENOMEM;
+
+	index = 0;
+	while (index < map->num_stripes) {
+		device = map->stripes[index].dev;
+		device->bytes_used += stripe_size;
 		ret = btrfs_update_device(trans, device);
 		BUG_ON(ret);
+		index++;
+	}
+
+	index = 0;
+	stripe = &chunk->stripe;
+	while (index < map->num_stripes) {
+		device = map->stripes[index].dev;
+		dev_offset = map->stripes[index].physical;
 
-		map->stripes[index].dev = device;
-		map->stripes[index].physical = dev_offset;
-		stripe = stripes + index;
 		btrfs_set_stack_stripe_devid(stripe, device->devid);
 		btrfs_set_stack_stripe_offset(stripe, dev_offset);
 		memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
-		physical = dev_offset;
+		stripe++;
 		index++;
 	}
-	BUG_ON(!list_empty(&private_devs));
 
-	/* key was set above */
-	btrfs_set_stack_chunk_length(chunk, *num_bytes);
+	btrfs_set_stack_chunk_length(chunk, chunk_size);
 	btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
-	btrfs_set_stack_chunk_stripe_len(chunk, stripe_len);
-	btrfs_set_stack_chunk_type(chunk, type);
-	btrfs_set_stack_chunk_num_stripes(chunk, num_stripes);
-	btrfs_set_stack_chunk_io_align(chunk, stripe_len);
-	btrfs_set_stack_chunk_io_width(chunk, stripe_len);
+	btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
+	btrfs_set_stack_chunk_type(chunk, map->type);
+	btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
+	btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
+	btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
 	btrfs_set_stack_chunk_sector_size(chunk, extent_root->sectorsize);
-	btrfs_set_stack_chunk_sub_stripes(chunk, sub_stripes);
-	map->sector_size = extent_root->sectorsize;
-	map->stripe_len = stripe_len;
-	map->io_align = stripe_len;
-	map->io_width = stripe_len;
-	map->type = type;
-	map->num_stripes = num_stripes;
-	map->sub_stripes = sub_stripes;
+	btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);
 
-	ret = btrfs_insert_item(trans, chunk_root, &key, chunk,
-				btrfs_chunk_item_size(num_stripes));
-	BUG_ON(ret);
-	*start = key.offset;;
+	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
+	key.type = BTRFS_CHUNK_ITEM_KEY;
+	key.offset = chunk_offset;
 
-	em = alloc_extent_map(GFP_NOFS);
-	if (!em)
-		return -ENOMEM;
-	em->bdev = (struct block_device *)map;
-	em->start = key.offset;
-	em->len = *num_bytes;
-	em->block_start = 0;
-	em->block_len = em->len;
+	ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
+	BUG_ON(ret);
 
-	if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
-		ret = btrfs_add_system_chunk(trans, chunk_root, &key,
-				    chunk, btrfs_chunk_item_size(num_stripes));
+	if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
+		ret = btrfs_add_system_chunk(trans, chunk_root, &key, chunk,
+					     item_size);
 		BUG_ON(ret);
 	}
 	kfree(chunk);
+	return 0;
+}
 
-	em_tree = &extent_root->fs_info->mapping_tree.map_tree;
-	spin_lock(&em_tree->lock);
-	ret = add_extent_mapping(em_tree, em);
-	spin_unlock(&em_tree->lock);
+/*
+ * Chunk allocation falls into two parts. The first part does the work
+ * that makes the newly allocated chunk usable, but does not perform any
+ * operation that modifies the chunk tree. The second part does the work
+ * that requires modifying the chunk tree. This division is important for
+ * the bootstrap process of adding storage to a seed btrfs.
+ */
+int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
+		      struct btrfs_root *extent_root, u64 type)
+{
+	u64 chunk_offset;
+	u64 chunk_size;
+	u64 stripe_size;
+	struct map_lookup *map;
+	struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
+	int ret;
+
+	ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID,
+			      &chunk_offset);
+	if (ret)
+		return ret;
+
+	ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
+				  &stripe_size, chunk_offset, type);
+	if (ret)
+		return ret;
+
+	ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
+				   chunk_size, stripe_size);
+	BUG_ON(ret);
+	return 0;
+}
+
+static int noinline init_first_rw_device(struct btrfs_trans_handle *trans,
+					 struct btrfs_root *root,
+					 struct btrfs_device *device)
+{
+	u64 chunk_offset;
+	u64 sys_chunk_offset;
+	u64 chunk_size;
+	u64 sys_chunk_size;
+	u64 stripe_size;
+	u64 sys_stripe_size;
+	u64 alloc_profile;
+	struct map_lookup *map;
+	struct map_lookup *sys_map;
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct btrfs_root *extent_root = fs_info->extent_root;
+	int ret;
+
+	ret = find_next_chunk(fs_info->chunk_root,
+			      BTRFS_FIRST_CHUNK_TREE_OBJECTID, &chunk_offset);
+	BUG_ON(ret);
+
+	alloc_profile = BTRFS_BLOCK_GROUP_METADATA |
+			(fs_info->metadata_alloc_profile &
+			 fs_info->avail_metadata_alloc_bits);
+	alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
+
+	ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
+				  &stripe_size, chunk_offset, alloc_profile);
+	BUG_ON(ret);
+
+	sys_chunk_offset = chunk_offset + chunk_size;
+
+	alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM |
+			(fs_info->system_alloc_profile &
+			 fs_info->avail_system_alloc_bits);
+	alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
+
+	ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map,
+				  &sys_chunk_size, &sys_stripe_size,
+				  sys_chunk_offset, alloc_profile);
+	BUG_ON(ret);
+
+	ret = btrfs_add_device(trans, fs_info->chunk_root, device);
+	BUG_ON(ret);
+
+	/*
+	 * Modifying the chunk tree requires allocating new blocks from both
+	 * the system block group and the metadata block group. So we can only
+	 * do operations that require modifying the chunk tree after both
+	 * block groups have been created.
+	 */
+	ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
+				   chunk_size, stripe_size);
+	BUG_ON(ret);
+
+	ret = __finish_chunk_alloc(trans, extent_root, sys_map,
+				   sys_chunk_offset, sys_chunk_size,
+				   sys_stripe_size);
 	BUG_ON(ret);
+	return 0;
+}
+
+int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
+{
+	struct extent_map *em;
+	struct map_lookup *map;
+	struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
+	int readonly = 0;
+	int i;
+
+	spin_lock(&map_tree->map_tree.lock);
+	em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
+	spin_unlock(&map_tree->map_tree.lock);
+	if (!em)
+		return 1;
+
+	map = (struct map_lookup *)em->bdev;
+	for (i = 0; i < map->num_stripes; i++) {
+		if (!map->stripes[i].dev->writeable) {
+			readonly = 1;
+			break;
+		}
+	}
 	free_extent_map(em);
-	return ret;
+	return readonly;
 }
 
 void btrfs_mapping_init(struct btrfs_mapping_tree *tree)
@@ -2227,6 +2709,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 		}
 		bio->bi_sector = multi->stripes[dev_nr].physical >> 9;
 		dev = multi->stripes[dev_nr].dev;
+		BUG_ON(rw == WRITE && !dev->writeable);
 		if (dev && dev->bdev) {
 			bio->bi_bdev = dev->bdev;
 			if (async_submit)
@@ -2246,11 +2729,23 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 }
 
 struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
-				       u8 *uuid)
+				       u8 *uuid, u8 *fsid)
 {
-	struct list_head *head = &root->fs_info->fs_devices->devices;
-
-	return __find_device(head, devid, uuid);
+	struct btrfs_device *device;
+	struct btrfs_fs_devices *cur_devices;
+
+	cur_devices = root->fs_info->fs_devices;
+	while (cur_devices) {
+		if (!fsid ||
+		    !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
+			device = __find_device(&cur_devices->devices,
+					       devid, uuid);
+			if (device)
+				return device;
+		}
+		cur_devices = cur_devices->seed;
+	}
+	return NULL;
 }
 
 static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
@@ -2262,8 +2757,6 @@ static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
 	device = kzalloc(sizeof(*device), GFP_NOFS);
 	list_add(&device->dev_list,
 		 &fs_devices->devices);
-	list_add(&device->dev_alloc_list,
-		 &fs_devices->alloc_list);
 	device->barriers = 1;
 	device->dev_root = root->fs_info->dev_root;
 	device->devid = devid;
@@ -2274,7 +2767,6 @@ static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
 	return device;
 }
 
-
 static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
 			  struct extent_buffer *leaf,
 			  struct btrfs_chunk *chunk)
@@ -2339,8 +2831,8 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
 		read_extent_buffer(leaf, uuid, (unsigned long)
 				   btrfs_stripe_dev_uuid_nr(chunk, i),
 				   BTRFS_UUID_SIZE);
-		map->stripes[i].dev = btrfs_find_device(root, devid, uuid);
-
+		map->stripes[i].dev = btrfs_find_device(root, devid, uuid,
+							NULL);
 		if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) {
 			kfree(map);
 			free_extent_map(em);
@@ -2387,6 +2879,50 @@ static int fill_device_from_item(struct extent_buffer *leaf,
 	return 0;
 }
 
+static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
+{
+	struct btrfs_fs_devices *fs_devices;
+	int ret;
+
+	mutex_lock(&uuid_mutex);
+
+	fs_devices = root->fs_info->fs_devices->seed;
+	while (fs_devices) {
+		if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
+			ret = 0;
+			goto out;
+		}
+		fs_devices = fs_devices->seed;
+	}
+
+	fs_devices = find_fsid(fsid);
+	if (!fs_devices) {
+		ret = -ENOENT;
+		goto out;
+	}
+	if (fs_devices->opened) {
+		ret = -EBUSY;
+		goto out;
+	}
+
+	ret = __btrfs_open_devices(fs_devices, root->fs_info->bdev_holder);
+	if (ret)
+		goto out;
+
+	if (!fs_devices->seeding) {
+		__btrfs_close_devices(fs_devices);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	fs_devices->seed = root->fs_info->fs_devices->seed;
+	root->fs_info->fs_devices->seed = fs_devices;
+	fs_devices->sprouted = 1;
+out:
+	mutex_unlock(&uuid_mutex);
+	return ret;
+}
+
 static int read_one_dev(struct btrfs_root *root,
 			struct extent_buffer *leaf,
 			struct btrfs_dev_item *dev_item)
@@ -2394,23 +2930,50 @@ static int read_one_dev(struct btrfs_root *root,
 	struct btrfs_device *device;
 	u64 devid;
 	int ret;
+	int seed_devices = 0;
+	u8 fs_uuid[BTRFS_UUID_SIZE];
 	u8 dev_uuid[BTRFS_UUID_SIZE];
 
 	devid = btrfs_device_id(leaf, dev_item);
 	read_extent_buffer(leaf, dev_uuid,
 			   (unsigned long)btrfs_device_uuid(dev_item),
 			   BTRFS_UUID_SIZE);
-	device = btrfs_find_device(root, devid, dev_uuid);
-	if (!device) {
-		printk("warning devid %Lu missing\n", devid);
-		device = add_missing_dev(root, devid, dev_uuid);
-		if (!device)
-			return -ENOMEM;
+	read_extent_buffer(leaf, fs_uuid,
+			   (unsigned long)btrfs_device_fsid(dev_item),
+			   BTRFS_UUID_SIZE);
+
+	if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) {
+		ret = open_seed_devices(root, fs_uuid);
+		if (ret)
+			return ret;
+		seed_devices = 1;
+	}
+
+	device = btrfs_find_device(root, devid, dev_uuid, fs_uuid);
+	if (!device || !device->bdev) {
+		if (!btrfs_test_opt(root, DEGRADED) || seed_devices)
+			return -EIO;
+
+		if (!device) {
+			printk("warning devid %Lu missing\n", devid);
+			device = add_missing_dev(root, devid, dev_uuid);
+			if (!device)
+				return -ENOMEM;
+		}
+	}
+
+	if (device->fs_devices != root->fs_info->fs_devices) {
+		BUG_ON(device->writeable);
+		if (device->generation !=
+		    btrfs_device_generation(leaf, dev_item))
+			return -EINVAL;
 	}
 
 	fill_device_from_item(leaf, dev_item, device);
 	device->dev_root = root->fs_info->dev_root;
 	device->in_fs_metadata = 1;
+	if (device->writeable)
+		device->fs_devices->total_rw_bytes += device->total_bytes;
 	ret = 0;
 #if 0
 	ret = btrfs_open_device(device);
@@ -2528,12 +3091,15 @@ again:
 				dev_item = btrfs_item_ptr(leaf, slot,
 						  struct btrfs_dev_item);
 				ret = read_one_dev(root, leaf, dev_item);
-				BUG_ON(ret);
+				if (ret)
+					goto error;
 			}
 		} else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
 			struct btrfs_chunk *chunk;
 			chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
 			ret = read_one_chunk(root, &found_key, leaf, chunk);
+			if (ret)
+				goto error;
 		}
 		path->slots[0]++;
 	}
@@ -2542,9 +3108,8 @@ again:
 		btrfs_release_path(root, path);
 		goto again;
 	}
-
-	btrfs_free_path(path);
 	ret = 0;
 error:
+	btrfs_free_path(path);
 	return ret;
 }
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index c50e50580b51..1f6f25a5787f 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -26,6 +26,7 @@ struct buffer_head;
 struct btrfs_device {
 	struct list_head dev_list;
 	struct list_head dev_alloc_list;
+	struct btrfs_fs_devices *fs_devices;
 	struct btrfs_root *dev_root;
 	struct buffer_head *pending_io;
 	struct bio *pending_bios;
@@ -34,6 +35,7 @@ struct btrfs_device {
 	u64 generation;
 
 	int barriers;
+	int writeable;
 	int in_fs_metadata;
 
 	spinlock_t io_lock;
@@ -77,6 +79,8 @@ struct btrfs_fs_devices {
 	u64 latest_trans;
 	u64 num_devices;
 	u64 open_devices;
+	u64 rw_devices;
+	u64 total_rw_bytes;
 	struct block_device *latest_bdev;
 	/* all of the devices in the FS */
 	struct list_head devices;
@@ -84,7 +88,12 @@ struct btrfs_fs_devices {
 	/* devices not currently being allocated */
 	struct list_head alloc_list;
 	struct list_head list;
-	int mounted;
+
+	struct btrfs_fs_devices *seed;
+	int seeding;
+	int sprouted;
+
+	int opened;
 };
 
 struct btrfs_bio_stripe {
@@ -109,16 +118,14 @@ struct btrfs_multi_bio {
 int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
 			   struct btrfs_device *device,
 			   u64 chunk_tree, u64 chunk_objectid,
-			   u64 chunk_offset,
-			   u64 num_bytes, u64 *start);
+			   u64 chunk_offset, u64 start, u64 num_bytes);
 int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 		    u64 logical, u64 *length,
 		    struct btrfs_multi_bio **multi_ret, int mirror_num);
 int btrfs_read_sys_array(struct btrfs_root *root);
 int btrfs_read_chunk_tree(struct btrfs_root *root);
 int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
-		      struct btrfs_root *extent_root, u64 *start,
-		      u64 *num_bytes, u64 type);
+		      struct btrfs_root *extent_root, u64 type);
 void btrfs_mapping_init(struct btrfs_mapping_tree *tree);
 void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree);
 int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
@@ -141,10 +148,11 @@ int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree,
 int btrfs_grow_device(struct btrfs_trans_handle *trans,
 		      struct btrfs_device *device, u64 new_size);
 struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
-				       u8 *uuid);
+				       u8 *uuid, u8 *fsid);
 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
 int btrfs_init_new_device(struct btrfs_root *root, char *path);
 int btrfs_balance(struct btrfs_root *dev_root);
 void btrfs_unlock_volumes(void);
 void btrfs_lock_volumes(void);
+int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset);
 #endif
-- 
cgit v1.2.3


From 7cbd8a839118eede2393f0926b8e15027162bcd6 Mon Sep 17 00:00:00 2001
From: yanhai zhu <zhu.yanhai@gmail.com>
Date: Wed, 12 Nov 2008 14:38:54 -0500
Subject: Btrfs: Add a missing return pointer check

Add a missing kzalloc() return pointer check in add_missing_dev().

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index d6f1996de629..ecf0633ab8cc 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2755,6 +2755,8 @@ static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
 	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
 
 	device = kzalloc(sizeof(*device), GFP_NOFS);
+	if (!device)
+		return NULL;
 	list_add(&device->dev_list,
 		 &fs_devices->devices);
 	device->barriers = 1;
-- 
cgit v1.2.3


From 15916de835a683bd8133a0d1ac0c982b795ab4ff Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 19 Nov 2008 21:17:22 -0500
Subject: Btrfs: Fixes for 2.6.28-rc API changes

* open/close_bdev_excl -> open/close_bdev_exclusive
* blkdev_issue_discard takes a GFP mask now
* Fix blkdev_issue_discard usage now that it is enabled

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/compat.h      |  7 +++++++
 fs/btrfs/compression.c |  4 ++--
 fs/btrfs/extent-tree.c | 27 ++++++++++++++++++++-------
 fs/btrfs/extent_io.c   |  4 ++--
 fs/btrfs/version.sh    | 34 +++++++++++++++++-----------------
 fs/btrfs/volumes.c     | 41 +++++++++++++++++++++++------------------
 fs/btrfs/volumes.h     |  3 +++
 7 files changed, 74 insertions(+), 46 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/compat.h b/fs/btrfs/compat.h
index cd6598b169df..dd1defdbfa36 100644
--- a/fs/btrfs/compat.h
+++ b/fs/btrfs/compat.h
@@ -21,4 +21,11 @@ static inline struct dentry *d_obtain_alias(struct inode *inode)
 }
 #endif
 
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28)
+# define  __pagevec_lru_add_file __pagevec_lru_add
+# define open_bdev_exclusive open_bdev_excl
+# define close_bdev_exclusive(bdev, mode) close_bdev_excl(bdev)
+#endif
+
+
 #endif /* _COMPAT_H_ */
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index bfd1512cce0a..df05f513e1eb 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -419,7 +419,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
 		/* open coding of lru_cache_add, also not exported */
 		page_cache_get(page);
 		if (!pagevec_add(&pvec, page))
-			__pagevec_lru_add(&pvec);
+			__pagevec_lru_add_file(&pvec);
 
 		end = last_offset + PAGE_CACHE_SIZE - 1;
 		/*
@@ -475,7 +475,7 @@ next:
 		last_offset += PAGE_CACHE_SIZE;
 	}
 	if (pagevec_count(&pvec))
-		__pagevec_lru_add(&pvec);
+		__pagevec_lru_add_file(&pvec);
 	return 0;
 }
 
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index ee73efe75423..62d49705d140 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -28,6 +28,7 @@
 #include "volumes.h"
 #include "locking.h"
 #include "ref-cache.h"
+#include "compat.h"
 
 #define PENDING_EXTENT_INSERT 0
 #define PENDING_EXTENT_DELETE 1
@@ -899,6 +900,17 @@ static int noinline remove_extent_backref(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
+static void btrfs_issue_discard(struct block_device *bdev,
+				u64 start, u64 len)
+{
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,28)
+	blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL);
+#else
+	blkdev_issue_discard(bdev, start >> 9, len >> 9);
+#endif
+}
+
+
 static int noinline free_extents(struct btrfs_trans_handle *trans,
 				 struct btrfs_root *extent_root,
 				 struct list_head *del_list)
@@ -1108,6 +1120,7 @@ search:
 			BUG_ON(ret);
 
 #ifdef BIO_RW_DISCARD
+			map_length = tmp->num_bytes;
 			ret = btrfs_map_block(&info->mapping_tree, READ,
 					      tmp->bytenr, &map_length, &multi,
 					      0);
@@ -1115,16 +1128,16 @@ search:
 				struct btrfs_bio_stripe *stripe;
 				int i;
 
-				stripe = multi->stripe;
+				stripe = multi->stripes;
 
 				if (map_length > tmp->num_bytes)
 					map_length = tmp->num_bytes;
 
 				for (i = 0; i < multi->num_stripes;
 				     i++, stripe++)
-					blkdev_issue_discard(stripe->dev->bdev,
-							stripe->physical >> 9,
-							map_length >> 9);
+					btrfs_issue_discard(stripe->dev->bdev,
+							    stripe->physical,
+							    map_length);
 				kfree(multi);
 			}
 #endif
@@ -2498,9 +2511,9 @@ static int __free_extent(struct btrfs_trans_handle *trans,
 				map_length = num_bytes;
 
 			for (i = 0; i < multi->num_stripes; i++, stripe++) {
-				blkdev_issue_discard(stripe->dev->bdev,
-						     stripe->physical >> 9,
-						     map_length >> 9);
+				btrfs_issue_discard(stripe->dev->bdev,
+						    stripe->physical,
+						     map_length);
 			}
 			kfree(multi);
 		}
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index a0f3804efe4f..3a65c10dce33 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2639,14 +2639,14 @@ int extent_readpages(struct extent_io_tree *tree,
 			/* open coding of lru_cache_add, also not exported */
 			page_cache_get(page);
 			if (!pagevec_add(&pvec, page))
-				__pagevec_lru_add(&pvec);
+				__pagevec_lru_add_file(&pvec);
 			__extent_read_full_page(tree, page, get_extent,
 						&bio, 0, &bio_flags);
 		}
 		page_cache_release(page);
 	}
 	if (pagevec_count(&pvec))
-		__pagevec_lru_add(&pvec);
+		__pagevec_lru_add_file(&pvec);
 	BUG_ON(!list_empty(pages));
 	if (bio)
 		submit_one_bio(READ, bio, 0, bio_flags);
diff --git a/fs/btrfs/version.sh b/fs/btrfs/version.sh
index 0f57f24404d9..1ca1952fd917 100644
--- a/fs/btrfs/version.sh
+++ b/fs/btrfs/version.sh
@@ -8,24 +8,24 @@
  
 v="v0.16"
 
-which hg > /dev/null
-if [ -d .hg ] && [ $? == 0 ]; then
-	last=$(hg tags | grep -m1 -o '^v[0-9.]\+')
-	 
-	# now check if the repo has commits since then...
-	if [[ $(hg id -t) == $last || \
-	    $(hg di -r "$last:." | awk '/^diff/{print $NF}' | sort -u) == .hgtags ]]
-	then
-	    # check if it's dirty
-	    if [[ $(hg id | cut -d' ' -f1) == *+ ]]; then
-		v=$last+
-	    else
-		v=$last
+which git &> /dev/null
+if [ $? == 0 ]; then
+    git branch >& /dev/null
+    if [ $? == 0 ]; then
+	    if head=`git rev-parse --verify HEAD 2>/dev/null`; then
+		if tag=`git describe --tags 2>/dev/null`; then
+		    v="$tag"
+		fi
+
+		# Are there uncommitted changes?
+		git update-index --refresh --unmerged > /dev/null
+		if git diff-index --name-only HEAD | \
+		    grep -v "^scripts/package" \
+		    | read dummy; then
+		    v="$v"-dirty
+		fi
 	    fi
-	else
-	    # includes dirty flag
-	    v=$last+$(hg id -i)
-	fi
+    fi
 fi
  
 echo "#ifndef __BUILD_VERSION" > .build-version.h
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index ecf0633ab8cc..c3ee63f92a5f 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -85,7 +85,7 @@ int btrfs_cleanup_fs_uuids(void)
 			dev = list_entry(fs_devices->devices.next,
 					 struct btrfs_device, dev_list);
 			if (dev->bdev) {
-				close_bdev_excl(dev->bdev);
+				close_bdev_exclusive(dev->bdev, dev->mode);
 				fs_devices->open_devices--;
 			}
 			fs_devices->num_devices--;
@@ -317,7 +317,7 @@ again:
 			continue;
 
 		if (device->bdev) {
-			close_bdev_excl(device->bdev);
+			close_bdev_exclusive(device->bdev, device->mode);
 			device->bdev = NULL;
 			fs_devices->open_devices--;
 		}
@@ -356,7 +356,7 @@ again:
 	list_for_each(cur, &fs_devices->devices) {
 		device = list_entry(cur, struct btrfs_device, dev_list);
 		if (device->bdev) {
-			close_bdev_excl(device->bdev);
+			close_bdev_exclusive(device->bdev, device->mode);
 			fs_devices->open_devices--;
 		}
 		if (device->writeable) {
@@ -391,7 +391,8 @@ int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
 	return ret;
 }
 
-int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, void *holder)
+int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
+			 int flags, void *holder)
 {
 	struct block_device *bdev;
 	struct list_head *head = &fs_devices->devices;
@@ -413,7 +414,7 @@ int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, void *holder)
 		if (!device->name)
 			continue;
 
-		bdev = open_bdev_excl(device->name, MS_RDONLY, holder);
+		bdev = open_bdev_exclusive(device->name, flags, holder);
 		if (IS_ERR(bdev)) {
 			printk("open %s failed\n", device->name);
 			goto error;
@@ -453,6 +454,8 @@ int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, void *holder)
 
 		device->bdev = bdev;
 		device->in_fs_metadata = 0;
+		device->mode = flags;
+
 		fs_devices->open_devices++;
 		if (device->writeable) {
 			fs_devices->rw_devices++;
@@ -464,7 +467,7 @@ int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, void *holder)
 error_brelse:
 		brelse(bh);
 error_close:
-		close_bdev_excl(bdev);
+		close_bdev_exclusive(bdev, MS_RDONLY);
 error:
 		continue;
 	}
@@ -496,7 +499,7 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 			ret = 0;
 		}
 	} else {
-		ret = __btrfs_open_devices(fs_devices, holder);
+		ret = __btrfs_open_devices(fs_devices, flags, holder);
 	}
 	mutex_unlock(&uuid_mutex);
 	return ret;
@@ -514,7 +517,7 @@ int btrfs_scan_one_device(const char *path, int flags, void *holder,
 
 	mutex_lock(&uuid_mutex);
 
-	bdev = open_bdev_excl(path, flags, holder);
+	bdev = open_bdev_exclusive(path, flags, holder);
 
 	if (IS_ERR(bdev)) {
 		ret = PTR_ERR(bdev);
@@ -551,7 +554,7 @@ int btrfs_scan_one_device(const char *path, int flags, void *holder,
 error_brelse:
 	brelse(bh);
 error_close:
-	close_bdev_excl(bdev);
+	close_bdev_exclusive(bdev, flags);
 error:
 	mutex_unlock(&uuid_mutex);
 	return ret;
@@ -1003,7 +1006,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 			goto out;
 		}
 	} else {
-		bdev = open_bdev_excl(device_path, MS_RDONLY,
+		bdev = open_bdev_exclusive(device_path, MS_RDONLY,
 				      root->fs_info->bdev_holder);
 		if (IS_ERR(bdev)) {
 			ret = PTR_ERR(bdev);
@@ -1073,10 +1076,10 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 		BUG_ON(device->writeable);
 		brelse(bh);
 		if (bdev)
-			close_bdev_excl(bdev);
+			close_bdev_exclusive(bdev, MS_RDONLY);
 
 		if (device->bdev) {
-			close_bdev_excl(device->bdev);
+			close_bdev_exclusive(device->bdev, device->mode);
 			device->bdev = NULL;
 			device->fs_devices->open_devices--;
 		}
@@ -1112,11 +1115,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 
 	if (device->bdev) {
 		/* one close for the device struct or super_block */
-		close_bdev_excl(device->bdev);
+		close_bdev_exclusive(device->bdev, device->mode);
 	}
 	if (bdev) {
 		/* one close for us */
-		close_bdev_excl(bdev);
+		close_bdev_exclusive(bdev, MS_RDONLY);
 	}
 	kfree(device->name);
 	kfree(device);
@@ -1127,7 +1130,7 @@ error_brelse:
 	brelse(bh);
 error_close:
 	if (bdev)
-		close_bdev_excl(bdev);
+		close_bdev_exclusive(bdev, MS_RDONLY);
 out:
 	mutex_unlock(&root->fs_info->volume_mutex);
 	mutex_unlock(&uuid_mutex);
@@ -1272,7 +1275,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 	if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
 		return -EINVAL;
 
-	bdev = open_bdev_excl(device_path, 0, root->fs_info->bdev_holder);
+	bdev = open_bdev_exclusive(device_path, 0, root->fs_info->bdev_holder);
 	if (!bdev) {
 		return -EIO;
 	}
@@ -1331,6 +1334,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 	device->dev_root = root->fs_info->dev_root;
 	device->bdev = bdev;
 	device->in_fs_metadata = 1;
+	device->mode = 0;
 	set_blocksize(device->bdev, 4096);
 
 	if (seeding_dev) {
@@ -1379,7 +1383,7 @@ out:
 	mutex_unlock(&root->fs_info->volume_mutex);
 	return ret;
 error:
-	close_bdev_excl(bdev);
+	close_bdev_exclusive(bdev, 0);
 	if (seeding_dev) {
 		mutex_unlock(&uuid_mutex);
 		up_write(&sb->s_umount);
@@ -2907,7 +2911,8 @@ static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
 		goto out;
 	}
 
-	ret = __btrfs_open_devices(fs_devices, root->fs_info->bdev_holder);
+	ret = __btrfs_open_devices(fs_devices, MS_RDONLY,
+				   root->fs_info->bdev_holder);
 	if (ret)
 		goto out;
 
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 1f6f25a5787f..9b41e4d3984d 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -42,6 +42,9 @@ struct btrfs_device {
 
 	struct block_device *bdev;
 
+	/* the mode sent to open_bdev_exclusive */
+	fmode_t mode;
+
 	char *name;
 
 	/* the internal btrfs device id */
-- 
cgit v1.2.3


From 4b4e25f2a6ddb070bab7f7dd2bd2926fb8db9e04 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 20 Nov 2008 10:22:27 -0500
Subject: Btrfs: compat code fixes

The btrfs git kernel tree is used to build a standalone tree for
compiling against older kernels.  This commit makes the standalone tree
work with 2.6.27.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/compat.h      | 1 +
 fs/btrfs/compression.c | 2 +-
 fs/btrfs/disk-io.c     | 3 ++-
 fs/btrfs/extent-tree.c | 5 ++++-
 fs/btrfs/inode.c       | 2 +-
 fs/btrfs/ioctl.c       | 1 +
 fs/btrfs/super.c       | 2 ++
 fs/btrfs/volumes.c     | 2 ++
 8 files changed, 14 insertions(+), 4 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/compat.h b/fs/btrfs/compat.h
index dd1defdbfa36..75e4426d6fbb 100644
--- a/fs/btrfs/compat.h
+++ b/fs/btrfs/compat.h
@@ -25,6 +25,7 @@ static inline struct dentry *d_obtain_alias(struct inode *inode)
 # define  __pagevec_lru_add_file __pagevec_lru_add
 # define open_bdev_exclusive open_bdev_excl
 # define close_bdev_exclusive(bdev, mode) close_bdev_excl(bdev)
+typedef unsigned __bitwise__ fmode_t;
 #endif
 
 
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index df05f513e1eb..4febe2eb0b83 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -34,13 +34,13 @@
 #include <linux/bit_spinlock.h>
 #include <linux/version.h>
 #include <linux/pagevec.h>
+#include "compat.h"
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
 #include "btrfs_inode.h"
 #include "volumes.h"
 #include "ordered-data.h"
-#include "compat.h"
 #include "compression.h"
 #include "extent_io.h"
 #include "extent_map.h"
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index c8dcb47b6d7d..981652233f70 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -26,7 +26,8 @@
 #include <linux/buffer_head.h> // for block_sync_page
 #include <linux/workqueue.h>
 #include <linux/kthread.h>
-# include <linux/freezer.h>
+#include <linux/freezer.h>
+#include "compat.h"
 #include "crc32c.h"
 #include "ctree.h"
 #include "disk-io.h"
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 62d49705d140..b33e0bfb99e1 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -19,6 +19,8 @@
 #include <linux/pagemap.h>
 #include <linux/writeback.h>
 #include <linux/blkdev.h>
+#include <linux/version.h>
+#include "compat.h"
 #include "hash.h"
 #include "crc32c.h"
 #include "ctree.h"
@@ -900,6 +902,7 @@ static int noinline remove_extent_backref(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
+#ifdef BIO_RW_DISCARD
 static void btrfs_issue_discard(struct block_device *bdev,
 				u64 start, u64 len)
 {
@@ -909,7 +912,7 @@ static void btrfs_issue_discard(struct block_device *bdev,
 	blkdev_issue_discard(bdev, start >> 9, len >> 9);
 #endif
 }
-
+#endif
 
 static int noinline free_extents(struct btrfs_trans_handle *trans,
 				 struct btrfs_root *extent_root,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 2c77e0957f74..7a8ad4292f75 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -38,6 +38,7 @@
 #include <linux/xattr.h>
 #include <linux/posix_acl.h>
 #include <linux/falloc.h>
+#include "compat.h"
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
@@ -47,7 +48,6 @@
 #include "volumes.h"
 #include "ordered-data.h"
 #include "xattr.h"
-#include "compat.h"
 #include "tree-log.h"
 #include "ref-cache.h"
 #include "compression.h"
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 8828109fa58e..f3d68457e66a 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -41,6 +41,7 @@
 #include <linux/version.h>
 #include <linux/xattr.h>
 #include <linux/vmalloc.h>
+#include "compat.h"
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 77c5eff3e209..1975ea273dc3 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -37,6 +37,8 @@
 #include <linux/ctype.h>
 #include <linux/namei.h>
 #include <linux/miscdevice.h>
+#include <linux/version.h>
+#include "compat.h"
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index c3ee63f92a5f..724ead54529b 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -20,7 +20,9 @@
 #include <linux/buffer_head.h>
 #include <linux/blkdev.h>
 #include <linux/random.h>
+#include <linux/version.h>
 #include <asm/div64.h>
+#include "compat.h"
 #include "ctree.h"
 #include "extent_map.h"
 #include "disk-io.h"
-- 
cgit v1.2.3


From b2950863c61bc24cf0f63bc05947d9d50663c4c0 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 2 Dec 2008 09:54:17 -0500
Subject: Btrfs: make things static and include the right headers

Shut up various sparse warnings about symbols that should be either
static or have their declarations in scope.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/btrfs/ctree.c            |  2 +-
 fs/btrfs/disk-io.c          | 16 ++++++++--------
 fs/btrfs/extent-tree.c      | 12 ++++++------
 fs/btrfs/extent_io.c        | 35 +++++++++++++++--------------------
 fs/btrfs/free-space-cache.c |  6 ++++--
 fs/btrfs/inode-item.c       |  2 +-
 fs/btrfs/inode.c            | 26 +++++++++++++-------------
 fs/btrfs/ioctl.c            | 14 +++++++-------
 fs/btrfs/root-tree.c        |  2 ++
 fs/btrfs/super.c            |  2 +-
 fs/btrfs/tree-log.c         |  5 +++--
 fs/btrfs/volumes.c          | 12 ++++++------
 fs/btrfs/zlib.c             |  1 +
 13 files changed, 68 insertions(+), 67 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 71ef0a2e2daa..a83cbdf1d8c4 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -217,7 +217,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
  * this uses that block instead of allocating a new one.  btrfs_alloc_reserved_extent
  * is used to finish the allocation.
  */
-int noinline __btrfs_cow_block(struct btrfs_trans_handle *trans,
+static int noinline __btrfs_cow_block(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root,
 			     struct extent_buffer *buf,
 			     struct extent_buffer *parent, int parent_slot,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index fb04665e5005..8a2bcc7024fe 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -93,9 +93,9 @@ struct async_submit_bio {
  * extents on the btree inode are pretty simple, there's one extent
  * that covers the entire device
  */
-struct extent_map *btree_get_extent(struct inode *inode, struct page *page,
-				    size_t page_offset, u64 start, u64 len,
-				    int create)
+static struct extent_map *btree_get_extent(struct inode *inode,
+		struct page *page, size_t page_offset, u64 start, u64 len,
+		int create)
 {
 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
 	struct extent_map *em;
@@ -295,7 +295,7 @@ printk("read extent buffer pages failed with ret %d mirror no %d\n", ret, mirror
  * checksum a dirty tree block before IO.  This has extra checks to make
  * sure we only fill in the checksum field in the first page of a multi-page block
  */
-int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
+static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
 {
 	struct extent_io_tree *tree;
 	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
@@ -365,7 +365,7 @@ static int check_tree_block_fsid(struct btrfs_root *root,
 	return ret;
 }
 
-int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
+static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 			       struct extent_state *state)
 {
 	struct extent_io_tree *tree;
@@ -660,7 +660,7 @@ static int btree_writepages(struct address_space *mapping,
 	return extent_writepages(tree, mapping, btree_get_extent, wbc);
 }
 
-int btree_readpage(struct file *file, struct page *page)
+static int btree_readpage(struct file *file, struct page *page)
 {
 	struct extent_io_tree *tree;
 	tree = &BTRFS_I(page->mapping->host)->io_tree;
@@ -1200,7 +1200,7 @@ static void __unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
 	}
 }
 
-void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
+static void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
 {
 	struct inode *inode;
 	struct extent_map_tree *em_tree;
@@ -1842,7 +1842,7 @@ static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
 	put_bh(bh);
 }
 
-int write_all_supers(struct btrfs_root *root)
+static int write_all_supers(struct btrfs_root *root)
 {
 	struct list_head *cur;
 	struct list_head *head = &root->fs_info->fs_devices->devices;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index a970472eab17..d15638529389 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -74,7 +74,7 @@ static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
  * this adds the block group to the fs_info rb tree for the block group
  * cache
  */
-int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
+static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
 				struct btrfs_block_group_cache *block_group)
 {
 	struct rb_node **p;
@@ -289,7 +289,7 @@ err:
 /*
  * return the block group that starts at or after bytenr
  */
-struct btrfs_block_group_cache *btrfs_lookup_first_block_group(struct
+static struct btrfs_block_group_cache *btrfs_lookup_first_block_group(struct
 						       btrfs_fs_info *info,
 							 u64 bytenr)
 {
@@ -3445,7 +3445,7 @@ static int noinline cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start, u64 len,
+static int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start, u64 len,
 			      u32 *refs)
 {
 	int ret;
@@ -5434,7 +5434,7 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
 	return flags;
 }
 
-int __alloc_chunk_for_shrink(struct btrfs_root *root,
+static int __alloc_chunk_for_shrink(struct btrfs_root *root,
 		     struct btrfs_block_group_cache *shrink_block_group,
 		     int force)
 {
@@ -5703,8 +5703,8 @@ out:
 	return ret;
 }
 
-int find_first_block_group(struct btrfs_root *root, struct btrfs_path *path,
-			   struct btrfs_key *key)
+static int find_first_block_group(struct btrfs_root *root,
+		struct btrfs_path *path, struct btrfs_key *key)
 {
 	int ret = 0;
 	struct btrfs_key found_key;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index d79ccdbfdd95..c3dfe2a0ec85 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -112,7 +112,7 @@ void extent_io_tree_init(struct extent_io_tree *tree,
 }
 EXPORT_SYMBOL(extent_io_tree_init);
 
-struct extent_state *alloc_extent_state(gfp_t mask)
+static struct extent_state *alloc_extent_state(gfp_t mask)
 {
 	struct extent_state *state;
 #ifdef LEAK_DEBUG
@@ -136,7 +136,7 @@ struct extent_state *alloc_extent_state(gfp_t mask)
 }
 EXPORT_SYMBOL(alloc_extent_state);
 
-void free_extent_state(struct extent_state *state)
+static void free_extent_state(struct extent_state *state)
 {
 	if (!state)
 		return;
@@ -662,7 +662,7 @@ static void set_state_bits(struct extent_io_tree *tree,
  * [start, end] is inclusive
  * This takes the tree lock.
  */
-int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits,
+static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits,
 		   int exclusive, u64 *failed_start, gfp_t mask)
 {
 	struct extent_state *state;
@@ -879,12 +879,11 @@ int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
 }
 EXPORT_SYMBOL(set_extent_new);
 
-int clear_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
+static int clear_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
 		       gfp_t mask)
 {
 	return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0, mask);
 }
-EXPORT_SYMBOL(clear_extent_new);
 
 int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
 			gfp_t mask)
@@ -894,27 +893,24 @@ int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
 }
 EXPORT_SYMBOL(set_extent_uptodate);
 
-int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
+static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
 			  gfp_t mask)
 {
 	return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, mask);
 }
-EXPORT_SYMBOL(clear_extent_uptodate);
 
-int set_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end,
+static int set_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end,
 			 gfp_t mask)
 {
 	return set_extent_bit(tree, start, end, EXTENT_WRITEBACK,
 			      0, NULL, mask);
 }
-EXPORT_SYMBOL(set_extent_writeback);
 
-int clear_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end,
+static int clear_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end,
 			   gfp_t mask)
 {
 	return clear_extent_bit(tree, start, end, EXTENT_WRITEBACK, 1, 0, mask);
 }
-EXPORT_SYMBOL(clear_extent_writeback);
 
 int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end)
 {
@@ -994,7 +990,7 @@ EXPORT_SYMBOL(set_range_dirty);
 /*
  * helper function to set both pages and extents in the tree writeback
  */
-int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
+static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
 {
 	unsigned long index = start >> PAGE_CACHE_SHIFT;
 	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
@@ -1010,7 +1006,6 @@ int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
 	set_extent_writeback(tree, start, end, GFP_NOFS);
 	return 0;
 }
-EXPORT_SYMBOL(set_range_writeback);
 
 /*
  * find the first offset in the io tree with 'bits' set. zero is
@@ -1432,11 +1427,13 @@ out:
 	spin_unlock_irq(&tree->lock);
 	return total_bytes;
 }
+
+#if 0
 /*
  * helper function to lock both pages and extents in the tree.
  * pages must be locked first.
  */
-int lock_range(struct extent_io_tree *tree, u64 start, u64 end)
+static int lock_range(struct extent_io_tree *tree, u64 start, u64 end)
 {
 	unsigned long index = start >> PAGE_CACHE_SHIFT;
 	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
@@ -1473,12 +1470,11 @@ failed:
 	}
 	return err;
 }
-EXPORT_SYMBOL(lock_range);
 
 /*
  * helper function to unlock both pages and extents in the tree.
  */
-int unlock_range(struct extent_io_tree *tree, u64 start, u64 end)
+static int unlock_range(struct extent_io_tree *tree, u64 start, u64 end)
 {
 	unsigned long index = start >> PAGE_CACHE_SHIFT;
 	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
@@ -1493,7 +1489,7 @@ int unlock_range(struct extent_io_tree *tree, u64 start, u64 end)
 	unlock_extent(tree, start, end, GFP_NOFS);
 	return 0;
 }
-EXPORT_SYMBOL(unlock_range);
+#endif
 
 /*
  * set the private field for a given byte offset in the tree.  If there isn't
@@ -1956,7 +1952,7 @@ void set_page_extent_mapped(struct page *page)
 }
 EXPORT_SYMBOL(set_page_extent_mapped);
 
-void set_page_extent_head(struct page *page, unsigned long len)
+static void set_page_extent_head(struct page *page, unsigned long len)
 {
 	set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2);
 }
@@ -2397,7 +2393,7 @@ update_nr_written:
  * WB_SYNC_ALL then we were called for data integrity and we must wait for
  * existing IO to complete.
  */
-int extent_write_cache_pages(struct extent_io_tree *tree,
+static int extent_write_cache_pages(struct extent_io_tree *tree,
 			     struct address_space *mapping,
 			     struct writeback_control *wbc,
 			     writepage_t writepage, void *data,
@@ -2502,7 +2498,6 @@ retry:
 	}
 	return ret;
 }
-EXPORT_SYMBOL(extent_write_cache_pages);
 
 static noinline void flush_write_bio(void *data)
 {
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index f4926c0f3c8c..09462adfbe33 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -443,7 +443,8 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
 	mutex_unlock(&block_group->alloc_mutex);
 }
 
-struct btrfs_free_space *btrfs_find_free_space_offset(struct
+#if 0
+static struct btrfs_free_space *btrfs_find_free_space_offset(struct
 						      btrfs_block_group_cache
 						      *block_group, u64 offset,
 						      u64 bytes)
@@ -458,7 +459,7 @@ struct btrfs_free_space *btrfs_find_free_space_offset(struct
 	return ret;
 }
 
-struct btrfs_free_space *btrfs_find_free_space_bytes(struct
+static struct btrfs_free_space *btrfs_find_free_space_bytes(struct
 						     btrfs_block_group_cache
 						     *block_group, u64 offset,
 						     u64 bytes)
@@ -472,6 +473,7 @@ struct btrfs_free_space *btrfs_find_free_space_bytes(struct
 
 	return ret;
 }
+#endif
 
 struct btrfs_free_space *btrfs_find_free_space(struct btrfs_block_group_cache
 					       *block_group, u64 offset,
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index d93451c66ba1..3d46fa1f29a4 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -20,7 +20,7 @@
 #include "disk-io.h"
 #include "transaction.h"
 
-int find_name_in_backref(struct btrfs_path *path, const char * name,
+static int find_name_in_backref(struct btrfs_path *path, const char *name,
 			 int name_len, struct btrfs_inode_ref **ref_ret)
 {
 	struct extent_buffer *leaf;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index b3d4078b69a6..bd58ba655a4d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1130,7 +1130,7 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
  * bytes in this file, and to maintain the list of inodes that
  * have pending delalloc work to be done.
  */
-int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
+static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
 		       unsigned long old, unsigned long bits)
 {
 	unsigned long flags;
@@ -1151,7 +1151,7 @@ int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
 /*
  * extent_io.c clear_bit_hook, see set_bit_hook for why
  */
-int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
+static int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
 			 unsigned long old, unsigned long bits)
 {
 	if ((old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
@@ -1215,7 +1215,7 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
  * At IO completion time the cums attached on the ordered extent record
  * are inserted into the btree
  */
-int __btrfs_submit_bio_start(struct inode *inode, int rw, struct bio *bio,
+static int __btrfs_submit_bio_start(struct inode *inode, int rw, struct bio *bio,
 			  int mirror_num, unsigned long bio_flags)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -1234,7 +1234,7 @@ int __btrfs_submit_bio_start(struct inode *inode, int rw, struct bio *bio,
  * At IO completion time the cums attached on the ordered extent record
  * are inserted into the btree
  */
-int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
+static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
 			  int mirror_num, unsigned long bio_flags)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -1245,7 +1245,7 @@ int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
  * extent_io.c submission hook. This does the right thing for csum calculation on write,
  * or reading the csums from the tree before a read
  */
-int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
+static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 			  int mirror_num, unsigned long bio_flags)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -1313,7 +1313,7 @@ struct btrfs_writepage_fixup {
 	struct btrfs_work work;
 };
 
-void btrfs_writepage_fixup_worker(struct btrfs_work *work)
+static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
 {
 	struct btrfs_writepage_fixup *fixup;
 	struct btrfs_ordered_extent *ordered;
@@ -1372,7 +1372,7 @@ out_page:
  * to fix it up.  The async helper will wait for ordered extents, set
  * the delalloc bit and make it safe to write the page.
  */
-int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
+static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
 {
 	struct inode *inode = page->mapping->host;
 	struct btrfs_writepage_fixup *fixup;
@@ -1526,7 +1526,7 @@ nocow:
 	return 0;
 }
 
-int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
+static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
 				struct extent_state *state, int uptodate)
 {
 	return btrfs_finish_ordered_io(page->mapping->host, start, end);
@@ -1548,7 +1548,7 @@ struct io_failure_record {
 	int last_mirror;
 };
 
-int btrfs_io_failed_hook(struct bio *failed_bio,
+static int btrfs_io_failed_hook(struct bio *failed_bio,
 			 struct page *page, u64 start, u64 end,
 			 struct extent_state *state)
 {
@@ -1642,7 +1642,7 @@ int btrfs_io_failed_hook(struct bio *failed_bio,
  * each time an IO finishes, we do a fast check in the IO failure tree
  * to see if we need to process or clean up an io_failure_record
  */
-int btrfs_clean_io_failures(struct inode *inode, u64 start)
+static int btrfs_clean_io_failures(struct inode *inode, u64 start)
 {
 	u64 private;
 	u64 private_failure;
@@ -1675,7 +1675,7 @@ int btrfs_clean_io_failures(struct inode *inode, u64 start)
  * if there's a match, we allow the bio to finish.  If not, we go through
  * the io_failure_record routines to find good copies
  */
-int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
+static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 			       struct extent_state *state)
 {
 	size_t offset = start - ((u64)page->index << PAGE_CACHE_SHIFT);
@@ -4362,8 +4362,8 @@ out:
  * Invalidate a single dcache entry at the root of the filesystem.
  * Needed after creation of snapshot or subvolume.
  */
-void btrfs_invalidate_dcache_root(struct inode *dir, char *name,
-				  int namelen)
+static void btrfs_invalidate_dcache_root(struct inode *dir,
+		char *name, int namelen)
 {
 	struct dentry *alias, *entry;
 	struct qstr qstr;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 35f650e183e1..cc7c5161e269 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -354,7 +354,7 @@ out_unlock:
 }
 
 
-int btrfs_defrag_file(struct file *file)
+static int btrfs_defrag_file(struct file *file)
 {
 	struct inode *inode = fdentry(file)->d_inode;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -649,7 +649,7 @@ static int btrfs_ioctl_defrag(struct file *file)
 	return 0;
 }
 
-long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
+static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
 {
 	struct btrfs_ioctl_vol_args *vol_args;
 	int ret;
@@ -671,7 +671,7 @@ out:
 	return ret;
 }
 
-long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg)
+static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg)
 {
 	struct btrfs_ioctl_vol_args *vol_args;
 	int ret;
@@ -696,8 +696,8 @@ out:
 	return ret;
 }
 
-long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, u64 off,
-		       u64 olen, u64 destoff)
+static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
+		u64 off, u64 olen, u64 destoff)
 {
 	struct inode *inode = fdentry(file)->d_inode;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -1035,7 +1035,7 @@ out_fput:
 	return ret;
 }
 
-long btrfs_ioctl_clone_range(struct file *file, unsigned long argptr)
+static long btrfs_ioctl_clone_range(struct file *file, unsigned long argptr)
 {
 	struct btrfs_ioctl_clone_range_args args;
 
@@ -1051,7 +1051,7 @@ long btrfs_ioctl_clone_range(struct file *file, unsigned long argptr)
  * basically own the machine, and have a very in depth understanding
  * of all the possible deadlocks and enospc problems.
  */
-long btrfs_ioctl_trans_start(struct file *file)
+static long btrfs_ioctl_trans_start(struct file *file)
 {
 	struct inode *inode = fdentry(file)->d_inode;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index dbe20d4c6ea4..f99335a999d6 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -276,6 +276,7 @@ out:
 	return ret;
 }
 
+#if 0 /* this will get used when snapshot deletion is implemented */
 int btrfs_del_root_ref(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *tree_root,
 		       u64 root_id, u8 type, u64 ref_id)
@@ -299,6 +300,7 @@ int btrfs_del_root_ref(struct btrfs_trans_handle *trans,
 	btrfs_free_path(path);
 	return ret;
 }
+#endif
 
 int btrfs_find_root_ref(struct btrfs_root *tree_root,
 		   struct btrfs_path *path,
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 1975ea273dc3..93a21c77064a 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -647,7 +647,7 @@ static int btrfs_interface_init(void)
 	return misc_register(&btrfs_misc);
 }
 
-void btrfs_interface_exit(void)
+static void btrfs_interface_exit(void)
 {
 	if (misc_deregister(&btrfs_misc) < 0)
 		printk("misc_deregister failed for control device");
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index be4fc30a30e4..4fcfc8b1189b 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -23,6 +23,7 @@
 #include "locking.h"
 #include "print-tree.h"
 #include "compat.h"
+#include "tree-log.h"
 
 /* magic values for the inode_only field in btrfs_log_inode:
  *
@@ -78,7 +79,7 @@ static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
  * tree of log tree roots.  This must be called with a tree log transaction
  * running (see start_log_trans).
  */
-int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
+static int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
 		      struct btrfs_root *root)
 {
 	struct btrfs_key key;
@@ -1934,7 +1935,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
-int wait_log_commit(struct btrfs_root *log)
+static int wait_log_commit(struct btrfs_root *log)
 {
 	DEFINE_WAIT(wait);
 	u64 transid = log->fs_info->tree_log_transid;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 724ead54529b..769f2c5d9e9e 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -238,7 +238,7 @@ done:
 	return 0;
 }
 
-void pending_bios_fn(struct btrfs_work *work)
+static void pending_bios_fn(struct btrfs_work *work)
 {
 	struct btrfs_device *device;
 
@@ -686,7 +686,7 @@ error:
 	return ret;
 }
 
-int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
+static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
 			  struct btrfs_device *device,
 			  u64 start)
 {
@@ -1393,7 +1393,7 @@ error:
 	goto out;
 }
 
-int noinline btrfs_update_device(struct btrfs_trans_handle *trans,
+static int noinline btrfs_update_device(struct btrfs_trans_handle *trans,
 				 struct btrfs_device *device)
 {
 	int ret;
@@ -1497,7 +1497,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
+static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
 			chunk_offset)
 {
 	struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
@@ -1543,7 +1543,7 @@ int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
 	return ret;
 }
 
-int btrfs_relocate_chunk(struct btrfs_root *root,
+static int btrfs_relocate_chunk(struct btrfs_root *root,
 			 u64 chunk_tree, u64 chunk_objectid,
 			 u64 chunk_offset)
 {
@@ -1884,7 +1884,7 @@ done:
 	return ret;
 }
 
-int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
+static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root,
 			   struct btrfs_key *key,
 			   struct btrfs_chunk *chunk, int item_size)
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index 5b9f7002513c..c4617cde6c73 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -30,6 +30,7 @@
 #include <linux/sched.h>
 #include <linux/pagemap.h>
 #include <linux/bio.h>
+#include "compression.h"
 
 /* Plan: call deflate() with avail_in == *sourcelen,
 	avail_out = *dstlen - 12 and flush == Z_FINISH.
-- 
cgit v1.2.3


From 97288f2c71e46965a2010baea10548fcb5f3af73 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 2 Dec 2008 06:36:09 -0500
Subject: Btrfs: correct fmode_t annotations

Make sure to propagate fmode_t properly and use the right constants for
it.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/btrfs/super.c   | 14 +++++++++-----
 fs/btrfs/volumes.c | 18 +++++++++---------
 fs/btrfs/volumes.h |  4 ++--
 3 files changed, 20 insertions(+), 16 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 93a21c77064a..09908f25fca9 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -242,7 +242,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
  * All other options will be parsed on much later in the mount process and
  * only when we need to allocate a new super block.
  */
-static int btrfs_parse_early_options(const char *options, int flags,
+static int btrfs_parse_early_options(const char *options, fmode_t flags,
 		void *holder, char **subvol_name,
 		struct btrfs_fs_devices **fs_devices)
 {
@@ -418,18 +418,22 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
 	struct super_block *s;
 	struct dentry *root;
 	struct btrfs_fs_devices *fs_devices = NULL;
+	fmode_t mode = FMODE_READ;
 	int error = 0;
 
-	error = btrfs_parse_early_options(data, flags, fs_type,
+	if (!(flags & MS_RDONLY))
+		mode |= FMODE_WRITE;
+
+	error = btrfs_parse_early_options(data, mode, fs_type,
 					  &subvol_name, &fs_devices);
 	if (error)
 		goto error;
 
-	error = btrfs_scan_one_device(dev_name, flags, fs_type, &fs_devices);
+	error = btrfs_scan_one_device(dev_name, mode, fs_type, &fs_devices);
 	if (error)
 		goto error_free_subvol_name;
 
-	error = btrfs_open_devices(fs_devices, flags, fs_type);
+	error = btrfs_open_devices(fs_devices, mode, fs_type);
 	if (error)
 		goto error_free_subvol_name;
 
@@ -591,7 +595,7 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
 	len = strnlen(vol->name, BTRFS_PATH_NAME_MAX);
 	switch (cmd) {
 	case BTRFS_IOC_SCAN_DEV:
-		ret = btrfs_scan_one_device(vol->name, MS_RDONLY,
+		ret = btrfs_scan_one_device(vol->name, FMODE_READ,
 					    &btrfs_fs_type, &fs_devices);
 		break;
 	}
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 769f2c5d9e9e..6c523b3360f6 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -394,7 +394,7 @@ int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
 }
 
 int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
-			 int flags, void *holder)
+			 fmode_t flags, void *holder)
 {
 	struct block_device *bdev;
 	struct list_head *head = &fs_devices->devices;
@@ -469,7 +469,7 @@ int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 error_brelse:
 		brelse(bh);
 error_close:
-		close_bdev_exclusive(bdev, MS_RDONLY);
+		close_bdev_exclusive(bdev, FMODE_READ);
 error:
 		continue;
 	}
@@ -488,7 +488,7 @@ out:
 }
 
 int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
-		       int flags, void *holder)
+		       fmode_t flags, void *holder)
 {
 	int ret;
 
@@ -507,7 +507,7 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 	return ret;
 }
 
-int btrfs_scan_one_device(const char *path, int flags, void *holder,
+int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
 			  struct btrfs_fs_devices **fs_devices_ret)
 {
 	struct btrfs_super_block *disk_super;
@@ -1008,7 +1008,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 			goto out;
 		}
 	} else {
-		bdev = open_bdev_exclusive(device_path, MS_RDONLY,
+		bdev = open_bdev_exclusive(device_path, FMODE_READ,
 				      root->fs_info->bdev_holder);
 		if (IS_ERR(bdev)) {
 			ret = PTR_ERR(bdev);
@@ -1078,7 +1078,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 		BUG_ON(device->writeable);
 		brelse(bh);
 		if (bdev)
-			close_bdev_exclusive(bdev, MS_RDONLY);
+			close_bdev_exclusive(bdev, FMODE_READ);
 
 		if (device->bdev) {
 			close_bdev_exclusive(device->bdev, device->mode);
@@ -1121,7 +1121,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 	}
 	if (bdev) {
 		/* one close for us */
-		close_bdev_exclusive(bdev, MS_RDONLY);
+		close_bdev_exclusive(bdev, FMODE_READ);
 	}
 	kfree(device->name);
 	kfree(device);
@@ -1132,7 +1132,7 @@ error_brelse:
 	brelse(bh);
 error_close:
 	if (bdev)
-		close_bdev_exclusive(bdev, MS_RDONLY);
+		close_bdev_exclusive(bdev, FMODE_READ);
 out:
 	mutex_unlock(&root->fs_info->volume_mutex);
 	mutex_unlock(&uuid_mutex);
@@ -2913,7 +2913,7 @@ static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
 		goto out;
 	}
 
-	ret = __btrfs_open_devices(fs_devices, MS_RDONLY,
+	ret = __btrfs_open_devices(fs_devices, FMODE_READ,
 				   root->fs_info->bdev_holder);
 	if (ret)
 		goto out;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 9b41e4d3984d..fcbdcb3ae13e 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -135,8 +135,8 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 		  int mirror_num, int async_submit);
 int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf);
 int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
-		       int flags, void *holder);
-int btrfs_scan_one_device(const char *path, int flags, void *holder,
+		       fmode_t flags, void *holder);
+int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
 			  struct btrfs_fs_devices **fs_devices_ret);
 int btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
 int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices);
-- 
cgit v1.2.3


From d20f7043fa65659136c1a7c3c456eeeb5c6f431f Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 8 Dec 2008 16:58:54 -0500
Subject: Btrfs: move data checksumming into a dedicated tree

Btrfs stores checksums for each data block.  Until now, they have
been stored in the subvolume trees, indexed by the inode that is
referencing the data block.  This means that when we read the inode,
we've probably read in at least some checksums as well.

But, this has a few problems:

* The checksums are indexed by logical offset in the file.  When
compression is on, this means we have to do the expensive checksumming
on the uncompressed data.  It would be faster if we could checksum
the compressed data instead.

* If we implement encryption, we'll be checksumming the plain text and
storing that on disk.  This is significantly less secure.

* For either compression or encryption, we have to get the plain text
back before we can verify the checksum as correct.  This makes the raid
layer balancing and extent moving much more expensive.

* It makes the front end caching code more complex, as we have to touch
the subvolume and inodes as we cache extents.

* There is potentially one copy of the checksum in each subvolume
referencing an extent.

The solution used here is to store the extent checksums in a dedicated
tree.  This allows us to index the checksums by physical extent
start and length.  It means:

* The checksum is against the data stored on disk, after any compression
or encryption is done.

* The checksum is stored in a central location, and can be verified without
following back references, or reading inodes.

This makes compression significantly faster by reducing the amount of
data that needs to be checksummed.  It will also allow much faster
raid management code in general.

The checksums are indexed by a key with a fixed objectid (a magic value
in ctree.h) and offset set to the starting byte of the extent.  This
allows us to copy the checksum items into the fsync log tree directly (or
any other tree), without having to invent a second format for them.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/compression.c  | 124 ++++++++++++++++++++++++++++----
 fs/btrfs/ctree.h        |  30 +++++---
 fs/btrfs/disk-io.c      |  45 ++++++++++--
 fs/btrfs/extent_io.c    |   5 ++
 fs/btrfs/file-item.c    | 185 ++++++++++++++++++------------------------------
 fs/btrfs/inode.c        |  45 ++++++------
 fs/btrfs/ioctl.c        |  55 +-------------
 fs/btrfs/ordered-data.c |   7 +-
 fs/btrfs/ordered-data.h |  10 +--
 fs/btrfs/tree-log.c     | 121 +++++++++++++++++++++++++++----
 fs/btrfs/volumes.c      |   1 +
 11 files changed, 387 insertions(+), 241 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 4febe2eb0b83..ad7274137309 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -69,11 +69,27 @@ struct compressed_bio {
 
 	/* IO errors */
 	int errors;
+	int mirror_num;
 
 	/* for reads, this is the bio we are copying the data into */
 	struct bio *orig_bio;
+
+	/*
+	 * the start of a variable length array of checksums only
+	 * used by reads
+	 */
+	u32 sums;
 };
 
+static inline int compressed_bio_size(struct btrfs_root *root,
+				      unsigned long disk_size)
+{
+	u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy);
+	return sizeof(struct compressed_bio) +
+		((disk_size + root->sectorsize - 1) / root->sectorsize) *
+		csum_size;
+}
+
 static struct bio *compressed_bio_alloc(struct block_device *bdev,
 					u64 first_byte, gfp_t gfp_flags)
 {
@@ -96,6 +112,47 @@ static struct bio *compressed_bio_alloc(struct block_device *bdev,
 	return bio;
 }
 
+static int check_compressed_csum(struct inode *inode,
+				 struct compressed_bio *cb,
+				 u64 disk_start)
+{
+	int ret;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct page *page;
+	unsigned long i;
+	char *kaddr;
+	u32 csum;
+	u32 *cb_sum = &cb->sums;
+
+	if (btrfs_test_opt(root, NODATASUM) ||
+	    btrfs_test_flag(inode, NODATASUM))
+		return 0;
+
+	for (i = 0; i < cb->nr_pages; i++) {
+		page = cb->compressed_pages[i];
+		csum = ~(u32)0;
+
+		kaddr = kmap_atomic(page, KM_USER0);
+		csum = btrfs_csum_data(root, kaddr, csum, PAGE_CACHE_SIZE);
+		btrfs_csum_final(csum, (char *)&csum);
+		kunmap_atomic(kaddr, KM_USER0);
+
+		if (csum != *cb_sum) {
+			printk("btrfs csum failed ino %lu extent %llu csum %u "
+			       "wanted %u mirror %d\n", inode->i_ino,
+			       (unsigned long long)disk_start,
+			       csum, *cb_sum, cb->mirror_num);
+			ret = -EIO;
+			goto fail;
+		}
+		cb_sum++;
+
+	}
+	ret = 0;
+fail:
+	return ret;
+}
+
 /* when we finish reading compressed pages from the disk, we
  * decompress them and then run the bio end_io routines on the
  * decompressed pages (in the inode address space).
@@ -124,16 +181,21 @@ static void end_compressed_bio_read(struct bio *bio, int err)
 	if (!atomic_dec_and_test(&cb->pending_bios))
 		goto out;
 
+	inode = cb->inode;
+	ret = check_compressed_csum(inode, cb, (u64)bio->bi_sector << 9);
+	if (ret)
+		goto csum_failed;
+
 	/* ok, we're the last bio for this extent, lets start
 	 * the decompression.
 	 */
-	inode = cb->inode;
 	tree = &BTRFS_I(inode)->io_tree;
 	ret = btrfs_zlib_decompress_biovec(cb->compressed_pages,
 					cb->start,
 					cb->orig_bio->bi_io_vec,
 					cb->orig_bio->bi_vcnt,
 					cb->compressed_len);
+csum_failed:
 	if (ret)
 		cb->errors = 1;
 
@@ -148,8 +210,21 @@ static void end_compressed_bio_read(struct bio *bio, int err)
 	/* do io completion on the original bio */
 	if (cb->errors) {
 		bio_io_error(cb->orig_bio);
-	} else
+	} else {
+		int bio_index = 0;
+		struct bio_vec *bvec = cb->orig_bio->bi_io_vec;
+
+		/*
+		 * we have verified the checksum already, set page
+		 * checked so the end_io handlers know about it
+		 */
+		while(bio_index < cb->orig_bio->bi_vcnt) {
+			SetPageChecked(bvec->bv_page);
+			bvec++;
+			bio_index++;
+		}
 		bio_endio(cb->orig_bio, 0);
+	}
 
 	/* finally free the cb struct */
 	kfree(cb->compressed_pages);
@@ -277,12 +352,13 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
 	int ret;
 
 	WARN_ON(start & ((u64)PAGE_CACHE_SIZE - 1));
-	cb = kmalloc(sizeof(*cb), GFP_NOFS);
+	cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS);
 	atomic_set(&cb->pending_bios, 0);
 	cb->errors = 0;
 	cb->inode = inode;
 	cb->start = start;
 	cb->len = len;
+	cb->mirror_num = 0;
 	cb->compressed_pages = compressed_pages;
 	cb->compressed_len = compressed_len;
 	cb->orig_bio = NULL;
@@ -290,9 +366,6 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
 
 	bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
 
-	ret = btrfs_csum_file_bytes(root, inode, start, len);
-	BUG_ON(ret);
-
 	bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS);
 	bio->bi_private = cb;
 	bio->bi_end_io = end_compressed_bio_write;
@@ -325,6 +398,9 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
 			ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
 			BUG_ON(ret);
 
+			ret = btrfs_csum_one_bio(root, inode, bio, start, 1);
+			BUG_ON(ret);
+
 			ret = btrfs_map_bio(root, WRITE, bio, 0, 1);
 			BUG_ON(ret);
 
@@ -348,6 +424,9 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
 	ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
 	BUG_ON(ret);
 
+	ret = btrfs_csum_one_bio(root, inode, bio, start, 1);
+	BUG_ON(ret);
+
 	ret = btrfs_map_bio(root, WRITE, bio, 0, 1);
 	BUG_ON(ret);
 
@@ -510,6 +589,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 	u64 em_start;
 	struct extent_map *em;
 	int ret;
+	u32 *sums;
 
 	tree = &BTRFS_I(inode)->io_tree;
 	em_tree = &BTRFS_I(inode)->extent_tree;
@@ -521,15 +601,18 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 				   PAGE_CACHE_SIZE);
 	spin_unlock(&em_tree->lock);
 
-	cb = kmalloc(sizeof(*cb), GFP_NOFS);
+	compressed_len = em->block_len;
+	cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS);
 	atomic_set(&cb->pending_bios, 0);
 	cb->errors = 0;
 	cb->inode = inode;
+	cb->mirror_num = mirror_num;
+	sums = &cb->sums;
 
 	cb->start = em->orig_start;
-	compressed_len = em->block_len;
 	em_len = em->len;
 	em_start = em->start;
+
 	free_extent_map(em);
 	em = NULL;
 
@@ -551,11 +634,6 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 
 	add_ra_bio_pages(inode, em_start + em_len, cb);
 
-	if (!btrfs_test_opt(root, NODATASUM) &&
-	    !btrfs_test_flag(inode, NODATASUM)) {
-		btrfs_lookup_bio_sums(root, inode, cb->orig_bio);
-	}
-
 	/* include any pages we added in add_ra-bio_pages */
 	uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE;
 	cb->len = uncompressed_len;
@@ -568,6 +646,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 	for (page_index = 0; page_index < nr_pages; page_index++) {
 		page = cb->compressed_pages[page_index];
 		page->mapping = inode->i_mapping;
+		page->index = em_start >> PAGE_CACHE_SHIFT;
+
 		if (comp_bio->bi_size)
 			ret = tree->ops->merge_bio_hook(page, 0,
 							PAGE_CACHE_SIZE,
@@ -591,7 +671,16 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 			 */
 			atomic_inc(&cb->pending_bios);
 
-			ret = btrfs_map_bio(root, READ, comp_bio, 0, 0);
+			if (!btrfs_test_opt(root, NODATASUM) &&
+			    !btrfs_test_flag(inode, NODATASUM)) {
+				btrfs_lookup_bio_sums(root, inode, comp_bio,
+						      sums);
+			}
+			sums += (comp_bio->bi_size + root->sectorsize - 1) /
+				root->sectorsize;
+
+			ret = btrfs_map_bio(root, READ, comp_bio,
+					    mirror_num, 0);
 			BUG_ON(ret);
 
 			bio_put(comp_bio);
@@ -610,7 +699,12 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 	ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
 	BUG_ON(ret);
 
-	ret = btrfs_map_bio(root, READ, comp_bio, 0, 0);
+	if (!btrfs_test_opt(root, NODATASUM) &&
+	    !btrfs_test_flag(inode, NODATASUM)) {
+		btrfs_lookup_bio_sums(root, inode, comp_bio, sums);
+	}
+
+	ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0);
 	BUG_ON(ret);
 
 	bio_put(comp_bio);
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 96f2ec7ad5bd..242b961ae6de 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -73,6 +73,9 @@ struct btrfs_ordered_sum;
 /* directory objectid inside the root tree */
 #define BTRFS_ROOT_TREE_DIR_OBJECTID 6ULL
 
+/* holds checksums of all the data extents */
+#define BTRFS_CSUM_TREE_OBJECTID 7ULL
+
 /* orhpan objectid for tracking unlinked/truncated files */
 #define BTRFS_ORPHAN_OBJECTID -5ULL
 
@@ -84,6 +87,13 @@ struct btrfs_ordered_sum;
 #define BTRFS_TREE_RELOC_OBJECTID -8ULL
 #define BTRFS_DATA_RELOC_TREE_OBJECTID -9ULL
 
+/*
+ * extent checksums all have this objectid
+ * this allows them to share the logging tree
+ * for fsyncs
+ */
+#define BTRFS_EXTENT_CSUM_OBJECTID -10ULL
+
 /* dummy objectid represents multiple objectids */
 #define BTRFS_MULTIPLE_OBJECTIDS -255ULL
 
@@ -634,6 +644,7 @@ struct btrfs_fs_info {
 	struct btrfs_root *chunk_root;
 	struct btrfs_root *dev_root;
 	struct btrfs_root *fs_root;
+	struct btrfs_root *csum_root;
 
 	/* the log root tree is a directory of all the other log roots */
 	struct btrfs_root *log_root_tree;
@@ -716,6 +727,7 @@ struct btrfs_fs_info {
 	struct btrfs_workers workers;
 	struct btrfs_workers delalloc_workers;
 	struct btrfs_workers endio_workers;
+	struct btrfs_workers endio_meta_workers;
 	struct btrfs_workers endio_write_workers;
 	struct btrfs_workers submit_workers;
 	/*
@@ -858,13 +870,12 @@ struct btrfs_root {
  * extent data is for file data
  */
 #define BTRFS_EXTENT_DATA_KEY	108
+
 /*
- * csum items have the checksums for data in the extents
+ * extent csums are stored in a separate tree and hold csums for
+ * an entire extent on disk.
  */
-#define BTRFS_CSUM_ITEM_KEY	120
-
-
-/* reserve 21-31 for other file/dir stuff */
+#define BTRFS_EXTENT_CSUM_KEY	128
 
 /*
  * root items point to tree roots.  There are typically in the root
@@ -1917,7 +1928,7 @@ int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
 
 /* file-item.c */
 int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
-			  struct bio *bio);
+			  struct bio *bio, u32 *dst);
 int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root,
 			     u64 objectid, u64 pos,
@@ -1929,17 +1940,16 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
 			     struct btrfs_path *path, u64 objectid,
 			     u64 bytenr, int mod);
 int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
-			   struct btrfs_root *root, struct inode *inode,
+			   struct btrfs_root *root,
 			   struct btrfs_ordered_sum *sums);
 int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
-		       struct bio *bio);
+		       struct bio *bio, u64 file_start, int contig);
 int btrfs_csum_file_bytes(struct btrfs_root *root, struct inode *inode,
 			  u64 start, unsigned long len);
 struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
 					  struct btrfs_root *root,
 					  struct btrfs_path *path,
-					  u64 objectid, u64 offset,
-					  int cow);
+					  u64 bytenr, int cow);
 int btrfs_csum_truncate(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root, struct btrfs_path *path,
 			u64 isize);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 3eb7c2576fe5..61dc3b2c834b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -445,11 +445,18 @@ static void end_workqueue_bio(struct bio *bio, int err)
 	end_io_wq->error = err;
 	end_io_wq->work.func = end_workqueue_fn;
 	end_io_wq->work.flags = 0;
-	if (bio->bi_rw & (1 << BIO_RW))
+
+	if (bio->bi_rw & (1 << BIO_RW)) {
 		btrfs_queue_worker(&fs_info->endio_write_workers,
 				   &end_io_wq->work);
-	else
-		btrfs_queue_worker(&fs_info->endio_workers, &end_io_wq->work);
+	} else {
+		if (end_io_wq->metadata)
+			btrfs_queue_worker(&fs_info->endio_meta_workers,
+					   &end_io_wq->work);
+		else
+			btrfs_queue_worker(&fs_info->endio_workers,
+					   &end_io_wq->work);
+	}
 }
 
 int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
@@ -1208,6 +1215,9 @@ static void __unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
 	info = (struct btrfs_fs_info *)bdi->unplug_io_data;
 	list_for_each(cur, &info->fs_devices->devices) {
 		device = list_entry(cur, struct btrfs_device, dev_list);
+		if (!device->bdev)
+			continue;
+
 		bdi = blk_get_backing_dev_info(device->bdev);
 		if (bdi->unplug_io_fn) {
 			bdi->unplug_io_fn(bdi, page);
@@ -1344,7 +1354,7 @@ static void end_workqueue_fn(struct btrfs_work *work)
 	 * blocksize <= pagesize, it is basically a noop
 	 */
 	if (end_io_wq->metadata && !bio_ready_for_csum(bio)) {
-		btrfs_queue_worker(&fs_info->endio_workers,
+		btrfs_queue_worker(&fs_info->endio_meta_workers,
 				   &end_io_wq->work);
 		return;
 	}
@@ -1454,6 +1464,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	struct buffer_head *bh;
 	struct btrfs_root *extent_root = kzalloc(sizeof(struct btrfs_root),
 						 GFP_NOFS);
+	struct btrfs_root *csum_root = kzalloc(sizeof(struct btrfs_root),
+						 GFP_NOFS);
 	struct btrfs_root *tree_root = kzalloc(sizeof(struct btrfs_root),
 					       GFP_NOFS);
 	struct btrfs_fs_info *fs_info = kzalloc(sizeof(*fs_info),
@@ -1470,7 +1482,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	struct btrfs_super_block *disk_super;
 
 	if (!extent_root || !tree_root || !fs_info ||
-	    !chunk_root || !dev_root) {
+	    !chunk_root || !dev_root || !csum_root) {
 		err = -ENOMEM;
 		goto fail;
 	}
@@ -1487,6 +1499,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	init_completion(&fs_info->kobj_unregister);
 	fs_info->tree_root = tree_root;
 	fs_info->extent_root = extent_root;
+	fs_info->csum_root = csum_root;
 	fs_info->chunk_root = chunk_root;
 	fs_info->dev_root = dev_root;
 	fs_info->fs_devices = fs_devices;
@@ -1652,6 +1665,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1);
 	btrfs_init_workers(&fs_info->endio_workers, "endio",
 			   fs_info->thread_pool_size);
+	btrfs_init_workers(&fs_info->endio_meta_workers, "endio-meta",
+			   fs_info->thread_pool_size);
 	btrfs_init_workers(&fs_info->endio_write_workers, "endio-write",
 			   fs_info->thread_pool_size);
 
@@ -1667,6 +1682,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	btrfs_start_workers(&fs_info->delalloc_workers, 1);
 	btrfs_start_workers(&fs_info->fixup_workers, 1);
 	btrfs_start_workers(&fs_info->endio_workers, fs_info->thread_pool_size);
+	btrfs_start_workers(&fs_info->endio_meta_workers,
+			    fs_info->thread_pool_size);
 	btrfs_start_workers(&fs_info->endio_write_workers,
 			    fs_info->thread_pool_size);
 
@@ -1751,6 +1768,13 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	if (ret)
 		goto fail_extent_root;
 
+	ret = find_and_setup_root(tree_root, fs_info,
+				  BTRFS_CSUM_TREE_OBJECTID, csum_root);
+	if (ret)
+		goto fail_extent_root;
+
+	csum_root->track_dirty = 1;
+
 	btrfs_read_block_groups(extent_root);
 
 	fs_info->generation = generation + 1;
@@ -1761,7 +1785,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
 					       "btrfs-cleaner");
 	if (!fs_info->cleaner_kthread)
-		goto fail_extent_root;
+		goto fail_csum_root;
 
 	fs_info->transaction_kthread = kthread_run(transaction_kthread,
 						   tree_root,
@@ -1825,6 +1849,8 @@ fail_cleaner:
 	filemap_write_and_wait(fs_info->btree_inode->i_mapping);
 	invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
 
+fail_csum_root:
+	free_extent_buffer(csum_root->node);
 fail_extent_root:
 	free_extent_buffer(extent_root->node);
 fail_tree_root:
@@ -1838,6 +1864,7 @@ fail_sb_buffer:
 	btrfs_stop_workers(&fs_info->delalloc_workers);
 	btrfs_stop_workers(&fs_info->workers);
 	btrfs_stop_workers(&fs_info->endio_workers);
+	btrfs_stop_workers(&fs_info->endio_meta_workers);
 	btrfs_stop_workers(&fs_info->endio_write_workers);
 	btrfs_stop_workers(&fs_info->submit_workers);
 fail_iput:
@@ -1853,6 +1880,7 @@ fail:
 	kfree(fs_info);
 	kfree(chunk_root);
 	kfree(dev_root);
+	kfree(csum_root);
 	return ERR_PTR(err);
 }
 
@@ -2131,6 +2159,9 @@ int close_ctree(struct btrfs_root *root)
 	if (root->fs_info->dev_root->node);
 		free_extent_buffer(root->fs_info->dev_root->node);
 
+	if (root->fs_info->csum_root->node);
+		free_extent_buffer(root->fs_info->csum_root->node);
+
 	btrfs_free_block_groups(root->fs_info);
 
 	del_fs_roots(fs_info);
@@ -2141,6 +2172,7 @@ int close_ctree(struct btrfs_root *root)
 	btrfs_stop_workers(&fs_info->delalloc_workers);
 	btrfs_stop_workers(&fs_info->workers);
 	btrfs_stop_workers(&fs_info->endio_workers);
+	btrfs_stop_workers(&fs_info->endio_meta_workers);
 	btrfs_stop_workers(&fs_info->endio_write_workers);
 	btrfs_stop_workers(&fs_info->submit_workers);
 
@@ -2163,6 +2195,7 @@ int close_ctree(struct btrfs_root *root)
 	kfree(fs_info->tree_root);
 	kfree(fs_info->chunk_root);
 	kfree(fs_info->dev_root);
+	kfree(fs_info->csum_root);
 	return 0;
 }
 
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index c3dfe2a0ec85..7449ecf32c50 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1732,6 +1732,9 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
 	int whole_page;
 	int ret;
 
+	if (err)
+		uptodate = 0;
+
 	do {
 		struct page *page = bvec->bv_page;
 		tree = &BTRFS_I(page->mapping->host)->io_tree;
@@ -1761,6 +1764,8 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
 			if (ret == 0) {
 				uptodate =
 					test_bit(BIO_UPTODATE, &bio->bi_flags);
+				if (err)
+					uptodate = 0;
 				continue;
 			}
 		}
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 234ed441736c..a3ad2ce00116 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -74,8 +74,7 @@ out:
 struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
 					  struct btrfs_root *root,
 					  struct btrfs_path *path,
-					  u64 objectid, u64 offset,
-					  int cow)
+					  u64 bytenr, int cow)
 {
 	int ret;
 	struct btrfs_key file_key;
@@ -87,9 +86,9 @@ struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
 		btrfs_super_csum_size(&root->fs_info->super_copy);
 	int csums_in_item;
 
-	file_key.objectid = objectid;
-	file_key.offset = offset;
-	btrfs_set_key_type(&file_key, BTRFS_CSUM_ITEM_KEY);
+	file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
+	file_key.offset = bytenr;
+	btrfs_set_key_type(&file_key, BTRFS_EXTENT_CSUM_KEY);
 	ret = btrfs_search_slot(trans, root, &file_key, path, 0, cow);
 	if (ret < 0)
 		goto fail;
@@ -100,11 +99,10 @@ struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
 			goto fail;
 		path->slots[0]--;
 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
-		if (btrfs_key_type(&found_key) != BTRFS_CSUM_ITEM_KEY ||
-		    found_key.objectid != objectid) {
+		if (btrfs_key_type(&found_key) != BTRFS_EXTENT_CSUM_KEY)
 			goto fail;
-		}
-		csum_offset = (offset - found_key.offset) >>
+
+		csum_offset = (bytenr - found_key.offset) >>
 				root->fs_info->sb->s_blocksize_bits;
 		csums_in_item = btrfs_item_size_nr(leaf, path->slots[0]);
 		csums_in_item /= csum_size;
@@ -143,7 +141,7 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
 }
 
 int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
-			  struct bio *bio)
+			  struct bio *bio, u32 *dst)
 {
 	u32 sum;
 	struct bio_vec *bvec = bio->bi_io_vec;
@@ -151,6 +149,7 @@ int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
 	u64 offset;
 	u64 item_start_offset = 0;
 	u64 item_last_offset = 0;
+	u64 disk_bytenr;
 	u32 diff;
 	u16 csum_size =
 		btrfs_super_csum_size(&root->fs_info->super_copy);
@@ -165,21 +164,22 @@ int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
 
 	WARN_ON(bio->bi_vcnt <= 0);
 
+	disk_bytenr = (u64)bio->bi_sector << 9;
 	while(bio_index < bio->bi_vcnt) {
 		offset = page_offset(bvec->bv_page) + bvec->bv_offset;
-		ret = btrfs_find_ordered_sum(inode, offset, &sum);
+		ret = btrfs_find_ordered_sum(inode, offset, disk_bytenr, &sum);
 		if (ret == 0)
 			goto found;
 
-		if (!item || offset < item_start_offset ||
-		    offset >= item_last_offset) {
+		if (!item || disk_bytenr < item_start_offset ||
+		    disk_bytenr >= item_last_offset) {
 			struct btrfs_key found_key;
 			u32 item_size;
 
 			if (item)
 				btrfs_release_path(root, path);
-			item = btrfs_lookup_csum(NULL, root, path,
-						 inode->i_ino, offset, 0);
+			item = btrfs_lookup_csum(NULL, root->fs_info->csum_root,
+						 path, disk_bytenr, 0);
 			if (IS_ERR(item)) {
 				ret = PTR_ERR(item);
 				if (ret == -ENOENT || ret == -EFBIG)
@@ -208,7 +208,7 @@ int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
 		 * this byte range must be able to fit inside
 		 * a single leaf so it will also fit inside a u32
 		 */
-		diff = offset - item_start_offset;
+		diff = disk_bytenr - item_start_offset;
 		diff = diff / root->sectorsize;
 		diff = diff * csum_size;
 
@@ -216,7 +216,11 @@ int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
 				   ((unsigned long)item) + diff,
 				   csum_size);
 found:
-		set_state_private(io_tree, offset, sum);
+		if (dst)
+			*dst++ = sum;
+		else
+			set_state_private(io_tree, offset, sum);
+		disk_bytenr += bvec->bv_len;
 		bio_index++;
 		bvec++;
 	}
@@ -224,75 +228,8 @@ found:
 	return 0;
 }
 
-int btrfs_csum_file_bytes(struct btrfs_root *root, struct inode *inode,
-			  u64 start, unsigned long len)
-{
-	struct btrfs_ordered_sum *sums;
-	struct btrfs_sector_sum *sector_sum;
-	struct btrfs_ordered_extent *ordered;
-	char *data;
-	struct page *page;
-	unsigned long total_bytes = 0;
-	unsigned long this_sum_bytes = 0;
-
-	sums = kzalloc(btrfs_ordered_sum_size(root, len), GFP_NOFS);
-	if (!sums)
-		return -ENOMEM;
-
-	sector_sum = sums->sums;
-	sums->file_offset = start;
-	sums->len = len;
-	INIT_LIST_HEAD(&sums->list);
-	ordered = btrfs_lookup_ordered_extent(inode, sums->file_offset);
-	BUG_ON(!ordered);
-
-	while(len > 0) {
-		if (start >= ordered->file_offset + ordered->len ||
-		    start < ordered->file_offset) {
-			sums->len = this_sum_bytes;
-			this_sum_bytes = 0;
-			btrfs_add_ordered_sum(inode, ordered, sums);
-			btrfs_put_ordered_extent(ordered);
-
-			sums = kzalloc(btrfs_ordered_sum_size(root, len),
-				       GFP_NOFS);
-			BUG_ON(!sums);
-			sector_sum = sums->sums;
-			sums->len = len;
-			sums->file_offset = start;
-			ordered = btrfs_lookup_ordered_extent(inode,
-						      sums->file_offset);
-			BUG_ON(!ordered);
-		}
-
-		page = find_get_page(inode->i_mapping,
-				     start >> PAGE_CACHE_SHIFT);
-
-		data = kmap_atomic(page, KM_USER0);
-		sector_sum->sum = ~(u32)0;
-		sector_sum->sum = btrfs_csum_data(root, data, sector_sum->sum,
-						  PAGE_CACHE_SIZE);
-		kunmap_atomic(data, KM_USER0);
-		btrfs_csum_final(sector_sum->sum,
-				 (char *)&sector_sum->sum);
-		sector_sum->offset = page_offset(page);
-		page_cache_release(page);
-
-		sector_sum++;
-		total_bytes += PAGE_CACHE_SIZE;
-		this_sum_bytes += PAGE_CACHE_SIZE;
-		start += PAGE_CACHE_SIZE;
-
-		WARN_ON(len < PAGE_CACHE_SIZE);
-		len -= PAGE_CACHE_SIZE;
-	}
-	btrfs_add_ordered_sum(inode, ordered, sums);
-	btrfs_put_ordered_extent(ordered);
-	return 0;
-}
-
 int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
-		       struct bio *bio)
+		       struct bio *bio, u64 file_start, int contig)
 {
 	struct btrfs_ordered_sum *sums;
 	struct btrfs_sector_sum *sector_sum;
@@ -303,6 +240,7 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
 	unsigned long total_bytes = 0;
 	unsigned long this_sum_bytes = 0;
 	u64 offset;
+	u64 disk_bytenr;
 
 	WARN_ON(bio->bi_vcnt <= 0);
 	sums = kzalloc(btrfs_ordered_sum_size(root, bio->bi_size), GFP_NOFS);
@@ -310,16 +248,25 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
 		return -ENOMEM;
 
 	sector_sum = sums->sums;
-	sums->file_offset = page_offset(bvec->bv_page) + bvec->bv_offset;
+	disk_bytenr = (u64)bio->bi_sector << 9;
 	sums->len = bio->bi_size;
 	INIT_LIST_HEAD(&sums->list);
-	ordered = btrfs_lookup_ordered_extent(inode, sums->file_offset);
+
+	if (contig)
+		offset = file_start;
+	else
+		offset = page_offset(bvec->bv_page) + bvec->bv_offset;
+
+	ordered = btrfs_lookup_ordered_extent(inode, offset);
 	BUG_ON(!ordered);
+	sums->bytenr = ordered->start;
 
 	while(bio_index < bio->bi_vcnt) {
-		offset = page_offset(bvec->bv_page) + bvec->bv_offset;
-		if (offset >= ordered->file_offset + ordered->len ||
-		    offset < ordered->file_offset) {
+		if (!contig)
+			offset = page_offset(bvec->bv_page) + bvec->bv_offset;
+
+		if (!contig && (offset >= ordered->file_offset + ordered->len ||
+		    offset < ordered->file_offset)) {
 			unsigned long bytes_left;
 			sums->len = this_sum_bytes;
 			this_sum_bytes = 0;
@@ -333,10 +280,9 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
 			BUG_ON(!sums);
 			sector_sum = sums->sums;
 			sums->len = bytes_left;
-			sums->file_offset = offset;
-			ordered = btrfs_lookup_ordered_extent(inode,
-						      sums->file_offset);
+			ordered = btrfs_lookup_ordered_extent(inode, offset);
 			BUG_ON(!ordered);
+			sums->bytenr = ordered->start;
 		}
 
 		data = kmap_atomic(bvec->bv_page, KM_USER0);
@@ -348,13 +294,14 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
 		kunmap_atomic(data, KM_USER0);
 		btrfs_csum_final(sector_sum->sum,
 				 (char *)&sector_sum->sum);
-		sector_sum->offset = page_offset(bvec->bv_page) +
-			bvec->bv_offset;
+		sector_sum->bytenr = disk_bytenr;
 
 		sector_sum++;
 		bio_index++;
 		total_bytes += bvec->bv_len;
 		this_sum_bytes += bvec->bv_len;
+		disk_bytenr += bvec->bv_len;
+		offset += bvec->bv_len;
 		bvec++;
 	}
 	this_sum_bytes = 0;
@@ -364,11 +311,10 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
 }
 
 int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
-			   struct btrfs_root *root, struct inode *inode,
+			   struct btrfs_root *root,
 			   struct btrfs_ordered_sum *sums)
 {
-	u64 objectid = inode->i_ino;
-	u64 offset;
+	u64 bytenr;
 	int ret;
 	struct btrfs_key file_key;
 	struct btrfs_key found_key;
@@ -396,13 +342,12 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
 again:
 	next_offset = (u64)-1;
 	found_next = 0;
-	offset = sector_sum->offset;
-	file_key.objectid = objectid;
-	file_key.offset = offset;
-	btrfs_set_key_type(&file_key, BTRFS_CSUM_ITEM_KEY);
+	file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
+	file_key.offset = sector_sum->bytenr;
+	bytenr = sector_sum->bytenr;
+	btrfs_set_key_type(&file_key, BTRFS_EXTENT_CSUM_KEY);
 
-	mutex_lock(&BTRFS_I(inode)->csum_mutex);
-	item = btrfs_lookup_csum(trans, root, path, objectid, offset, 1);
+	item = btrfs_lookup_csum(trans, root, path, sector_sum->bytenr, 1);
 	if (!IS_ERR(item)) {
 		leaf = path->nodes[0];
 		ret = 0;
@@ -432,8 +377,8 @@ again:
 			slot = 0;
 		}
 		btrfs_item_key_to_cpu(path->nodes[0], &found_key, slot);
-		if (found_key.objectid != objectid ||
-		    found_key.type != BTRFS_CSUM_ITEM_KEY) {
+		if (found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
+		    found_key.type != BTRFS_EXTENT_CSUM_KEY) {
 			found_next = 1;
 			goto insert;
 		}
@@ -460,10 +405,10 @@ again:
 	path->slots[0]--;
 	leaf = path->nodes[0];
 	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
-	csum_offset = (offset - found_key.offset) >>
+	csum_offset = (bytenr - found_key.offset) >>
 			root->fs_info->sb->s_blocksize_bits;
-	if (btrfs_key_type(&found_key) != BTRFS_CSUM_ITEM_KEY ||
-	    found_key.objectid != objectid ||
+	if (btrfs_key_type(&found_key) != BTRFS_EXTENT_CSUM_KEY ||
+	    found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
 	    csum_offset >= MAX_CSUM_ITEMS(root, csum_size)) {
 		goto insert;
 	}
@@ -482,8 +427,18 @@ insert:
 	btrfs_release_path(root, path);
 	csum_offset = 0;
 	if (found_next) {
-		u64 tmp = min((u64)i_size_read(inode), next_offset);
-		tmp -= offset & ~((u64)root->sectorsize -1);
+		u64 tmp = total_bytes + root->sectorsize;
+		u64 next_sector = sector_sum->bytenr;
+		struct btrfs_sector_sum *next = sector_sum + 1;
+
+		while(tmp < sums->len) {
+			if (next_sector + root->sectorsize != next->bytenr)
+				break;
+			tmp += root->sectorsize;
+			next_sector = next->bytenr;
+			next++;
+		}
+		tmp = min(tmp, next_offset - file_key.offset);
 		tmp >>= root->fs_info->sb->s_blocksize_bits;
 		tmp = max((u64)1, tmp);
 		tmp = min(tmp, (u64)MAX_CSUM_ITEMS(root, csum_size));
@@ -510,7 +465,6 @@ found:
 	item_end = (struct btrfs_csum_item *)((unsigned char *)item_end +
 				      btrfs_item_size_nr(leaf, path->slots[0]));
 	eb_token = NULL;
-	mutex_unlock(&BTRFS_I(inode)->csum_mutex);
 	cond_resched();
 next_sector:
 
@@ -541,9 +495,9 @@ next_sector:
 	if (total_bytes < sums->len) {
 		item = (struct btrfs_csum_item *)((char *)item +
 						  csum_size);
-		if (item < item_end && offset + PAGE_CACHE_SIZE ==
-		    sector_sum->offset) {
-			    offset = sector_sum->offset;
+		if (item < item_end && bytenr + PAGE_CACHE_SIZE ==
+		    sector_sum->bytenr) {
+			bytenr = sector_sum->bytenr;
 			goto next_sector;
 		}
 	}
@@ -562,7 +516,6 @@ out:
 	return ret;
 
 fail_unlock:
-	mutex_unlock(&BTRFS_I(inode)->csum_mutex);
 	goto out;
 }
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 09efc9473a3d..c03d847b8c4e 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1221,7 +1221,7 @@ static int __btrfs_submit_bio_start(struct inode *inode, int rw, struct bio *bio
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	int ret = 0;
 
-	ret = btrfs_csum_one_bio(root, inode, bio);
+	ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
 	BUG_ON(ret);
 	return 0;
 }
@@ -1259,12 +1259,11 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 		btrfs_test_flag(inode, NODATASUM);
 
 	if (!(rw & (1 << BIO_RW))) {
-
-		if (bio_flags & EXTENT_BIO_COMPRESSED)
+		if (bio_flags & EXTENT_BIO_COMPRESSED) {
 			return btrfs_submit_compressed_read(inode, bio,
 						    mirror_num, bio_flags);
-		else if (!skip_sum)
-			btrfs_lookup_bio_sums(root, inode, bio);
+		} else if (!skip_sum)
+			btrfs_lookup_bio_sums(root, inode, bio, NULL);
 		goto mapit;
 	} else if (!skip_sum) {
 		/* we're doing a write, do the async checksumming */
@@ -1292,8 +1291,8 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
 	btrfs_set_trans_block_group(trans, inode);
 	list_for_each(cur, list) {
 		sum = list_entry(cur, struct btrfs_ordered_sum, list);
-		btrfs_csum_file_blocks(trans, BTRFS_I(inode)->root,
-				       inode, sum);
+		btrfs_csum_file_blocks(trans,
+		       BTRFS_I(inode)->root->fs_info->csum_root, sum);
 	}
 	return 0;
 }
@@ -1545,6 +1544,7 @@ struct io_failure_record {
 	u64 start;
 	u64 len;
 	u64 logical;
+	unsigned long bio_flags;
 	int last_mirror;
 };
 
@@ -1563,7 +1563,6 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
 	int ret;
 	int rw;
 	u64 logical;
-	unsigned long bio_flags = 0;
 
 	ret = get_state_private(failure_tree, start, &private);
 	if (ret) {
@@ -1573,6 +1572,7 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
 		failrec->start = start;
 		failrec->len = end - start + 1;
 		failrec->last_mirror = 0;
+		failrec->bio_flags = 0;
 
 		spin_lock(&em_tree->lock);
 		em = lookup_extent_mapping(em_tree, start, failrec->len);
@@ -1588,8 +1588,10 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
 		}
 		logical = start - em->start;
 		logical = em->block_start + logical;
-		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
-			bio_flags = EXTENT_BIO_COMPRESSED;
+		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
+			logical = em->block_start;
+			failrec->bio_flags = EXTENT_BIO_COMPRESSED;
+		}
 		failrec->logical = logical;
 		free_extent_map(em);
 		set_extent_bits(failure_tree, start, end, EXTENT_LOCKED |
@@ -1626,6 +1628,7 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
 	bio->bi_sector = failrec->logical >> 9;
 	bio->bi_bdev = failed_bio->bi_bdev;
 	bio->bi_size = 0;
+
 	bio_add_page(bio, page, failrec->len, start - page_offset(page));
 	if (failed_bio->bi_rw & (1 << BIO_RW))
 		rw = WRITE;
@@ -1634,7 +1637,7 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
 
 	BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio,
 						      failrec->last_mirror,
-						      bio_flags);
+						      failrec->bio_flags);
 	return 0;
 }
 
@@ -1688,9 +1691,14 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 	u32 csum = ~(u32)0;
 	unsigned long flags;
 
+	if (PageChecked(page)) {
+		ClearPageChecked(page);
+		goto good;
+	}
 	if (btrfs_test_opt(root, NODATASUM) ||
 	    btrfs_test_flag(inode, NODATASUM))
 		return 0;
+
 	if (state && state->start == start) {
 		private = state->private;
 		ret = 0;
@@ -1709,7 +1717,7 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 	}
 	kunmap_atomic(kaddr, KM_IRQ0);
 	local_irq_restore(flags);
-
+good:
 	/* if the io failure tree for this inode is non-empty,
 	 * check to see if we've recovered from a failed IO
 	 */
@@ -2243,6 +2251,7 @@ fail:
 	return err;
 }
 
+#if 0
 /*
  * when truncating bytes in a file, it is possible to avoid reading
  * the leaves that contain only checksum items.  This can be the
@@ -2410,6 +2419,8 @@ out:
 	return ret;
 }
 
+#endif
+
 /*
  * this can truncate away extent items, csum items and directory items.
  * It starts at a high offset and removes keys until it can't find
@@ -2459,9 +2470,6 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 
 	btrfs_init_path(path);
 
-	ret = drop_csum_leaves(trans, root, path, inode, new_size);
-	BUG_ON(ret);
-
 search_again:
 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
 	if (ret < 0) {
@@ -2509,16 +2517,11 @@ search_again:
 			}
 			item_end--;
 		}
-		if (found_type == BTRFS_CSUM_ITEM_KEY) {
-			ret = btrfs_csum_truncate(trans, root, path,
-						  new_size);
-			BUG_ON(ret);
-		}
 		if (item_end < new_size) {
 			if (found_type == BTRFS_DIR_ITEM_KEY) {
 				found_type = BTRFS_INODE_ITEM_KEY;
 			} else if (found_type == BTRFS_EXTENT_ITEM_KEY) {
-				found_type = BTRFS_CSUM_ITEM_KEY;
+				found_type = BTRFS_EXTENT_DATA_KEY;
 			} else if (found_type == BTRFS_EXTENT_DATA_KEY) {
 				found_type = BTRFS_XATTR_ITEM_KEY;
 			} else if (found_type == BTRFS_XATTR_ITEM_KEY) {
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index b4da53d55c82..6228b69c2b93 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -714,8 +714,7 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 	u64 len = olen;
 	u64 bs = root->fs_info->sb->s_blocksize;
 	u64 hint_byte;
-	u16 csum_size =
-		btrfs_super_csum_size(&root->fs_info->super_copy);
+
 	/*
 	 * TODO:
 	 * - split compressed inline extents.  annoying: we need to
@@ -833,7 +832,7 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 		slot = path->slots[0];
 
 		btrfs_item_key_to_cpu(leaf, &key, slot);
-		if (btrfs_key_type(&key) > BTRFS_CSUM_ITEM_KEY ||
+		if (btrfs_key_type(&key) > BTRFS_EXTENT_DATA_KEY ||
 		    key.objectid != src->i_ino)
 			break;
 
@@ -958,56 +957,6 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 			btrfs_mark_buffer_dirty(leaf);
 		}
 
-		if (btrfs_key_type(&key) == BTRFS_CSUM_ITEM_KEY) {
-			u32 size;
-			struct btrfs_key new_key;
-			u64 coverslen;
-			int coff, clen;
-
-			size = btrfs_item_size_nr(leaf, slot);
-			coverslen = (size / csum_size) <<
-				root->fs_info->sb->s_blocksize_bits;
-			printk("csums for %llu~%llu\n",
-			       key.offset, coverslen);
-			if (key.offset + coverslen < off ||
-			    key.offset >= off+len)
-				goto next;
-
-			read_extent_buffer(leaf, buf,
-					   btrfs_item_ptr_offset(leaf, slot),
-					   size);
-			btrfs_release_path(root, path);
-
-			coff = 0;
-			if (off > key.offset)
-				coff = ((off - key.offset) >>
-					root->fs_info->sb->s_blocksize_bits) *
-					csum_size;
-			clen = size - coff;
-			if (key.offset + coverslen > off+len)
-				clen -= ((key.offset+coverslen-off-len) >>
-					 root->fs_info->sb->s_blocksize_bits) *
-					csum_size;
-			printk(" will dup %d~%d of %d\n",
-			       coff, clen, size);
-
-			memcpy(&new_key, &key, sizeof(new_key));
-			new_key.objectid = inode->i_ino;
-			new_key.offset = key.offset + destoff - off;
-
-			ret = btrfs_insert_empty_item(trans, root, path,
-						      &new_key, clen);
-			if (ret)
-				goto out;
-
-			leaf = path->nodes[0];
-			slot = path->slots[0];
-			write_extent_buffer(leaf, buf + coff,
-					    btrfs_item_ptr_offset(leaf, slot),
-					    clen);
-			btrfs_mark_buffer_dirty(leaf);
-		}
-
 	next:
 		btrfs_release_path(root, path);
 		key.offset++;
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 027ad6b3839e..d9e232227da4 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -610,7 +610,8 @@ out:
  * try to find a checksum.  This is used because we allow pages to
  * be reclaimed before their checksum is actually put into the btree
  */
-int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u32 *sum)
+int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
+			   u32 *sum)
 {
 	struct btrfs_ordered_sum *ordered_sum;
 	struct btrfs_sector_sum *sector_sums;
@@ -629,11 +630,11 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u32 *sum)
 	mutex_lock(&tree->mutex);
 	list_for_each_prev(cur, &ordered->list) {
 		ordered_sum = list_entry(cur, struct btrfs_ordered_sum, list);
-		if (offset >= ordered_sum->file_offset) {
+		if (disk_bytenr >= ordered_sum->bytenr) {
 			num_sectors = ordered_sum->len / sectorsize;
 			sector_sums = ordered_sum->sums;
 			for (i = 0; i < num_sectors; i++) {
-				if (sector_sums[i].offset == offset) {
+				if (sector_sums[i].bytenr == disk_bytenr) {
 					*sum = sector_sums[i].sum;
 					ret = 0;
 					goto out;
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 260bf95dfe0c..ab66d5e8d6d6 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -33,15 +33,17 @@ struct btrfs_ordered_inode_tree {
  * the ordered extent are on disk
  */
 struct btrfs_sector_sum {
-	u64 offset;
+	/* bytenr on disk */
+	u64 bytenr;
 	u32 sum;
 };
 
 struct btrfs_ordered_sum {
-	u64 file_offset;
+	/* bytenr is the start of this extent on disk */
+	u64 bytenr;
+
 	/*
 	 * this is the length in bytes covered by the sums array below.
-	 * But, the sums array may not be contiguous in the file.
 	 */
 	unsigned long len;
 	struct list_head list;
@@ -147,7 +149,7 @@ struct btrfs_ordered_extent *
 btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
 int btrfs_ordered_update_i_size(struct inode *inode,
 				struct btrfs_ordered_extent *ordered);
-int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u32 *sum);
+int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum);
 int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
 				       pgoff_t start, pgoff_t end);
 int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index c766649ad453..08469ec05850 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -934,24 +934,17 @@ static noinline int replay_one_csum(struct btrfs_trans_handle *trans,
 	unsigned long file_bytes;
 	struct btrfs_ordered_sum *sums;
 	struct btrfs_sector_sum *sector_sum;
-	struct inode *inode;
 	unsigned long ptr;
 
 	file_bytes = (item_size / csum_size) * root->sectorsize;
-	inode = read_one_inode(root, key->objectid);
-	if (!inode) {
-		return -EIO;
-	}
-
 	sums = kzalloc(btrfs_ordered_sum_size(root, file_bytes), GFP_NOFS);
 	if (!sums) {
-		iput(inode);
 		return -ENOMEM;
 	}
 
 	INIT_LIST_HEAD(&sums->list);
 	sums->len = file_bytes;
-	sums->file_offset = key->offset;
+	sums->bytenr = key->offset;
 
 	/*
 	 * copy all the sums into the ordered sum struct
@@ -960,7 +953,7 @@ static noinline int replay_one_csum(struct btrfs_trans_handle *trans,
 	cur_offset = key->offset;
 	ptr = btrfs_item_ptr_offset(eb, slot);
 	while(item_size > 0) {
-		sector_sum->offset = cur_offset;
+		sector_sum->bytenr = cur_offset;
 		read_extent_buffer(eb, &sector_sum->sum, ptr, csum_size);
 		sector_sum++;
 		item_size -= csum_size;
@@ -969,11 +962,9 @@ static noinline int replay_one_csum(struct btrfs_trans_handle *trans,
 	}
 
 	/* let btrfs_csum_file_blocks add them into the file */
-	ret = btrfs_csum_file_blocks(trans, root, inode, sums);
+	ret = btrfs_csum_file_blocks(trans, root->fs_info->csum_root, sums);
 	BUG_ON(ret);
 	kfree(sums);
-	iput(inode);
-
 	return 0;
 }
 /*
@@ -1670,7 +1661,7 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
 			ret = replay_one_extent(wc->trans, root, path,
 						eb, i, &key);
 			BUG_ON(ret);
-		} else if (key.type == BTRFS_CSUM_ITEM_KEY) {
+		} else if (key.type == BTRFS_EXTENT_CSUM_KEY) {
 			ret = replay_one_csum(wc->trans, root, path,
 					      eb, i, &key);
 			BUG_ON(ret);
@@ -2466,6 +2457,85 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
+static noinline int copy_extent_csums(struct btrfs_trans_handle *trans,
+				      struct list_head *list,
+				      struct btrfs_root *root,
+				      u64 disk_bytenr, u64 len)
+{
+	struct btrfs_ordered_sum *sums;
+	struct btrfs_sector_sum *sector_sum;
+	int ret;
+	struct btrfs_path *path;
+	struct btrfs_csum_item *item = NULL;
+	u64 end = disk_bytenr + len;
+	u64 item_start_offset = 0;
+	u64 item_last_offset = 0;
+	u32 diff;
+	u32 sum;
+	u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy);
+
+	sums = kzalloc(btrfs_ordered_sum_size(root, len), GFP_NOFS);
+
+	sector_sum = sums->sums;
+	sums->bytenr = disk_bytenr;
+	sums->len = len;
+	list_add_tail(&sums->list, list);
+
+	path = btrfs_alloc_path();
+	while(disk_bytenr < end) {
+		if (!item || disk_bytenr < item_start_offset ||
+		    disk_bytenr >= item_last_offset) {
+			struct btrfs_key found_key;
+			u32 item_size;
+
+			if (item)
+				btrfs_release_path(root, path);
+			item = btrfs_lookup_csum(NULL, root, path,
+						 disk_bytenr, 0);
+			if (IS_ERR(item)) {
+				ret = PTR_ERR(item);
+				if (ret == -ENOENT || ret == -EFBIG)
+					ret = 0;
+				sum = 0;
+				printk("log no csum found for byte %llu\n",
+				       (unsigned long long)disk_bytenr);
+				item = NULL;
+				btrfs_release_path(root, path);
+				goto found;
+			}
+			btrfs_item_key_to_cpu(path->nodes[0], &found_key,
+					      path->slots[0]);
+
+			item_start_offset = found_key.offset;
+			item_size = btrfs_item_size_nr(path->nodes[0],
+						       path->slots[0]);
+			item_last_offset = item_start_offset +
+				(item_size / csum_size) *
+				root->sectorsize;
+			item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+					      struct btrfs_csum_item);
+		}
+		/*
+		 * this byte range must be able to fit inside
+		 * a single leaf so it will also fit inside a u32
+		 */
+		diff = disk_bytenr - item_start_offset;
+		diff = diff / root->sectorsize;
+		diff = diff * csum_size;
+
+		read_extent_buffer(path->nodes[0], &sum,
+				   ((unsigned long)item) + diff,
+				   csum_size);
+found:
+		sector_sum->bytenr = disk_bytenr;
+		sector_sum->sum = sum;
+		disk_bytenr += root->sectorsize;
+		sector_sum++;
+	}
+	btrfs_free_path(path);
+	return 0;
+}
+
 static noinline int copy_items(struct btrfs_trans_handle *trans,
 			       struct btrfs_root *log,
 			       struct btrfs_path *dst_path,
@@ -2481,6 +2551,9 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
 	u32 *ins_sizes;
 	char *ins_data;
 	int i;
+	struct list_head ordered_sums;
+
+	INIT_LIST_HEAD(&ordered_sums);
 
 	ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
 			   nr * sizeof(u32), GFP_NOFS);
@@ -2535,6 +2608,9 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
 								   extent);
 				u64 dl = btrfs_file_extent_disk_num_bytes(src,
 								      extent);
+				u64 cs = btrfs_file_extent_offset(src, extent);
+				u64 cl = btrfs_file_extent_num_bytes(src,
+								     extent);;
 				/* ds == 0 is a hole */
 				if (ds != 0) {
 					ret = btrfs_inc_extent_ref(trans, log,
@@ -2544,6 +2620,11 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
 						   trans->transid,
 						   ins_keys[i].objectid);
 					BUG_ON(ret);
+					ret = copy_extent_csums(trans,
+						&ordered_sums,
+						log->fs_info->csum_root,
+						ds + cs, cl);
+					BUG_ON(ret);
 				}
 			}
 		}
@@ -2553,6 +2634,20 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
 	btrfs_mark_buffer_dirty(dst_path->nodes[0]);
 	btrfs_release_path(log, dst_path);
 	kfree(ins_data);
+
+	/*
+	 * we have to do this after the loop above to avoid changing the
+	 * log tree while trying to change the log tree.
+	 */
+	while(!list_empty(&ordered_sums)) {
+		struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
+						   struct btrfs_ordered_sum,
+						   list);
+		ret = btrfs_csum_file_blocks(trans, log, sums);
+		BUG_ON(ret);
+		list_del(&sums->list);
+		kfree(sums);
+	}
 	return 0;
 }
 
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 6c523b3360f6..2049d179ccd5 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2771,6 +2771,7 @@ static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
 	device->work.func = pending_bios_fn;
 	fs_devices->num_devices++;
 	spin_lock_init(&device->io_lock);
+	INIT_LIST_HEAD(&device->dev_alloc_list);
 	memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE);
 	return device;
 }
-- 
cgit v1.2.3


From a512bbf855ff0af474257475f2e6da7acd854f52 Mon Sep 17 00:00:00 2001
From: Yan Zheng <zheng.yan@oracle.com>
Date: Mon, 8 Dec 2008 16:46:26 -0500
Subject: Btrfs: superblock duplication

This patch implements superblock duplication. Superblocks
are stored at offsets 16K, 64M and 256G on every device.
The space used by superblocks is preserved by the allocator,
which uses a reverse mapping function to find the logical
addresses that correspond to superblocks. Thank you,

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
---
 fs/btrfs/disk-io.c          | 208 ++++++++++++++++++++++++++++++--------------
 fs/btrfs/disk-io.h          |  17 +++-
 fs/btrfs/extent-tree.c      |  54 ++++++------
 fs/btrfs/free-space-cache.c |   1 -
 fs/btrfs/transaction.c      |   2 +-
 fs/btrfs/tree-log.c         |   3 +-
 fs/btrfs/volumes.c          | 107 ++++++++++++++++++-----
 fs/btrfs/volumes.h          |   6 +-
 8 files changed, 279 insertions(+), 119 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 61dc3b2c834b..c72f4f3b912c 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1595,8 +1595,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 		     fs_info, BTRFS_ROOT_TREE_OBJECTID);
 
 
-	bh = __bread(fs_devices->latest_bdev,
-		     BTRFS_SUPER_INFO_OFFSET / 4096, 4096);
+	bh = btrfs_read_dev_super(fs_devices->latest_bdev);
 	if (!bh)
 		goto fail_iput;
 
@@ -1710,7 +1709,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	}
 
 	mutex_lock(&fs_info->chunk_mutex);
-	ret = btrfs_read_sys_array(tree_root);
+	ret = btrfs_read_sys_array(tree_root, btrfs_super_bytenr(disk_super));
 	mutex_unlock(&fs_info->chunk_mutex);
 	if (ret) {
 		printk("btrfs: failed to read the system array on %s\n",
@@ -1905,19 +1904,147 @@ static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
 	put_bh(bh);
 }
 
-static int write_all_supers(struct btrfs_root *root)
+struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
+{
+	struct buffer_head *bh;
+	struct buffer_head *latest = NULL;
+	struct btrfs_super_block *super;
+	int i;
+	u64 transid = 0;
+	u64 bytenr;
+
+	/* we would like to check all the supers, but that would make
+	 * a btrfs mount succeed after a mkfs from a different FS.
+	 * So, we need to add a special mount option to scan for
+	 * later supers, using BTRFS_SUPER_MIRROR_MAX instead
+	 */
+	for (i = 0; i < 1; i++) {
+		bytenr = btrfs_sb_offset(i);
+		if (bytenr + 4096 >= i_size_read(bdev->bd_inode))
+			break;
+		bh = __bread(bdev, bytenr / 4096, 4096);
+		if (!bh)
+			continue;
+
+		super = (struct btrfs_super_block *)bh->b_data;
+		if (btrfs_super_bytenr(super) != bytenr ||
+		    strncmp((char *)(&super->magic), BTRFS_MAGIC,
+			    sizeof(super->magic))) {
+			brelse(bh);
+			continue;
+		}
+
+		if (!latest || btrfs_super_generation(super) > transid) {
+			brelse(latest);
+			latest = bh;
+			transid = btrfs_super_generation(super);
+		} else {
+			brelse(bh);
+		}
+	}
+	return latest;
+}
+
+static int write_dev_supers(struct btrfs_device *device,
+			    struct btrfs_super_block *sb,
+			    int do_barriers, int wait, int max_mirrors)
+{
+	struct buffer_head *bh;
+	int i;
+	int ret;
+	int errors = 0;
+	u32 crc;
+	u64 bytenr;
+	int last_barrier = 0;
+
+	if (max_mirrors == 0)
+		max_mirrors = BTRFS_SUPER_MIRROR_MAX;
+
+	/* make sure only the last submit_bh does a barrier */
+	if (do_barriers) {
+		for (i = 0; i < max_mirrors; i++) {
+			bytenr = btrfs_sb_offset(i);
+			if (bytenr + BTRFS_SUPER_INFO_SIZE >=
+			    device->total_bytes)
+				break;
+			last_barrier = i;
+		}
+	}
+
+	for (i = 0; i < max_mirrors; i++) {
+		bytenr = btrfs_sb_offset(i);
+		if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes)
+			break;
+
+		if (wait) {
+			bh = __find_get_block(device->bdev, bytenr / 4096,
+					      BTRFS_SUPER_INFO_SIZE);
+			BUG_ON(!bh);
+			brelse(bh);
+			wait_on_buffer(bh);
+			if (buffer_uptodate(bh)) {
+				brelse(bh);
+				continue;
+			}
+		} else {
+			btrfs_set_super_bytenr(sb, bytenr);
+
+			crc = ~(u32)0;
+			crc = btrfs_csum_data(NULL, (char *)sb +
+					      BTRFS_CSUM_SIZE, crc,
+					      BTRFS_SUPER_INFO_SIZE -
+					      BTRFS_CSUM_SIZE);
+			btrfs_csum_final(crc, sb->csum);
+
+			bh = __getblk(device->bdev, bytenr / 4096,
+				      BTRFS_SUPER_INFO_SIZE);
+			memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE);
+
+			set_buffer_uptodate(bh);
+			get_bh(bh);
+			lock_buffer(bh);
+			bh->b_end_io = btrfs_end_buffer_write_sync;
+		}
+
+		if (i == last_barrier && do_barriers && device->barriers) {
+			ret = submit_bh(WRITE_BARRIER, bh);
+			if (ret == -EOPNOTSUPP) {
+				printk("btrfs: disabling barriers on dev %s\n",
+				       device->name);
+				set_buffer_uptodate(bh);
+				device->barriers = 0;
+				get_bh(bh);
+				lock_buffer(bh);
+				ret = submit_bh(WRITE, bh);
+			}
+		} else {
+			ret = submit_bh(WRITE, bh);
+		}
+
+		if (!ret && wait) {
+			wait_on_buffer(bh);
+			if (!buffer_uptodate(bh))
+				errors++;
+		} else if (ret) {
+			errors++;
+		}
+		if (wait)
+			brelse(bh);
+	}
+	return errors < i ? 0 : -1;
+}
+
+int write_all_supers(struct btrfs_root *root, int max_mirrors)
 {
 	struct list_head *cur;
 	struct list_head *head = &root->fs_info->fs_devices->devices;
 	struct btrfs_device *dev;
 	struct btrfs_super_block *sb;
 	struct btrfs_dev_item *dev_item;
-	struct buffer_head *bh;
 	int ret;
 	int do_barriers;
 	int max_errors;
 	int total_errors = 0;
-	u32 crc;
 	u64 flags;
 
 	max_errors = btrfs_super_num_devices(&root->fs_info->super_copy) - 1;
@@ -1944,40 +2071,11 @@ static int write_all_supers(struct btrfs_root *root)
 		btrfs_set_stack_device_sector_size(dev_item, dev->sector_size);
 		memcpy(dev_item->uuid, dev->uuid, BTRFS_UUID_SIZE);
 		memcpy(dev_item->fsid, dev->fs_devices->fsid, BTRFS_UUID_SIZE);
+
 		flags = btrfs_super_flags(sb);
 		btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN);
 
-
-		crc = ~(u32)0;
-		crc = btrfs_csum_data(root, (char *)sb + BTRFS_CSUM_SIZE, crc,
-				      BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE);
-		btrfs_csum_final(crc, sb->csum);
-
-		bh = __getblk(dev->bdev, BTRFS_SUPER_INFO_OFFSET / 4096,
-			      BTRFS_SUPER_INFO_SIZE);
-
-		memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE);
-		dev->pending_io = bh;
-
-		get_bh(bh);
-		set_buffer_uptodate(bh);
-		lock_buffer(bh);
-		bh->b_end_io = btrfs_end_buffer_write_sync;
-
-		if (do_barriers && dev->barriers) {
-			ret = submit_bh(WRITE_BARRIER, bh);
-			if (ret == -EOPNOTSUPP) {
-				printk("btrfs: disabling barriers on dev %s\n",
-				       dev->name);
-				set_buffer_uptodate(bh);
-				dev->barriers = 0;
-				get_bh(bh);
-				lock_buffer(bh);
-				ret = submit_bh(WRITE, bh);
-			}
-		} else {
-			ret = submit_bh(WRITE, bh);
-		}
+		ret = write_dev_supers(dev, sb, do_barriers, 0, max_mirrors);
 		if (ret)
 			total_errors++;
 	}
@@ -1985,8 +2083,8 @@ static int write_all_supers(struct btrfs_root *root)
 		printk("btrfs: %d errors while writing supers\n", total_errors);
 		BUG();
 	}
-	total_errors = 0;
 
+	total_errors = 0;
 	list_for_each(cur, head) {
 		dev = list_entry(cur, struct btrfs_device, dev_list);
 		if (!dev->bdev)
@@ -1994,29 +2092,9 @@ static int write_all_supers(struct btrfs_root *root)
 		if (!dev->in_fs_metadata || !dev->writeable)
 			continue;
 
-		BUG_ON(!dev->pending_io);
-		bh = dev->pending_io;
-		wait_on_buffer(bh);
-		if (!buffer_uptodate(dev->pending_io)) {
-			if (do_barriers && dev->barriers) {
-				printk("btrfs: disabling barriers on dev %s\n",
-				       dev->name);
-				set_buffer_uptodate(bh);
-				get_bh(bh);
-				lock_buffer(bh);
-				dev->barriers = 0;
-				ret = submit_bh(WRITE, bh);
-				BUG_ON(ret);
-				wait_on_buffer(bh);
-				if (!buffer_uptodate(bh))
-					total_errors++;
-			} else {
-				total_errors++;
-			}
-
-		}
-		dev->pending_io = NULL;
-		brelse(bh);
+		ret = write_dev_supers(dev, sb, do_barriers, 1, max_mirrors);
+		if (ret)
+			total_errors++;
 	}
 	if (total_errors > max_errors) {
 		printk("btrfs: %d errors while writing supers\n", total_errors);
@@ -2025,12 +2103,12 @@ static int write_all_supers(struct btrfs_root *root)
 	return 0;
 }
 
-int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root
-		      *root)
+int write_ctree_super(struct btrfs_trans_handle *trans,
+		      struct btrfs_root *root, int max_mirrors)
 {
 	int ret;
 
-	ret = write_all_supers(root);
+	ret = write_all_supers(root, max_mirrors);
 	return ret;
 }
 
@@ -2116,7 +2194,7 @@ int btrfs_commit_super(struct btrfs_root *root)
 	ret = btrfs_write_and_wait_transaction(NULL, root);
 	BUG_ON(ret);
 
-	ret = write_ctree_super(NULL, root);
+	ret = write_ctree_super(NULL, root, 0);
 	return ret;
 }
 
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 717e94811e4e..c0ff404c31b7 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -19,8 +19,20 @@
 #ifndef __DISKIO__
 #define __DISKIO__
 
-#define BTRFS_SUPER_INFO_OFFSET (16 * 1024)
+#define BTRFS_SUPER_INFO_OFFSET (64 * 1024)
 #define BTRFS_SUPER_INFO_SIZE 4096
+
+#define BTRFS_SUPER_MIRROR_MAX	 3
+#define BTRFS_SUPER_MIRROR_SHIFT 12
+
+static inline u64 btrfs_sb_offset(int mirror)
+{
+	u64 start = 16 * 1024;
+	if (mirror)
+		return start << (BTRFS_SUPER_MIRROR_SHIFT * mirror);
+	return BTRFS_SUPER_INFO_OFFSET;
+}
+
 struct btrfs_device;
 struct btrfs_fs_devices;
 
@@ -37,7 +49,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 			      char *options);
 int close_ctree(struct btrfs_root *root);
 int write_ctree_super(struct btrfs_trans_handle *trans,
-		      struct btrfs_root *root);
+		      struct btrfs_root *root, int max_mirrors);
+struct buffer_head *btrfs_read_dev_super(struct block_device *bdev);
 int btrfs_commit_super(struct btrfs_root *root);
 struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
 					    u64 bytenr, u32 blocksize);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index d15638529389..803647bc8400 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -189,6 +189,29 @@ static int add_new_free_space(struct btrfs_block_group_cache *block_group,
 	return 0;
 }
 
+static int remove_sb_from_cache(struct btrfs_root *root,
+				struct btrfs_block_group_cache *cache)
+{
+	u64 bytenr;
+	u64 *logical;
+	int stripe_len;
+	int i, nr, ret;
+
+	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
+		bytenr = btrfs_sb_offset(i);
+		ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
+				       cache->key.objectid, bytenr, 0,
+				       &logical, &nr, &stripe_len);
+		BUG_ON(ret);
+		while (nr--) {
+			btrfs_remove_free_space(cache, logical[nr],
+						stripe_len);
+		}
+		kfree(logical);
+	}
+	return 0;
+}
+
 static int cache_block_group(struct btrfs_root *root,
 			     struct btrfs_block_group_cache *block_group)
 {
@@ -197,9 +220,7 @@ static int cache_block_group(struct btrfs_root *root,
 	struct btrfs_key key;
 	struct extent_buffer *leaf;
 	int slot;
-	u64 last = 0;
-	u64 first_free;
-	int found = 0;
+	u64 last = block_group->key.objectid;
 
 	if (!block_group)
 		return 0;
@@ -220,23 +241,13 @@ static int cache_block_group(struct btrfs_root *root,
 	 * skip the locking here
 	 */
 	path->skip_locking = 1;
-	first_free = max_t(u64, block_group->key.objectid,
-			   BTRFS_SUPER_INFO_OFFSET + BTRFS_SUPER_INFO_SIZE);
-	key.objectid = block_group->key.objectid;
+	key.objectid = last;
 	key.offset = 0;
 	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 	if (ret < 0)
 		goto err;
-	ret = btrfs_previous_item(root, path, 0, BTRFS_EXTENT_ITEM_KEY);
-	if (ret < 0)
-		goto err;
-	if (ret == 0) {
-		leaf = path->nodes[0];
-		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
-		if (key.objectid + key.offset > first_free)
-			first_free = key.objectid + key.offset;
-	}
+
 	while(1) {
 		leaf = path->nodes[0];
 		slot = path->slots[0];
@@ -258,11 +269,6 @@ static int cache_block_group(struct btrfs_root *root,
 			break;
 
 		if (btrfs_key_type(&key) == BTRFS_EXTENT_ITEM_KEY) {
-			if (!found) {
-				last = first_free;
-				found = 1;
-			}
-
 			add_new_free_space(block_group, root->fs_info, last,
 					   key.objectid);
 
@@ -272,13 +278,11 @@ next:
 		path->slots[0]++;
 	}
 
-	if (!found)
-		last = first_free;
-
 	add_new_free_space(block_group, root->fs_info, last,
 			   block_group->key.objectid +
 			   block_group->key.offset);
 
+	remove_sb_from_cache(root, block_group);
 	block_group->cached = 1;
 	ret = 0;
 err:
@@ -1974,10 +1978,8 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 		if (alloc) {
 			old_val += num_bytes;
 			cache->space_info->bytes_used += num_bytes;
-			if (cache->ro) {
+			if (cache->ro)
 				cache->space_info->bytes_readonly -= num_bytes;
-				WARN_ON(1);
-			}
 			btrfs_set_block_group_used(&cache->item, old_val);
 			spin_unlock(&cache->lock);
 			spin_unlock(&cache->space_info->lock);
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 09462adfbe33..2e69b9c30437 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -290,7 +290,6 @@ __btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
 			ret = -EINVAL;
 			goto out;
 		}
-
 		unlink_free_space(block_group, info);
 
 		if (info->bytes == bytes) {
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index c38f6a0e30b1..47cd5fcad2c8 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1038,7 +1038,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	mutex_unlock(&root->fs_info->trans_mutex);
 	ret = btrfs_write_and_wait_transaction(trans, root);
 	BUG_ON(ret);
-	write_ctree_super(trans, root);
+	write_ctree_super(trans, root, 0);
 
 	/*
 	 * the super is written, we can safely allow the tree-loggers
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 08469ec05850..d3f9c2c663c4 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1996,7 +1996,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	btrfs_set_super_log_root_level(&root->fs_info->super_for_commit,
 		       btrfs_header_level(log->fs_info->log_root_tree->node));
 
-	write_ctree_super(trans, log->fs_info->tree_root);
+	write_ctree_super(trans, log->fs_info->tree_root, 2);
 	log->fs_info->tree_log_transid++;
 	log->fs_info->tree_log_batch = 0;
 	atomic_set(&log->fs_info->tree_log_commit, 0);
@@ -2006,7 +2006,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 out:
 	mutex_unlock(&log->fs_info->tree_log_mutex);
 	return 0;
-
 }
 
 /* * free all the extents used by the tree log.  This should be called
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 2049d179ccd5..a79b3cc09e94 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -423,15 +423,11 @@ int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 		}
 		set_blocksize(bdev, 4096);
 
-		bh = __bread(bdev, BTRFS_SUPER_INFO_OFFSET / 4096, 4096);
+		bh = btrfs_read_dev_super(bdev);
 		if (!bh)
 			goto error_close;
 
 		disk_super = (struct btrfs_super_block *)bh->b_data;
-		if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC,
-		    sizeof(disk_super->magic)))
-			goto error_brelse;
-
 		devid = le64_to_cpu(disk_super->dev_item.devid);
 		if (devid != device->devid)
 			goto error_brelse;
@@ -529,17 +525,12 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
 	ret = set_blocksize(bdev, 4096);
 	if (ret)
 		goto error_close;
-	bh = __bread(bdev, BTRFS_SUPER_INFO_OFFSET / 4096, 4096);
+	bh = btrfs_read_dev_super(bdev);
 	if (!bh) {
 		ret = -EIO;
 		goto error_close;
 	}
 	disk_super = (struct btrfs_super_block *)bh->b_data;
-	if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC,
-	    sizeof(disk_super->magic))) {
-		ret = -EINVAL;
-		goto error_brelse;
-	}
 	devid = le64_to_cpu(disk_super->dev_item.devid);
 	transid = btrfs_super_generation(disk_super);
 	if (disk_super->label[0])
@@ -553,7 +544,6 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
 	printk("devid %Lu transid %Lu %s\n", devid, transid, path);
 	ret = device_list_add(path, disk_super, devid, fs_devices_ret);
 
-error_brelse:
 	brelse(bh);
 error_close:
 	close_bdev_exclusive(bdev, flags);
@@ -1016,17 +1006,12 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 		}
 
 		set_blocksize(bdev, 4096);
-		bh = __bread(bdev, BTRFS_SUPER_INFO_OFFSET / 4096, 4096);
+		bh = btrfs_read_dev_super(bdev);
 		if (!bh) {
 			ret = -EIO;
 			goto error_close;
 		}
 		disk_super = (struct btrfs_super_block *)bh->b_data;
-		if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC,
-			    sizeof(disk_super->magic))) {
-			ret = -ENOENT;
-			goto error_brelse;
-		}
 		devid = le64_to_cpu(disk_super->dev_item.devid);
 		dev_uuid = disk_super->dev_item.uuid;
 		device = btrfs_find_device(root, devid, dev_uuid,
@@ -2563,6 +2548,88 @@ int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 				 mirror_num, NULL);
 }
 
+int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
+		     u64 chunk_start, u64 physical, u64 devid,
+		     u64 **logical, int *naddrs, int *stripe_len)
+{
+	struct extent_map_tree *em_tree = &map_tree->map_tree;
+	struct extent_map *em;
+	struct map_lookup *map;
+	u64 *buf;
+	u64 bytenr;
+	u64 length;
+	u64 stripe_nr;
+	int i, j, nr = 0;
+
+	spin_lock(&em_tree->lock);
+	em = lookup_extent_mapping(em_tree, chunk_start, 1);
+	spin_unlock(&em_tree->lock);
+
+	BUG_ON(!em || em->start != chunk_start);
+	map = (struct map_lookup *)em->bdev;
+
+	length = em->len;
+	if (map->type & BTRFS_BLOCK_GROUP_RAID10)
+		do_div(length, map->num_stripes / map->sub_stripes);
+	else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
+		do_div(length, map->num_stripes);
+
+	buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS);
+	BUG_ON(!buf);
+
+	for (i = 0; i < map->num_stripes; i++) {
+		if (devid && map->stripes[i].dev->devid != devid)
+			continue;
+		if (map->stripes[i].physical > physical ||
+		    map->stripes[i].physical + length <= physical)
+			continue;
+
+		stripe_nr = physical - map->stripes[i].physical;
+		do_div(stripe_nr, map->stripe_len);
+
+		if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
+			stripe_nr = stripe_nr * map->num_stripes + i;
+			do_div(stripe_nr, map->sub_stripes);
+		} else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
+			stripe_nr = stripe_nr * map->num_stripes + i;
+		}
+		bytenr = chunk_start + stripe_nr * map->stripe_len;
+		for (j = 0; j < nr; j++) {
+			if (buf[j] == bytenr)
+				break;
+		}
+		if (j == nr)
+			buf[nr++] = bytenr;
+	}
+
+	for (i = 0; i > nr; i++) {
+		struct btrfs_multi_bio *multi;
+		struct btrfs_bio_stripe *stripe;
+		int ret;
+
+		length = 1;
+		ret = btrfs_map_block(map_tree, WRITE, buf[i],
+				      &length, &multi, 0);
+		BUG_ON(ret);
+
+		stripe = multi->stripes;
+		for (j = 0; j < multi->num_stripes; j++) {
+			if (stripe->physical >= physical &&
+			    physical < stripe->physical + length)
+				break;
+		}
+		BUG_ON(j >= multi->num_stripes);
+		kfree(multi);
+	}
+
+	*logical = buf;
+	*naddrs = nr;
+	*stripe_len = map->stripe_len;
+
+	free_extent_map(em);
+	return 0;
+}
+
 int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree,
 		      u64 logical, struct page *page)
 {
@@ -3003,7 +3070,7 @@ int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf)
 	return read_one_dev(root, buf, dev_item);
 }
 
-int btrfs_read_sys_array(struct btrfs_root *root)
+int btrfs_read_sys_array(struct btrfs_root *root, u64 sb_bytenr)
 {
 	struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
 	struct extent_buffer *sb;
@@ -3018,7 +3085,7 @@ int btrfs_read_sys_array(struct btrfs_root *root)
 	u32 cur;
 	struct btrfs_key key;
 
-	sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET,
+	sb = btrfs_find_create_tree_block(root, sb_bytenr,
 					  BTRFS_SUPER_INFO_SIZE);
 	if (!sb)
 		return -ENOMEM;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index fcbdcb3ae13e..bdebe83c3195 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -28,7 +28,6 @@ struct btrfs_device {
 	struct list_head dev_alloc_list;
 	struct btrfs_fs_devices *fs_devices;
 	struct btrfs_root *dev_root;
-	struct buffer_head *pending_io;
 	struct bio *pending_bios;
 	struct bio *pending_bio_tail;
 	int running_pending;
@@ -125,7 +124,10 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
 int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 		    u64 logical, u64 *length,
 		    struct btrfs_multi_bio **multi_ret, int mirror_num);
-int btrfs_read_sys_array(struct btrfs_root *root);
+int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
+		     u64 chunk_start, u64 physical, u64 devid,
+		     u64 **logical, int *naddrs, int *stripe_len);
+int btrfs_read_sys_array(struct btrfs_root *root, u64 sb_bytenr);
 int btrfs_read_chunk_tree(struct btrfs_root *root);
 int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 		      struct btrfs_root *extent_root, u64 type);
-- 
cgit v1.2.3


From 934d375bacf9ea8a37fbfff5f3cf1c093f324095 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 8 Dec 2008 16:43:10 -0500
Subject: Btrfs: Use map_private_extent_buffer during generic_bin_search

It is possible that generic_bin_search will be called on a tree block
that has not been locked.  This happens because cache_block_group skips
locking on the tree blocks.

Since the tree block isn't locked, we aren't allowed to change
the extent_buffer->map_token field.  Using map_private_extent_buffer
avoids any changes to the internal extent buffer fields.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c     | 5 ++++-
 fs/btrfs/extent_io.c | 1 +
 fs/btrfs/volumes.c   | 5 ++++-
 3 files changed, 9 insertions(+), 2 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index a83cbdf1d8c4..19c0dd33b1e8 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -813,7 +813,8 @@ static noinline int generic_bin_search(struct extent_buffer *eb,
 				unmap_extent_buffer(eb, map_token, KM_USER0);
 				map_token = NULL;
 			}
-			err = map_extent_buffer(eb, offset,
+
+			err = map_private_extent_buffer(eb, offset,
 						sizeof(struct btrfs_disk_key),
 						&map_token, &kaddr,
 						&map_start, &map_len, KM_USER0);
@@ -3585,6 +3586,7 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
 	int level;
 	int ret = 1;
 
+	WARN_ON(!path->keep_locks);
 again:
 	cur = btrfs_lock_root_node(root);
 	level = btrfs_header_level(cur);
@@ -3708,6 +3710,7 @@ int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
 	int slot;
 	struct extent_buffer *c;
 
+	WARN_ON(!path->keep_locks);
 	while(level < BTRFS_MAX_LEVEL) {
 		if (!path->nodes[level])
 			return 1;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 7449ecf32c50..607f5ff2791c 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3471,6 +3471,7 @@ int map_extent_buffer(struct extent_buffer *eb, unsigned long start,
 		unmap_extent_buffer(eb, eb->map_token, km);
 		eb->map_token = NULL;
 		save = 1;
+		WARN_ON(!mutex_is_locked(&eb->mutex));
 	}
 	err = map_private_extent_buffer(eb, start, min_len, token, map,
 				       map_start, map_len, km);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index a79b3cc09e94..825364fae690 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2594,12 +2594,15 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
 			stripe_nr = stripe_nr * map->num_stripes + i;
 		}
 		bytenr = chunk_start + stripe_nr * map->stripe_len;
+		WARN_ON(nr >= map->num_stripes);
 		for (j = 0; j < nr; j++) {
 			if (buf[j] == bytenr)
 				break;
 		}
-		if (j == nr)
+		if (j == nr) {
+			WARN_ON(nr >= map->num_stripes);
 			buf[nr++] = bytenr;
+		}
 	}
 
 	for (i = 0; i > nr; i++) {
-- 
cgit v1.2.3


From c3027eb5523d6983f12628f3fe13d8a7576db701 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 8 Dec 2008 16:40:21 -0500
Subject: Btrfs: Add inode sequence number for NFS and reserved space in a few
 structs

This adds a sequence number to the btrfs inode that is increased on
every update.  NFS will be able to use that to detect when an inode has
changed, without relying on inaccurate time fields.

While we're here, this also:

Puts reserved space into the super block and inode

Adds a log root transid to the super so we can pick the newest super
based on the fsync log as well as the main transaction ID.  For now
the log root transid is always zero, but that'll get fixed.

Adds a starting offset to the dev_item.  This will let us do better
alignment calculations if we know the start of a partition on the disk.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/btrfs_inode.h |  6 +++---
 fs/btrfs/ctree.h       | 26 ++++++++++++++++++++++++++
 fs/btrfs/file.c        |  1 +
 fs/btrfs/inode.c       |  4 +++-
 fs/btrfs/volumes.c     |  1 +
 5 files changed, 34 insertions(+), 4 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 0b2e623cf421..1b9ec1ab1f68 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -49,9 +49,6 @@ struct btrfs_inode {
 	 */
 	struct extent_io_tree io_failure_tree;
 
-	/* held while inserting checksums to avoid races */
-	struct mutex csum_mutex;
-
 	/* held while inesrting or deleting extents from files */
 	struct mutex extent_mutex;
 
@@ -79,6 +76,9 @@ struct btrfs_inode {
 	 */
 	u64 generation;
 
+	/* sequence number for NFS changes */
+	u64 sequence;
+
 	/*
 	 * transid of the trans_handle that last modified this inode
 	 */
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 242b961ae6de..f72b43819349 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -196,6 +196,12 @@ struct btrfs_dev_item {
 	/* expected generation for this device */
 	__le64 generation;
 
+	/*
+	 * starting byte of this partition on the device,
+	 * to allowr for stripe alignment in the future
+	 */
+	__le64 start_offset;
+
 	/* grouping information for allocation decisions */
 	__le32 dev_group;
 
@@ -311,6 +317,9 @@ struct btrfs_super_block {
 	__le64 root;
 	__le64 chunk_root;
 	__le64 log_root;
+
+	/* this will help find the new super based on the log root */
+	__le64 log_root_transid;
 	__le64 total_bytes;
 	__le64 bytes_used;
 	__le64 root_dir_objectid;
@@ -329,7 +338,11 @@ struct btrfs_super_block {
 	u8 chunk_root_level;
 	u8 log_root_level;
 	struct btrfs_dev_item dev_item;
+
 	char label[BTRFS_LABEL_SIZE];
+
+	/* future expansion */
+	__le64 reserved[32];
 	u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
 } __attribute__ ((__packed__));
 
@@ -463,6 +476,14 @@ struct btrfs_inode_item {
 	__le64 rdev;
 	__le64 flags;
 
+	/* modification sequence number for NFS */
+	__le64 sequence;
+
+	/*
+	 * a little future expansion, for more than this we can
+	 * just grow the inode item and version it
+	 */
+	__le64 reserved[4];
 	struct btrfs_timespec atime;
 	struct btrfs_timespec ctime;
 	struct btrfs_timespec mtime;
@@ -1001,6 +1022,8 @@ BTRFS_SETGET_FUNCS(device_total_bytes, struct btrfs_dev_item, total_bytes, 64);
 BTRFS_SETGET_FUNCS(device_bytes_used, struct btrfs_dev_item, bytes_used, 64);
 BTRFS_SETGET_FUNCS(device_io_align, struct btrfs_dev_item, io_align, 32);
 BTRFS_SETGET_FUNCS(device_io_width, struct btrfs_dev_item, io_width, 32);
+BTRFS_SETGET_FUNCS(device_start_offset, struct btrfs_dev_item,
+		   start_offset, 64);
 BTRFS_SETGET_FUNCS(device_sector_size, struct btrfs_dev_item, sector_size, 32);
 BTRFS_SETGET_FUNCS(device_id, struct btrfs_dev_item, devid, 64);
 BTRFS_SETGET_FUNCS(device_group, struct btrfs_dev_item, dev_group, 32);
@@ -1135,6 +1158,7 @@ BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64);
 
 /* struct btrfs_inode_item */
 BTRFS_SETGET_FUNCS(inode_generation, struct btrfs_inode_item, generation, 64);
+BTRFS_SETGET_FUNCS(inode_sequence, struct btrfs_inode_item, sequence, 64);
 BTRFS_SETGET_FUNCS(inode_transid, struct btrfs_inode_item, transid, 64);
 BTRFS_SETGET_FUNCS(inode_size, struct btrfs_inode_item, size, 64);
 BTRFS_SETGET_FUNCS(inode_nbytes, struct btrfs_inode_item, nbytes, 64);
@@ -1519,6 +1543,8 @@ BTRFS_SETGET_STACK_FUNCS(super_chunk_root_level, struct btrfs_super_block,
 			 chunk_root_level, 8);
 BTRFS_SETGET_STACK_FUNCS(super_log_root, struct btrfs_super_block,
 			 log_root, 64);
+BTRFS_SETGET_STACK_FUNCS(super_log_root_transid, struct btrfs_super_block,
+			 log_root_transid, 64);
 BTRFS_SETGET_STACK_FUNCS(super_log_root_level, struct btrfs_super_block,
 			 log_root_level, 8);
 BTRFS_SETGET_STACK_FUNCS(super_total_bytes, struct btrfs_super_block,
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 1c9243560eab..b5a6a2b6f668 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1055,6 +1055,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 	pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
 
 	mutex_lock(&inode->i_mutex);
+	BTRFS_I(inode)->sequence++;
 	first_index = pos >> PAGE_CACHE_SHIFT;
 	last_index = (pos + count) >> PAGE_CACHE_SHIFT;
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index c03d847b8c4e..932d8c0b2c05 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1963,6 +1963,7 @@ void btrfs_read_locked_inode(struct inode *inode)
 
 	inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
 	BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
+	BTRFS_I(inode)->sequence = btrfs_inode_sequence(leaf, inode_item);
 	inode->i_generation = BTRFS_I(inode)->generation;
 	inode->i_rdev = 0;
 	rdev = btrfs_inode_rdev(leaf, inode_item);
@@ -2043,6 +2044,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
 
 	btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode));
 	btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation);
+	btrfs_set_inode_sequence(leaf, item, BTRFS_I(inode)->sequence);
 	btrfs_set_inode_transid(leaf, item, trans->transid);
 	btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
 	btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags);
@@ -2945,6 +2947,7 @@ static noinline void init_btrfs_i(struct inode *inode)
 	bi->i_default_acl = NULL;
 
 	bi->generation = 0;
+	bi->sequence = 0;
 	bi->last_trans = 0;
 	bi->logged_trans = 0;
 	bi->delalloc_bytes = 0;
@@ -2959,7 +2962,6 @@ static noinline void init_btrfs_i(struct inode *inode)
 			     inode->i_mapping, GFP_NOFS);
 	INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes);
 	btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
-	mutex_init(&BTRFS_I(inode)->csum_mutex);
 	mutex_init(&BTRFS_I(inode)->extent_mutex);
 	mutex_init(&BTRFS_I(inode)->log_mutex);
 }
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 825364fae690..4d210a731d40 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -890,6 +890,7 @@ int btrfs_add_device(struct btrfs_trans_handle *trans,
 	btrfs_set_device_group(leaf, dev_item, 0);
 	btrfs_set_device_seek_speed(leaf, dev_item, 0);
 	btrfs_set_device_bandwidth(leaf, dev_item, 0);
+	btrfs_set_device_start_offset(leaf, dev_item, 0);
 
 	ptr = (unsigned long)btrfs_device_uuid(dev_item);
 	write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
-- 
cgit v1.2.3


From e4404d6e8da678d852b7f767f665f8edf76c9e9f Mon Sep 17 00:00:00 2001
From: Yan Zheng <zheng.yan@oracle.com>
Date: Fri, 12 Dec 2008 10:03:26 -0500
Subject: Btrfs: shared seed device

This patch makes it possible for a seed device to be shared
by multiple mounted file systems. The sharing is achieved
by cloning the seed device's btrfs_fs_devices structure.
Thank you,

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
---
 fs/btrfs/disk-io.c     |   2 +-
 fs/btrfs/extent-tree.c |  37 ++++----
 fs/btrfs/super.c       |   8 +-
 fs/btrfs/volumes.c     | 240 +++++++++++++++++++++++++++----------------------
 fs/btrfs/volumes.h     |   3 +-
 5 files changed, 156 insertions(+), 134 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 6f042de1ac43..541a8279ac71 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1711,7 +1711,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	}
 
 	mutex_lock(&fs_info->chunk_mutex);
-	ret = btrfs_read_sys_array(tree_root, btrfs_super_bytenr(disk_super));
+	ret = btrfs_read_sys_array(tree_root);
 	mutex_unlock(&fs_info->chunk_mutex);
 	if (ret) {
 		printk("btrfs: failed to read the system array on %s\n",
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 1cc89246ee2f..171057a32679 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -218,7 +218,7 @@ static int cache_block_group(struct btrfs_root *root,
 	struct btrfs_key key;
 	struct extent_buffer *leaf;
 	int slot;
-	u64 last = block_group->key.objectid;
+	u64 last;
 
 	if (!block_group)
 		return 0;
@@ -239,7 +239,8 @@ static int cache_block_group(struct btrfs_root *root,
 	 * skip the locking here
 	 */
 	path->skip_locking = 1;
-	key.objectid = max_t(u64, last, BTRFS_SUPER_INFO_OFFSET);
+	last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
+	key.objectid = last;
 	key.offset = 0;
 	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
@@ -5335,8 +5336,20 @@ static int noinline relocate_one_extent(struct btrfs_root *extent_root,
 			prev_block = block_start;
 		}
 
-		if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID &&
-		    pass >= 2) {
+		btrfs_record_root_in_trans(found_root);
+		if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
+			/*
+			 * try to update data extent references while
+			 * keeping metadata shared between snapshots.
+			 */
+			if (pass == 1) {
+				ret = relocate_one_path(trans, found_root,
+						path, &first_key, ref_path,
+						group, reloc_inode);
+				if (ret < 0)
+					goto out;
+				continue;
+			}
 			/*
 			 * use fallback method to process the remaining
 			 * references.
@@ -5359,23 +5372,9 @@ static int noinline relocate_one_extent(struct btrfs_root *extent_root,
 						path, extent_key,
 						&first_key, ref_path,
 						new_extents, nr_extents);
-			if (ret < 0)
-				goto out;
-			continue;
-		}
-
-		btrfs_record_root_in_trans(found_root);
-		if (ref_path->owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
+		} else {
 			ret = relocate_tree_block(trans, found_root, path,
 						  &first_key, ref_path);
-		} else {
-			/*
-			 * try to update data extent references while
-			 * keeping metadata shared between snapshots.
-			 */
-			ret = relocate_one_path(trans, found_root, path,
-						&first_key, ref_path,
-						group, reloc_inode);
 		}
 		if (ret < 0)
 			goto out;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 09908f25fca9..84c3b66564d0 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -58,14 +58,15 @@ static struct super_operations btrfs_super_ops;
 static void btrfs_put_super (struct super_block * sb)
 {
 	struct btrfs_root *root = btrfs_sb(sb);
-	struct btrfs_fs_info *fs = root->fs_info;
 	int ret;
 
 	ret = close_ctree(root);
 	if (ret) {
 		printk("close ctree returns %d\n", ret);
 	}
-	btrfs_sysfs_del_super(fs);
+#if 0
+	btrfs_sysfs_del_super(root->fs_info);
+#endif
 	sb->s_fs_info = NULL;
 }
 
@@ -349,11 +350,12 @@ static int btrfs_fill_super(struct super_block * sb,
 		err = -ENOMEM;
 		goto fail_close;
 	}
-
+#if 0
 	/* this does the super kobj at the same time */
 	err = btrfs_sysfs_add_super(tree_root->fs_info);
 	if (err)
 		goto fail_close;
+#endif
 
 	sb->s_root = root_dentry;
 
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 4d210a731d40..6672adcec9f8 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -47,7 +47,6 @@ static int init_first_rw_device(struct btrfs_trans_handle *trans,
 				struct btrfs_device *device);
 static int btrfs_relocate_sys_chunks(struct btrfs_root *root);
 
-
 #define map_lookup_size(n) (sizeof(struct map_lookup) + \
 			    (sizeof(struct btrfs_bio_stripe) * (n)))
 
@@ -74,34 +73,29 @@ static void unlock_chunks(struct btrfs_root *root)
 	mutex_unlock(&root->fs_info->chunk_mutex);
 }
 
+static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
+{
+	struct btrfs_device *device;
+	WARN_ON(fs_devices->opened);
+	while (!list_empty(&fs_devices->devices)) {
+		device = list_entry(fs_devices->devices.next,
+				    struct btrfs_device, dev_list);
+		list_del(&device->dev_list);
+		kfree(device->name);
+		kfree(device);
+	}
+	kfree(fs_devices);
+}
+
 int btrfs_cleanup_fs_uuids(void)
 {
 	struct btrfs_fs_devices *fs_devices;
-	struct btrfs_device *dev;
 
 	while (!list_empty(&fs_uuids)) {
 		fs_devices = list_entry(fs_uuids.next,
 					struct btrfs_fs_devices, list);
 		list_del(&fs_devices->list);
-		while(!list_empty(&fs_devices->devices)) {
-			dev = list_entry(fs_devices->devices.next,
-					 struct btrfs_device, dev_list);
-			if (dev->bdev) {
-				close_bdev_exclusive(dev->bdev, dev->mode);
-				fs_devices->open_devices--;
-			}
-			fs_devices->num_devices--;
-			if (dev->writeable)
-				fs_devices->rw_devices--;
-			list_del(&dev->dev_list);
-			list_del(&dev->dev_alloc_list);
-			kfree(dev->name);
-			kfree(dev);
-		}
-		WARN_ON(fs_devices->num_devices);
-		WARN_ON(fs_devices->open_devices);
-		WARN_ON(fs_devices->rw_devices);
-		kfree(fs_devices);
+		free_fs_devices(fs_devices);
 	}
 	return 0;
 }
@@ -304,12 +298,55 @@ static noinline int device_list_add(const char *path,
 	return 0;
 }
 
+static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
+{
+	struct btrfs_fs_devices *fs_devices;
+	struct btrfs_device *device;
+	struct btrfs_device *orig_dev;
+
+	fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
+	if (!fs_devices)
+		return ERR_PTR(-ENOMEM);
+
+	INIT_LIST_HEAD(&fs_devices->devices);
+	INIT_LIST_HEAD(&fs_devices->alloc_list);
+	INIT_LIST_HEAD(&fs_devices->list);
+	fs_devices->latest_devid = orig->latest_devid;
+	fs_devices->latest_trans = orig->latest_trans;
+	memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid));
+
+	list_for_each_entry(orig_dev, &orig->devices, dev_list) {
+		device = kzalloc(sizeof(*device), GFP_NOFS);
+		if (!device)
+			goto error;
+
+		device->name = kstrdup(orig_dev->name, GFP_NOFS);
+		if (!device->name)
+			goto error;
+
+		device->devid = orig_dev->devid;
+		device->work.func = pending_bios_fn;
+		memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid));
+		device->barriers = 1;
+		spin_lock_init(&device->io_lock);
+		INIT_LIST_HEAD(&device->dev_list);
+		INIT_LIST_HEAD(&device->dev_alloc_list);
+
+		list_add(&device->dev_list, &fs_devices->devices);
+		device->fs_devices = fs_devices;
+		fs_devices->num_devices++;
+	}
+	return fs_devices;
+error:
+	free_fs_devices(fs_devices);
+	return ERR_PTR(-ENOMEM);
+}
+
 int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices)
 {
 	struct list_head *tmp;
 	struct list_head *cur;
 	struct btrfs_device *device;
-	int seed_devices = 0;
 
 	mutex_lock(&uuid_mutex);
 again:
@@ -328,17 +365,14 @@ again:
 			device->writeable = 0;
 			fs_devices->rw_devices--;
 		}
-		if (!seed_devices) {
-			list_del_init(&device->dev_list);
-			fs_devices->num_devices--;
-			kfree(device->name);
-			kfree(device);
-		}
+		list_del_init(&device->dev_list);
+		fs_devices->num_devices--;
+		kfree(device->name);
+		kfree(device);
 	}
 
 	if (fs_devices->seed) {
 		fs_devices = fs_devices->seed;
-		seed_devices = 1;
 		goto again;
 	}
 
@@ -348,10 +382,9 @@ again:
 
 static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
 {
-	struct btrfs_fs_devices *seed_devices;
 	struct list_head *cur;
 	struct btrfs_device *device;
-again:
+
 	if (--fs_devices->opened > 0)
 		return 0;
 
@@ -370,31 +403,38 @@ again:
 		device->writeable = 0;
 		device->in_fs_metadata = 0;
 	}
+	WARN_ON(fs_devices->open_devices);
+	WARN_ON(fs_devices->rw_devices);
 	fs_devices->opened = 0;
 	fs_devices->seeding = 0;
-	fs_devices->sprouted = 0;
 
-	seed_devices = fs_devices->seed;
-	fs_devices->seed = NULL;
-	if (seed_devices) {
-		fs_devices = seed_devices;
-		goto again;
-	}
 	return 0;
 }
 
 int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
 {
+	struct btrfs_fs_devices *seed_devices = NULL;
 	int ret;
 
 	mutex_lock(&uuid_mutex);
 	ret = __btrfs_close_devices(fs_devices);
+	if (!fs_devices->opened) {
+		seed_devices = fs_devices->seed;
+		fs_devices->seed = NULL;
+	}
 	mutex_unlock(&uuid_mutex);
+
+	while (seed_devices) {
+		fs_devices = seed_devices;
+		seed_devices = fs_devices->seed;
+		__btrfs_close_devices(fs_devices);
+		free_fs_devices(fs_devices);
+	}
 	return ret;
 }
 
-int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
-			 fmode_t flags, void *holder)
+static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
+				fmode_t flags, void *holder)
 {
 	struct block_device *bdev;
 	struct list_head *head = &fs_devices->devices;
@@ -490,12 +530,8 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 
 	mutex_lock(&uuid_mutex);
 	if (fs_devices->opened) {
-		if (fs_devices->sprouted) {
-			ret = -EBUSY;
-		} else {
-			fs_devices->opened++;
-			ret = 0;
-		}
+		fs_devices->opened++;
+		ret = 0;
 	} else {
 		ret = __btrfs_open_devices(fs_devices, flags, holder);
 	}
@@ -1043,12 +1079,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 		goto error_brelse;
 
 	device->in_fs_metadata = 0;
-	if (device->fs_devices == root->fs_info->fs_devices) {
-		list_del_init(&device->dev_list);
-		root->fs_info->fs_devices->num_devices--;
-		if (device->bdev)
-			device->fs_devices->open_devices--;
-	}
+	list_del_init(&device->dev_list);
+	device->fs_devices->num_devices--;
 
 	next_device = list_entry(root->fs_info->fs_devices->devices.next,
 				 struct btrfs_device, dev_list);
@@ -1057,34 +1089,27 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 	if (device->bdev == root->fs_info->fs_devices->latest_bdev)
 		root->fs_info->fs_devices->latest_bdev = next_device->bdev;
 
+	if (device->bdev) {
+		close_bdev_exclusive(device->bdev, device->mode);
+		device->bdev = NULL;
+		device->fs_devices->open_devices--;
+	}
+
 	num_devices = btrfs_super_num_devices(&root->fs_info->super_copy) - 1;
 	btrfs_set_super_num_devices(&root->fs_info->super_copy, num_devices);
 
-	if (device->fs_devices != root->fs_info->fs_devices) {
-		BUG_ON(device->writeable);
-		brelse(bh);
-		if (bdev)
-			close_bdev_exclusive(bdev, FMODE_READ);
-
-		if (device->bdev) {
-			close_bdev_exclusive(device->bdev, device->mode);
-			device->bdev = NULL;
-			device->fs_devices->open_devices--;
-		}
-		if (device->fs_devices->open_devices == 0) {
-			struct btrfs_fs_devices *fs_devices;
-			fs_devices = root->fs_info->fs_devices;
-			while (fs_devices) {
-				if (fs_devices->seed == device->fs_devices)
-					break;
-				fs_devices = fs_devices->seed;
-			}
-			fs_devices->seed = device->fs_devices->seed;
-			device->fs_devices->seed = NULL;
-			__btrfs_close_devices(device->fs_devices);
+	if (device->fs_devices->open_devices == 0) {
+		struct btrfs_fs_devices *fs_devices;
+		fs_devices = root->fs_info->fs_devices;
+		while (fs_devices) {
+			if (fs_devices->seed == device->fs_devices)
+				break;
+			fs_devices = fs_devices->seed;
 		}
-		ret = 0;
-		goto out;
+		fs_devices->seed = device->fs_devices->seed;
+		device->fs_devices->seed = NULL;
+		__btrfs_close_devices(device->fs_devices);
+		free_fs_devices(device->fs_devices);
 	}
 
 	/*
@@ -1099,20 +1124,10 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 		set_buffer_dirty(bh);
 		sync_dirty_buffer(bh);
 	}
-	brelse(bh);
 
-	if (device->bdev) {
-		/* one close for the device struct or super_block */
-		close_bdev_exclusive(device->bdev, device->mode);
-	}
-	if (bdev) {
-		/* one close for us */
-		close_bdev_exclusive(bdev, FMODE_READ);
-	}
 	kfree(device->name);
 	kfree(device);
 	ret = 0;
-	goto out;
 
 error_brelse:
 	brelse(bh);
@@ -1133,34 +1148,41 @@ static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans,
 {
 	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
 	struct btrfs_fs_devices *old_devices;
+	struct btrfs_fs_devices *seed_devices;
 	struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
 	struct btrfs_device *device;
 	u64 super_flags;
 
 	BUG_ON(!mutex_is_locked(&uuid_mutex));
-	if (!fs_devices->seeding || fs_devices->opened != 1)
+	if (!fs_devices->seeding)
 		return -EINVAL;
 
-	old_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
-	if (!old_devices)
+	seed_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
+	if (!seed_devices)
 		return -ENOMEM;
 
-	memcpy(old_devices, fs_devices, sizeof(*old_devices));
-	old_devices->opened = 1;
-	old_devices->sprouted = 1;
-	INIT_LIST_HEAD(&old_devices->devices);
-	INIT_LIST_HEAD(&old_devices->alloc_list);
-	list_splice_init(&fs_devices->devices, &old_devices->devices);
-	list_splice_init(&fs_devices->alloc_list, &old_devices->alloc_list);
-	list_for_each_entry(device, &old_devices->devices, dev_list) {
-		device->fs_devices = old_devices;
+	old_devices = clone_fs_devices(fs_devices);
+	if (IS_ERR(old_devices)) {
+		kfree(seed_devices);
+		return PTR_ERR(old_devices);
 	}
+
 	list_add(&old_devices->list, &fs_uuids);
 
+	memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
+	seed_devices->opened = 1;
+	INIT_LIST_HEAD(&seed_devices->devices);
+	INIT_LIST_HEAD(&seed_devices->alloc_list);
+	list_splice_init(&fs_devices->devices, &seed_devices->devices);
+	list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
+	list_for_each_entry(device, &seed_devices->devices, dev_list) {
+		device->fs_devices = seed_devices;
+	}
+
 	fs_devices->seeding = 0;
 	fs_devices->num_devices = 0;
 	fs_devices->open_devices = 0;
-	fs_devices->seed = old_devices;
+	fs_devices->seed = seed_devices;
 
 	generate_random_uuid(fs_devices->fsid);
 	memcpy(root->fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
@@ -2642,7 +2664,6 @@ int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree,
 				 NULL, 0, page);
 }
 
-
 static void end_bio_multi_stripe(struct bio *bio, int err)
 {
 	struct btrfs_multi_bio *multi = bio->bi_private;
@@ -2840,6 +2861,7 @@ static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
 	device->dev_root = root->fs_info->dev_root;
 	device->devid = devid;
 	device->work.func = pending_bios_fn;
+	device->fs_devices = fs_devices;
 	fs_devices->num_devices++;
 	spin_lock_init(&device->io_lock);
 	INIT_LIST_HEAD(&device->dev_alloc_list);
@@ -2980,8 +3002,10 @@ static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
 		ret = -ENOENT;
 		goto out;
 	}
-	if (fs_devices->opened) {
-		ret = -EBUSY;
+
+	fs_devices = clone_fs_devices(fs_devices);
+	if (IS_ERR(fs_devices)) {
+		ret = PTR_ERR(fs_devices);
 		goto out;
 	}
 
@@ -2992,13 +3016,13 @@ static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
 
 	if (!fs_devices->seeding) {
 		__btrfs_close_devices(fs_devices);
+		free_fs_devices(fs_devices);
 		ret = -EINVAL;
 		goto out;
 	}
 
 	fs_devices->seed = root->fs_info->fs_devices->seed;
 	root->fs_info->fs_devices->seed = fs_devices;
-	fs_devices->sprouted = 1;
 out:
 	mutex_unlock(&uuid_mutex);
 	return ret;
@@ -3011,7 +3035,6 @@ static int read_one_dev(struct btrfs_root *root,
 	struct btrfs_device *device;
 	u64 devid;
 	int ret;
-	int seed_devices = 0;
 	u8 fs_uuid[BTRFS_UUID_SIZE];
 	u8 dev_uuid[BTRFS_UUID_SIZE];
 
@@ -3025,14 +3048,13 @@ static int read_one_dev(struct btrfs_root *root,
 
 	if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) {
 		ret = open_seed_devices(root, fs_uuid);
-		if (ret)
+		if (ret && !btrfs_test_opt(root, DEGRADED))
 			return ret;
-		seed_devices = 1;
 	}
 
 	device = btrfs_find_device(root, devid, dev_uuid, fs_uuid);
 	if (!device || !device->bdev) {
-		if (!btrfs_test_opt(root, DEGRADED) || seed_devices)
+		if (!btrfs_test_opt(root, DEGRADED))
 			return -EIO;
 
 		if (!device) {
@@ -3074,7 +3096,7 @@ int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf)
 	return read_one_dev(root, buf, dev_item);
 }
 
-int btrfs_read_sys_array(struct btrfs_root *root, u64 sb_bytenr)
+int btrfs_read_sys_array(struct btrfs_root *root)
 {
 	struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
 	struct extent_buffer *sb;
@@ -3089,7 +3111,7 @@ int btrfs_read_sys_array(struct btrfs_root *root, u64 sb_bytenr)
 	u32 cur;
 	struct btrfs_key key;
 
-	sb = btrfs_find_create_tree_block(root, sb_bytenr,
+	sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET,
 					  BTRFS_SUPER_INFO_SIZE);
 	if (!sb)
 		return -ENOMEM;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index bdebe83c3195..86c44e9ae110 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -93,7 +93,6 @@ struct btrfs_fs_devices {
 
 	struct btrfs_fs_devices *seed;
 	int seeding;
-	int sprouted;
 
 	int opened;
 };
@@ -127,7 +126,7 @@ int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
 		     u64 chunk_start, u64 physical, u64 devid,
 		     u64 **logical, int *naddrs, int *stripe_len);
-int btrfs_read_sys_array(struct btrfs_root *root, u64 sb_bytenr);
+int btrfs_read_sys_array(struct btrfs_root *root);
 int btrfs_read_chunk_tree(struct btrfs_root *root);
 int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 		      struct btrfs_root *extent_root, u64 type);
-- 
cgit v1.2.3


From d397712bcc6a759a560fd247e6053ecae091f958 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 5 Jan 2009 21:25:51 -0500
Subject: Btrfs: Fix checkpatch.pl warnings

There were many; most are fixed now.  struct-funcs.c generates some warnings,
but these are bogus.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/acl.c              |   5 +-
 fs/btrfs/async-thread.c     |   6 +-
 fs/btrfs/compat.h           |   4 +-
 fs/btrfs/compression.c      |  12 +--
 fs/btrfs/ctree.c            | 217 ++++++++++++++++--------------------------
 fs/btrfs/ctree.h            |  30 +++---
 fs/btrfs/dir-item.c         |   2 +-
 fs/btrfs/disk-io.c          | 187 +++++++++++++------------------------
 fs/btrfs/export.c           |   8 +-
 fs/btrfs/extent-tree.c      | 223 +++++++++++++++++++++++---------------------
 fs/btrfs/extent_io.c        | 213 ++++++++++++++++++++----------------------
 fs/btrfs/extent_map.c       |  14 +--
 fs/btrfs/file-item.c        |  18 ++--
 fs/btrfs/file.c             |  49 +++++-----
 fs/btrfs/free-space-cache.c |  37 ++++----
 fs/btrfs/inode-map.c        |   1 -
 fs/btrfs/inode.c            | 173 +++++++++++++++++-----------------
 fs/btrfs/ioctl.c            |  37 ++++----
 fs/btrfs/locking.c          |   5 +-
 fs/btrfs/ordered-data.c     |  34 +++----
 fs/btrfs/print-tree.c       |  73 +++++++++------
 fs/btrfs/ref-cache.c        |  12 +--
 fs/btrfs/root-tree.c        |  17 ++--
 fs/btrfs/struct-funcs.c     |   4 +-
 fs/btrfs/super.c            |  25 +++--
 fs/btrfs/sysfs.c            |   6 +-
 fs/btrfs/transaction.c      |  45 ++++-----
 fs/btrfs/transaction.h      |   6 +-
 fs/btrfs/tree-defrag.c      |   9 +-
 fs/btrfs/tree-log.c         |  70 +++++++-------
 fs/btrfs/volumes.c          |  78 ++++++++--------
 fs/btrfs/xattr.c            |   3 +-
 fs/btrfs/zlib.c             |  45 ++++-----
 33 files changed, 770 insertions(+), 898 deletions(-)

(limited to 'fs/btrfs/volumes.c')

diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 867eaf1f8efb..1d53b62dbba5 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -161,8 +161,7 @@ static int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
 	ret = __btrfs_setxattr(inode, name, value, size, 0);
 
 out:
-	if (value)
-		kfree(value);
+	kfree(value);
 
 	if (!ret)
 		btrfs_update_cached_acl(inode, p_acl, acl);
@@ -213,7 +212,7 @@ static int btrfs_xattr_acl_default_get(struct inode *inode, const char *name,
 }
 
 static int btrfs_xattr_acl_default_set(struct inode *inode, const char *name,
-				       const void *value, size_t size, int flags)
+			       const void *value, size_t size, int flags)
 {
 	return btrfs_xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size);
 }
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 4229450b7596..8e2fec05dbe0 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -104,7 +104,7 @@ static noinline int run_ordered_completions(struct btrfs_workers *workers,
 
 	spin_lock_irqsave(&workers->lock, flags);
 
-	while(!list_empty(&workers->order_list)) {
+	while (!list_empty(&workers->order_list)) {
 		work = list_entry(workers->order_list.next,
 				  struct btrfs_work, order_list);
 
@@ -143,7 +143,7 @@ static int worker_loop(void *arg)
 	struct btrfs_work *work;
 	do {
 		spin_lock_irq(&worker->lock);
-		while(!list_empty(&worker->pending)) {
+		while (!list_empty(&worker->pending)) {
 			cur = worker->pending.next;
 			work = list_entry(cur, struct btrfs_work, list);
 			list_del(&work->list);
@@ -188,7 +188,7 @@ int btrfs_stop_workers(struct btrfs_workers *workers)
 	struct btrfs_worker_thread *worker;
 
 	list_splice_init(&workers->idle_list, &workers->worker_list);
-	while(!list_empty(&workers->worker_list)) {
+	while (!list_empty(&workers->worker_list)) {
 		cur = workers->worker_list.next;
 		worker = list_entry(cur, struct btrfs_worker_thread,
 				    worker_list);
diff --git a/fs/btrfs/compat.h b/fs/btrfs/compat.h
index 75e4426d6fbb..594d60bdd3c4 100644
--- a/fs/btrfs/compat.h
+++ b/fs/btrfs/compat.h
@@ -4,7 +4,7 @@
 #define btrfs_drop_nlink(inode) drop_nlink(inode)
 #define btrfs_inc_nlink(inode)	inc_nlink(inode)
 
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,27)
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2, 6, 27)
 static inline struct dentry *d_obtain_alias(struct inode *inode)
 {
 	struct dentry *d;
@@ -21,7 +21,7 @@ static inline struct dentry *d_obtain_alias(struct inode *inode)
 }
 #endif
 
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28)
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 28)
 # define  __pagevec_lru_add_file __pagevec_lru_add
 # define open_bdev_exclusive open_bdev_excl
 # define close_bdev_exclusive(bdev, mode) close_bdev_excl(bdev)
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 2436163d5436..ee848d8585d9 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -137,7 +137,8 @@ static int check_compressed_csum(struct inode *inode,
 		kunmap_atomic(kaddr, KM_USER0);
 
 		if (csum != *cb_sum) {
-			printk("btrfs csum failed ino %lu extent %llu csum %u "
+			printk(KERN_INFO "btrfs csum failed ino %lu "
+			       "extent %llu csum %u "
 			       "wanted %u mirror %d\n", inode->i_ino,
 			       (unsigned long long)disk_start,
 			       csum, *cb_sum, cb->mirror_num);
@@ -217,7 +218,7 @@ csum_failed:
 		 * we have verified the checksum already, set page
 		 * checked so the end_io handlers know about it
 		 */
-		while(bio_index < cb->orig_bio->bi_vcnt) {
+		while (bio_index < cb->orig_bio->bi_vcnt) {
 			SetPageChecked(bvec->bv_page);
 			bvec++;
 			bio_index++;
@@ -246,7 +247,7 @@ static noinline int end_compressed_writeback(struct inode *inode, u64 start,
 	int i;
 	int ret;
 
-	while(nr_pages > 0) {
+	while (nr_pages > 0) {
 		ret = find_get_pages_contig(inode->i_mapping, index,
 				     min_t(unsigned long,
 				     nr_pages, ARRAY_SIZE(pages)), pages);
@@ -463,7 +464,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
 	end_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;
 
 	pagevec_init(&pvec, 0);
-	while(last_offset < compressed_end) {
+	while (last_offset < compressed_end) {
 		page_index = last_offset >> PAGE_CACHE_SHIFT;
 
 		if (page_index > end_index)
@@ -697,9 +698,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 	ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
 	BUG_ON(ret);
 
-	if (!btrfs_test_flag(inode, NODATASUM)) {
+	if (!btrfs_test_flag(inode, NODATASUM))
 		btrfs_lookup_bio_sums(root, inode, comp_bio, sums);
-	}
 
 	ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0);
 	BUG_ON(ret);
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 7fad2e3ad6ff..9e46c0776816 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -67,7 +67,7 @@ void btrfs_free_path(struct btrfs_path *p)
  *
  * It is safe to call this on paths that no locks or extent buffers held.
  */
-void noinline btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p)
+noinline void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p)
 {
 	int i;
 
@@ -112,7 +112,7 @@ struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
 {
 	struct extent_buffer *eb;
 
-	while(1) {
+	while (1) {
 		eb = btrfs_root_node(root);
 		btrfs_tree_lock(eb);
 
@@ -202,22 +202,22 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
 }
 
 /*
- * does the dirty work in cow of a single block.  The parent block
- * (if supplied) is updated to point to the new cow copy.  The new
- * buffer is marked dirty and returned locked.  If you modify the block
- * it needs to be marked dirty again.
+ * does the dirty work in cow of a single block.  The parent block (if
+ * supplied) is updated to point to the new cow copy.  The new buffer is marked
+ * dirty and returned locked.  If you modify the block it needs to be marked
+ * dirty again.
  *
  * search_start -- an allocation hint for the new block
  *
- * empty_size -- a hint that you plan on doing more cow.  This is the size in bytes
- * the allocator should try to find free next to the block it returns.  This is
- * just a hint and may be ignored by the allocator.
+ * empty_size -- a hint that you plan on doing more cow.  This is the size in
+ * bytes the allocator should try to find free next to the block it returns.
+ * This is just a hint and may be ignored by the allocator.
  *
  * prealloc_dest -- if you have already reserved a destination for the cow,
- * this uses that block instead of allocating a new one.  btrfs_alloc_reserved_extent
- * is used to finish the allocation.
+ * this uses that block instead of allocating a new one.
+ * btrfs_alloc_reserved_extent is used to finish the allocation.
  */
-static int noinline __btrfs_cow_block(struct btrfs_trans_handle *trans,
+static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root,
 			     struct extent_buffer *buf,
 			     struct extent_buffer *parent, int parent_slot,
@@ -366,7 +366,7 @@ static int noinline __btrfs_cow_block(struct btrfs_trans_handle *trans,
  * This version of it has extra checks so that a block isn't cow'd more than
  * once per transaction, as long as it hasn't been written yet
  */
-int noinline btrfs_cow_block(struct btrfs_trans_handle *trans,
+noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
 		    struct btrfs_root *root, struct extent_buffer *buf,
 		    struct extent_buffer *parent, int parent_slot,
 		    struct extent_buffer **cow_ret, u64 prealloc_dest)
@@ -375,13 +375,16 @@ int noinline btrfs_cow_block(struct btrfs_trans_handle *trans,
 	int ret;
 
 	if (trans->transaction != root->fs_info->running_transaction) {
-		printk(KERN_CRIT "trans %Lu running %Lu\n", trans->transid,
+		printk(KERN_CRIT "trans %llu running %llu\n",
+		       (unsigned long long)trans->transid,
+		       (unsigned long long)
 		       root->fs_info->running_transaction->transid);
 		WARN_ON(1);
 	}
 	if (trans->transid != root->fs_info->generation) {
-		printk(KERN_CRIT "trans %Lu running %Lu\n", trans->transid,
-		       root->fs_info->generation);
+		printk(KERN_CRIT "trans %llu running %llu\n",
+		       (unsigned long long)trans->transid,
+		       (unsigned long long)root->fs_info->generation);
 		WARN_ON(1);
 	}
 
@@ -489,16 +492,10 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 	if (cache_only && parent_level != 1)
 		return 0;
 
-	if (trans->transaction != root->fs_info->running_transaction) {
-		printk(KERN_CRIT "trans %Lu running %Lu\n", trans->transid,
-		       root->fs_info->running_transaction->transid);
+	if (trans->transaction != root->fs_info->running_transaction)
 		WARN_ON(1);
-	}
-	if (trans->transid != root->fs_info->generation) {
-		printk(KERN_CRIT "trans %Lu running %Lu\n", trans->transid,
-		       root->fs_info->generation);
+	if (trans->transid != root->fs_info->generation)
 		WARN_ON(1);
-	}
 
 	parent_nritems = btrfs_header_nritems(parent);
 	blocksize = btrfs_level_size(root, parent_level - 1);
@@ -681,51 +678,18 @@ static int check_leaf(struct btrfs_root *root, struct btrfs_path *path,
 		BUG_ON(btrfs_node_blockptr(parent, parent_slot) !=
 		       btrfs_header_bytenr(leaf));
 	}
-#if 0
-	for (i = 0; nritems > 1 && i < nritems - 2; i++) {
-		btrfs_item_key_to_cpu(leaf, &cpukey, i + 1);
-		btrfs_item_key(leaf, &leaf_key, i);
-		if (comp_keys(&leaf_key, &cpukey) >= 0) {
-			btrfs_print_leaf(root, leaf);
-			printk("slot %d offset bad key\n", i);
-			BUG_ON(1);
-		}
-		if (btrfs_item_offset_nr(leaf, i) !=
-			btrfs_item_end_nr(leaf, i + 1)) {
-			btrfs_print_leaf(root, leaf);
-			printk("slot %d offset bad\n", i);
-			BUG_ON(1);
-		}
-		if (i == 0) {
-			if (btrfs_item_offset_nr(leaf, i) +
-			       btrfs_item_size_nr(leaf, i) !=
-			       BTRFS_LEAF_DATA_SIZE(root)) {
-				btrfs_print_leaf(root, leaf);
-				printk("slot %d first offset bad\n", i);
-				BUG_ON(1);
-			}
-		}
-	}
-	if (nritems > 0) {
-		if (btrfs_item_size_nr(leaf, nritems - 1) > 4096) {
-				btrfs_print_leaf(root, leaf);
-				printk("slot %d bad size \n", nritems - 1);
-				BUG_ON(1);
-		}
-	}
-#endif
 	if (slot != 0 && slot < nritems - 1) {
 		btrfs_item_key(leaf, &leaf_key, slot);
 		btrfs_item_key_to_cpu(leaf, &cpukey, slot - 1);
 		if (comp_keys(&leaf_key, &cpukey) <= 0) {
 			btrfs_print_leaf(root, leaf);
-			printk("slot %d offset bad key\n", slot);
+			printk(KERN_CRIT "slot %d offset bad key\n", slot);
 			BUG_ON(1);
 		}
 		if (btrfs_item_offset_nr(leaf, slot - 1) !=
 		       btrfs_item_end_nr(leaf, slot)) {
 			btrfs_print_leaf(root, leaf);
-			printk("slot %d offset bad\n", slot);
+			printk(KERN_CRIT "slot %d offset bad\n", slot);
 			BUG_ON(1);
 		}
 	}
@@ -736,7 +700,7 @@ static int check_leaf(struct btrfs_root *root, struct btrfs_path *path,
 		if (btrfs_item_offset_nr(leaf, slot) !=
 			btrfs_item_end_nr(leaf, slot + 1)) {
 			btrfs_print_leaf(root, leaf);
-			printk("slot %d offset bad\n", slot);
+			printk(KERN_CRIT "slot %d offset bad\n", slot);
 			BUG_ON(1);
 		}
 	}
@@ -745,30 +709,10 @@ static int check_leaf(struct btrfs_root *root, struct btrfs_path *path,
 	return 0;
 }
 
-static int noinline check_block(struct btrfs_root *root,
+static noinline int check_block(struct btrfs_root *root,
 				struct btrfs_path *path, int level)
 {
-	u64 found_start;
 	return 0;
-	if (btrfs_header_level(path->nodes[level]) != level)
-	    printk("warning: bad level %Lu wanted %d found %d\n",
-		   path->nodes[level]->start, level,
-		   btrfs_header_level(path->nodes[level]));
-	found_start = btrfs_header_bytenr(path->nodes[level]);
-	if (found_start != path->nodes[level]->start) {
-	    printk("warning: bad bytentr %Lu found %Lu\n",
-		   path->nodes[level]->start, found_start);
-	}
-#if 0
-	struct extent_buffer *buf = path->nodes[level];
-
-	if (memcmp_extent_buffer(buf, root->fs_info->fsid,
-				 (unsigned long)btrfs_header_fsid(buf),
-				 BTRFS_FSID_SIZE)) {
-		printk("warning bad block %Lu\n", buf->start);
-		return 1;
-	}
-#endif
 	if (level == 0)
 		return check_leaf(root, path, level);
 	return check_node(root, path, level);
@@ -802,7 +746,7 @@ static noinline int generic_bin_search(struct extent_buffer *eb,
 	unsigned long map_len = 0;
 	int err;
 
-	while(low < high) {
+	while (low < high) {
 		mid = (low + high) / 2;
 		offset = p + mid * item_size;
 
@@ -1130,7 +1074,7 @@ enospc:
  * when they are completely full.  This is also done top down, so we
  * have to be pessimistic.
  */
-static int noinline push_nodes_for_insert(struct btrfs_trans_handle *trans,
+static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
 					  struct btrfs_root *root,
 					  struct btrfs_path *path, int level)
 {
@@ -1296,7 +1240,7 @@ static noinline void reada_for_search(struct btrfs_root *root,
 
 	nritems = btrfs_header_nritems(node);
 	nr = slot;
-	while(1) {
+	while (1) {
 		if (direction < 0) {
 			if (nr == 0)
 				break;
@@ -1322,7 +1266,8 @@ static noinline void reada_for_search(struct btrfs_root *root,
 		nscan++;
 		if (path->reada < 2 && (nread > (64 * 1024) || nscan > 32))
 			break;
-		if(nread > (256 * 1024) || nscan > 128)
+
+		if (nread > (256 * 1024) || nscan > 128)
 			break;
 
 		if (search < lowest_read)
@@ -1333,17 +1278,17 @@ static noinline void reada_for_search(struct btrfs_root *root,
 }
 
 /*
- * when we walk down the tree, it is usually safe to unlock the higher layers in
- * the tree.  The exceptions are when our path goes through slot 0, because operations
- * on the tree might require changing key pointers higher up in the tree.
+ * when we walk down the tree, it is usually safe to unlock the higher layers
+ * in the tree.  The exceptions are when our path goes through slot 0, because
+ * operations on the tree might require changing key pointers higher up in the
+ * tree.
  *
- * callers might also have set path->keep_locks, which tells this code to
- * keep the lock if the path points to the last slot in the block.  This is
- * part of walking through the tree, and selecting the next slot in the higher
- * block.
+ * callers might also have set path->keep_locks, which tells this code to keep
+ * the lock if the path points to the last slot in the block.  This is part of
+ * walking through the tree, and selecting the next slot in the higher block.
  *
- * lowest_unlock sets the lowest level in the tree we're allowed to unlock.
- * so if lowest_unlock is 1, level 0 won't be unlocked
+ * lowest_unlock sets the lowest level in the tree we're allowed to unlock.  so
+ * if lowest_unlock is 1, level 0 won't be unlocked
  */
 static noinline void unlock_up(struct btrfs_path *path, int level,
 			       int lowest_unlock)
@@ -1832,9 +1777,8 @@ static int push_node_left(struct btrfs_trans_handle *trans,
 	if (!empty && src_nritems <= 8)
 		return 1;
 
-	if (push_items <= 0) {
+	if (push_items <= 0)
 		return 1;
-	}
 
 	if (empty) {
 		push_items = min(src_nritems, push_items);
@@ -1854,7 +1798,7 @@ static int push_node_left(struct btrfs_trans_handle *trans,
 	copy_extent_buffer(dst, src,
 			   btrfs_node_key_ptr_offset(dst_nritems),
 			   btrfs_node_key_ptr_offset(0),
-		           push_items * sizeof(struct btrfs_key_ptr));
+			   push_items * sizeof(struct btrfs_key_ptr));
 
 	if (push_items < src_nritems) {
 		memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
@@ -1899,19 +1843,16 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
 	src_nritems = btrfs_header_nritems(src);
 	dst_nritems = btrfs_header_nritems(dst);
 	push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
-	if (push_items <= 0) {
+	if (push_items <= 0)
 		return 1;
-	}
 
-	if (src_nritems < 4) {
+	if (src_nritems < 4)
 		return 1;
-	}
 
 	max_push = src_nritems / 2 + 1;
 	/* don't try to empty the node */
-	if (max_push >= src_nritems) {
+	if (max_push >= src_nritems)
 		return 1;
-	}
 
 	if (max_push < push_items)
 		push_items = max_push;
@@ -1924,7 +1865,7 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
 	copy_extent_buffer(dst, src,
 			   btrfs_node_key_ptr_offset(0),
 			   btrfs_node_key_ptr_offset(src_nritems - push_items),
-		           push_items * sizeof(struct btrfs_key_ptr));
+			   push_items * sizeof(struct btrfs_key_ptr));
 
 	btrfs_set_header_nritems(src, src_nritems - push_items);
 	btrfs_set_header_nritems(dst, dst_nritems + push_items);
@@ -1945,7 +1886,7 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
  *
  * returns zero on success or < 0 on failure.
  */
-static int noinline insert_new_root(struct btrfs_trans_handle *trans,
+static noinline int insert_new_root(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root,
 			   struct btrfs_path *path, int level)
 {
@@ -2176,14 +2117,15 @@ static int leaf_space_used(struct extent_buffer *l, int start, int nr)
  * the start of the leaf data.  IOW, how much room
  * the leaf has left for both items and data
  */
-int noinline btrfs_leaf_free_space(struct btrfs_root *root,
+noinline int btrfs_leaf_free_space(struct btrfs_root *root,
 				   struct extent_buffer *leaf)
 {
 	int nritems = btrfs_header_nritems(leaf);
 	int ret;
 	ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems);
 	if (ret < 0) {
-		printk("leaf free space ret %d, leaf data size %lu, used %d nritems %d\n",
+		printk(KERN_CRIT "leaf free space ret %d, leaf data size %lu, "
+		       "used %d nritems %d\n",
 		       ret, (unsigned long) BTRFS_LEAF_DATA_SIZE(root),
 		       leaf_space_used(leaf, 0, nritems), nritems);
 	}
@@ -2219,9 +2161,9 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
 	int ret;
 
 	slot = path->slots[1];
-	if (!path->nodes[1]) {
+	if (!path->nodes[1])
 		return 1;
-	}
+
 	upper = path->nodes[1];
 	if (slot >= btrfs_header_nritems(upper) - 1)
 		return 1;
@@ -2418,9 +2360,8 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
 		return 1;
 
 	right_nritems = btrfs_header_nritems(right);
-	if (right_nritems == 0) {
+	if (right_nritems == 0)
 		return 1;
-	}
 
 	WARN_ON(!btrfs_tree_locked(path->nodes[1]));
 
@@ -2502,7 +2443,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
 			   push_items * sizeof(struct btrfs_item));
 
 	push_space = BTRFS_LEAF_DATA_SIZE(root) -
-		     btrfs_item_offset_nr(right, push_items -1);
+		     btrfs_item_offset_nr(right, push_items - 1);
 
 	copy_extent_buffer(left, right, btrfs_leaf_data(left) +
 		     leaf_data_end(root, left) - push_space,
@@ -2537,7 +2478,8 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
 
 	/* fixup right node */
 	if (push_items > right_nritems) {
-		printk("push items %d nr %u\n", push_items, right_nritems);
+		printk(KERN_CRIT "push items %d nr %u\n", push_items,
+		       right_nritems);
 		WARN_ON(1);
 	}
 
@@ -2640,9 +2582,8 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
 	/* first try to make some room by pushing left and right */
 	if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) {
 		wret = push_leaf_right(trans, root, path, data_size, 0);
-		if (wret < 0) {
+		if (wret < 0)
 			return wret;
-		}
 		if (wret) {
 			wret = push_leaf_left(trans, root, path, data_size, 0);
 			if (wret < 0)
@@ -2665,7 +2606,7 @@ again:
 	l = path->nodes[0];
 	slot = path->slots[0];
 	nritems = btrfs_header_nritems(l);
-	mid = (nritems + 1)/ 2;
+	mid = (nritems + 1) / 2;
 
 	right = btrfs_alloc_free_block(trans, root, root->leafsize,
 					path->nodes[1]->start,
@@ -2734,7 +2675,7 @@ again:
 				path->slots[0] = 0;
 				if (path->slots[1] == 0) {
 					wret = fixup_low_keys(trans, root,
-					           path, &disk_key, 1);
+						      path, &disk_key, 1);
 					if (wret)
 						ret = wret;
 				}
@@ -3033,8 +2974,8 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans,
 			    BTRFS_FILE_EXTENT_INLINE) {
 				ptr = btrfs_item_ptr_offset(leaf, slot);
 				memmove_extent_buffer(leaf, ptr,
-				        (unsigned long)fi,
-				        offsetof(struct btrfs_file_extent_item,
+				      (unsigned long)fi,
+				      offsetof(struct btrfs_file_extent_item,
 						 disk_bytenr));
 			}
 		}
@@ -3096,7 +3037,8 @@ int btrfs_extend_item(struct btrfs_trans_handle *trans,
 	BUG_ON(slot < 0);
 	if (slot >= nritems) {
 		btrfs_print_leaf(root, leaf);
-		printk("slot %d too large, nritems %d\n", slot, nritems);
+		printk(KERN_CRIT "slot %d too large, nritems %d\n",
+		       slot, nritems);
 		BUG_ON(1);
 	}
 
@@ -3218,7 +3160,7 @@ int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
 
 		if (old_data < data_end) {
 			btrfs_print_leaf(root, leaf);
-			printk("slot %d old_data %d data_end %d\n",
+			printk(KERN_CRIT "slot %d old_data %d data_end %d\n",
 			       slot, old_data, data_end);
 			BUG_ON(1);
 		}
@@ -3317,9 +3259,8 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
 	unsigned int data_end;
 	struct btrfs_disk_key disk_key;
 
-	for (i = 0; i < nr; i++) {
+	for (i = 0; i < nr; i++)
 		total_data += data_size[i];
-	}
 
 	total_size = total_data + (nr * sizeof(struct btrfs_item));
 	ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
@@ -3336,7 +3277,7 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
 
 	if (btrfs_leaf_free_space(root, leaf) < total_size) {
 		btrfs_print_leaf(root, leaf);
-		printk("not enough freespace need %u have %d\n",
+		printk(KERN_CRIT "not enough freespace need %u have %d\n",
 		       total_size, btrfs_leaf_free_space(root, leaf));
 		BUG();
 	}
@@ -3349,7 +3290,7 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
 
 		if (old_data < data_end) {
 			btrfs_print_leaf(root, leaf);
-			printk("slot %d old_data %d data_end %d\n",
+			printk(KERN_CRIT "slot %d old_data %d data_end %d\n",
 			       slot, old_data, data_end);
 			BUG_ON(1);
 		}
@@ -3457,7 +3398,7 @@ static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 	int wret;
 
 	nritems = btrfs_header_nritems(parent);
-	if (slot != nritems -1) {
+	if (slot != nritems - 1) {
 		memmove_extent_buffer(parent,
 			      btrfs_node_key_ptr_offset(slot),
 			      btrfs_node_key_ptr_offset(slot + 1),
@@ -3614,7 +3555,8 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 
 			if (btrfs_header_nritems(leaf) == 0) {
 				path->slots[1] = slot;
-				ret = btrfs_del_leaf(trans, root, path, leaf->start);
+				ret = btrfs_del_leaf(trans, root, path,
+						     leaf->start);
 				BUG_ON(ret);
 				free_extent_buffer(leaf);
 			} else {
@@ -3717,7 +3659,7 @@ again:
 		ret = 1;
 		goto out;
 	}
-	while(1) {
+	while (1) {
 		nritems = btrfs_header_nritems(cur);
 		level = btrfs_header_level(cur);
 		sret = bin_search(cur, min_key, level, &slot);
@@ -3738,7 +3680,7 @@ again:
 		 * min_trans parameters.  If it isn't in cache or is too
 		 * old, skip to the next one.
 		 */
-		while(slot < nritems) {
+		while (slot < nritems) {
 			u64 blockptr;
 			u64 gen;
 			struct extent_buffer *tmp;
@@ -3830,7 +3772,7 @@ int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
 	struct extent_buffer *c;
 
 	WARN_ON(!path->keep_locks);
-	while(level < BTRFS_MAX_LEVEL) {
+	while (level < BTRFS_MAX_LEVEL) {
 		if (!path->nodes[level])
 			return 1;
 
@@ -3839,9 +3781,8 @@ int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
 next:
 		if (slot >= btrfs_header_nritems(c)) {
 			level++;
-			if (level == BTRFS_MAX_LEVEL) {
+			if (level == BTRFS_MAX_LEVEL)
 				return 1;
-			}
 			continue;
 		}
 		if (level == 0)
@@ -3889,9 +3830,8 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 	int ret;
 
 	nritems = btrfs_header_nritems(path->nodes[0]);
-	if (nritems == 0) {
+	if (nritems == 0)
 		return 1;
-	}
 
 	btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
 
@@ -3915,7 +3855,7 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 		goto done;
 	}
 
-	while(level < BTRFS_MAX_LEVEL) {
+	while (level < BTRFS_MAX_LEVEL) {
 		if (!path->nodes[level])
 			return 1;
 
@@ -3923,9 +3863,8 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 		c = path->nodes[level];
 		if (slot >= btrfs_header_nritems(c)) {
 			level++;
-			if (level == BTRFS_MAX_LEVEL) {
+			if (level == BTRFS_MAX_LEVEL)
 				return 1;
-			}
 			continue;
 		}
 
@@ -3946,7 +3885,7 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 		break;
 	}
 	path->slots[level] = slot;
-	while(1) {
+	while (1) {
 		level--;
 		c = path->nodes[level];
 		if (path->locks[level])
@@ -3986,7 +3925,7 @@ int btrfs_previous_item(struct btrfs_root *root,
 	u32 nritems;
 	int ret;
 
-	while(1) {
+	while (1) {
 		if (path->slots[0] == 0) {
 			ret = btrfs_prev_leaf(root, path);
 			if (ret != 0)
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index ccea0648e106..eee060f88113 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -126,7 +126,6 @@ struct btrfs_ordered_sum;
 static int btrfs_csum_sizes[] = { 4, 0 };
 
 /* four bytes for CRC32 */
-//#define BTRFS_CRC32_SIZE 4
 #define BTRFS_EMPTY_DIR_SIZE 0
 
 #define BTRFS_FT_UNKNOWN	0
@@ -283,8 +282,8 @@ struct btrfs_header {
 } __attribute__ ((__packed__));
 
 #define BTRFS_NODEPTRS_PER_BLOCK(r) (((r)->nodesize - \
-			        sizeof(struct btrfs_header)) / \
-			        sizeof(struct btrfs_key_ptr))
+				      sizeof(struct btrfs_header)) / \
+				     sizeof(struct btrfs_key_ptr))
 #define __BTRFS_LEAF_DATA_SIZE(bs) ((bs) - sizeof(struct btrfs_header))
 #define BTRFS_LEAF_DATA_SIZE(r) (__BTRFS_LEAF_DATA_SIZE(r->leafsize))
 #define BTRFS_MAX_INLINE_DATA_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \
@@ -1512,7 +1511,7 @@ static inline struct btrfs_header *btrfs_buffer_header(struct extent_buffer *eb)
 
 static inline int btrfs_is_leaf(struct extent_buffer *eb)
 {
-	return (btrfs_header_level(eb) == 0);
+	return btrfs_header_level(eb) == 0;
 }
 
 /* struct btrfs_root_item */
@@ -1597,8 +1596,8 @@ static inline unsigned long btrfs_leaf_data(struct extent_buffer *l)
 /* struct btrfs_file_extent_item */
 BTRFS_SETGET_FUNCS(file_extent_type, struct btrfs_file_extent_item, type, 8);
 
-static inline unsigned long btrfs_file_extent_inline_start(struct
-						   btrfs_file_extent_item *e)
+static inline unsigned long
+btrfs_file_extent_inline_start(struct btrfs_file_extent_item *e)
 {
 	unsigned long offset = (unsigned long)e;
 	offset += offsetof(struct btrfs_file_extent_item, disk_bytenr);
@@ -1660,20 +1659,20 @@ static inline int btrfs_set_root_name(struct btrfs_root *root,
 				      const char *name, int len)
 {
 	/* if we already have a name just free it */
-	if (root->name)
-		kfree(root->name);
+	kfree(root->name);
 
 	root->name = kmalloc(len+1, GFP_KERNEL);
 	if (!root->name)
 		return -ENOMEM;
 
 	memcpy(root->name, name, len);
-	root->name[len] ='\0';
+	root->name[len] = '\0';
 
 	return 0;
 }
 
-static inline u32 btrfs_level_size(struct btrfs_root *root, int level) {
+static inline u32 btrfs_level_size(struct btrfs_root *root, int level)
+{
 	if (level == 0)
 		return root->leafsize;
 	return root->nodesize;
@@ -1707,9 +1706,9 @@ int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
 int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
 			 struct btrfs_root *root);
 int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy);
-struct btrfs_block_group_cache *btrfs_lookup_block_group(struct
-							 btrfs_fs_info *info,
-							 u64 bytenr);
+struct btrfs_block_group_cache *btrfs_lookup_block_group(
+						 struct btrfs_fs_info *info,
+						 u64 bytenr);
 u64 btrfs_find_block_group(struct btrfs_root *root,
 			   u64 search_start, u64 search_hint, int owner);
 struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
@@ -1908,8 +1907,9 @@ int btrfs_search_root(struct btrfs_root *root, u64 search_start,
 int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid,
 			  struct btrfs_root *latest_root);
 /* dir-item.c */
-int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
-			  *root, const char *name, int name_len, u64 dir,
+int btrfs_insert_dir_item(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root, const char *name,
+			  int name_len, u64 dir,
 			  struct btrfs_key *location, u8 type, u64 index);
 struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
 					     struct btrfs_root *root,
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 5040b71f1900..926a0b287a7d 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -333,7 +333,7 @@ struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
 	leaf = path->nodes[0];
 	dir_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item);
 	total_len = btrfs_item_size_nr(leaf, path->slots[0]);
-	while(cur < total_len) {
+	while (cur < total_len) {
 		this_len = sizeof(*dir_item) +
 			btrfs_dir_name_len(leaf, dir_item) +
 			btrfs_dir_data_len(leaf, dir_item);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index dae25e78a6b7..81a313874ae5 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -23,7 +23,7 @@
 #include <linux/swap.h>
 #include <linux/radix-tree.h>
 #include <linux/writeback.h>
-#include <linux/buffer_head.h> // for block_sync_page
+#include <linux/buffer_head.h>
 #include <linux/workqueue.h>
 #include <linux/kthread.h>
 #include <linux/freezer.h>
@@ -40,19 +40,6 @@
 #include "ref-cache.h"
 #include "tree-log.h"
 
-#if 0
-static int check_tree_block(struct btrfs_root *root, struct extent_buffer *buf)
-{
-	if (extent_buffer_blocknr(buf) != btrfs_header_blocknr(buf)) {
-		printk(KERN_CRIT "buf blocknr(buf) is %llu, header is %llu\n",
-		       (unsigned long long)extent_buffer_blocknr(buf),
-		       (unsigned long long)btrfs_header_blocknr(buf));
-		return 1;
-	}
-	return 0;
-}
-#endif
-
 static struct extent_io_ops btree_extent_io_ops;
 static void end_workqueue_fn(struct btrfs_work *work);
 
@@ -128,23 +115,13 @@ static struct extent_map *btree_get_extent(struct inode *inode,
 		u64 failed_start = em->start;
 		u64 failed_len = em->len;
 
-		printk("failed to insert %Lu %Lu -> %Lu into tree\n",
-		       em->start, em->len, em->block_start);
 		free_extent_map(em);
 		em = lookup_extent_mapping(em_tree, start, len);
 		if (em) {
-			printk("after failing, found %Lu %Lu %Lu\n",
-			       em->start, em->len, em->block_start);
 			ret = 0;
 		} else {
 			em = lookup_extent_mapping(em_tree, failed_start,
 						   failed_len);
-			if (em) {
-				printk("double failure lookup gives us "
-				       "%Lu %Lu -> %Lu\n", em->start,
-				       em->len, em->block_start);
-				free_extent_map(em);
-			}
 			ret = -EIO;
 		}
 	} else if (ret) {
@@ -191,15 +168,12 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
 	unsigned long inline_result;
 
 	len = buf->len - offset;
-	while(len > 0) {
+	while (len > 0) {
 		err = map_private_extent_buffer(buf, offset, 32,
 					&map_token, &kaddr,
 					&map_start, &map_len, KM_USER0);
-		if (err) {
-			printk("failed to map extent buffer! %lu\n",
-			       offset);
+		if (err)
 			return 1;
-		}
 		cur_len = min(len, map_len - (offset - map_start));
 		crc = btrfs_csum_data(root, kaddr + offset - map_start,
 				      crc, cur_len);
@@ -218,15 +192,14 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
 	btrfs_csum_final(crc, result);
 
 	if (verify) {
-		/* FIXME, this is not good */
 		if (memcmp_extent_buffer(buf, result, 0, csum_size)) {
 			u32 val;
 			u32 found = 0;
 			memcpy(&found, result, csum_size);
 
 			read_extent_buffer(buf, &val, 0, csum_size);
-			printk("btrfs: %s checksum verify failed on %llu "
-			       "wanted %X found %X level %d\n",
+			printk(KERN_INFO "btrfs: %s checksum verify failed "
+			       "on %llu wanted %X found %X level %d\n",
 			       root->fs_info->sb->s_id,
 			       buf->start, val, found, btrfs_header_level(buf));
 			if (result != (char *)&inline_result)
@@ -293,7 +266,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
 		if (!ret &&
 		    !verify_parent_transid(io_tree, eb, parent_transid))
 			return ret;
-printk("read extent buffer pages failed with ret %d mirror no %d\n", ret, mirror_num);
+
 		num_copies = btrfs_num_copies(&root->fs_info->mapping_tree,
 					      eb->start, eb->len);
 		if (num_copies == 1)
@@ -307,9 +280,10 @@ printk("read extent buffer pages failed with ret %d mirror no %d\n", ret, mirror
 }
 
 /*
- * checksum a dirty tree block before IO.  This has extra checks to make
- * sure we only fill in the checksum field in the first page of a multi-page block
+ * checksum a dirty tree block before IO.  This has extra checks to make sure
+ * we only fill in the checksum field in the first page of a multi-page block
  */
+
 static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
 {
 	struct extent_io_tree *tree;
@@ -327,28 +301,22 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
 	if (!page->private)
 		goto out;
 	len = page->private >> 2;
-	if (len == 0) {
-		WARN_ON(1);
-	}
+	WARN_ON(len == 0);
+
 	eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
 	ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE,
 					     btrfs_header_generation(eb));
 	BUG_ON(ret);
 	found_start = btrfs_header_bytenr(eb);
 	if (found_start != start) {
-		printk("warning: eb start incorrect %Lu buffer %Lu len %lu\n",
-		       start, found_start, len);
 		WARN_ON(1);
 		goto err;
 	}
 	if (eb->first_page != page) {
-		printk("bad first page %lu %lu\n", eb->first_page->index,
-		       page->index);
 		WARN_ON(1);
 		goto err;
 	}
 	if (!PageUptodate(page)) {
-		printk("csum not up to date page %lu\n", page->index);
 		WARN_ON(1);
 		goto err;
 	}
@@ -396,29 +364,30 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 		goto out;
 	if (!page->private)
 		goto out;
+
 	len = page->private >> 2;
-	if (len == 0) {
-		WARN_ON(1);
-	}
+	WARN_ON(len == 0);
+
 	eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
 
 	found_start = btrfs_header_bytenr(eb);
 	if (found_start != start) {
-		printk("bad tree block start %llu %llu\n",
+		printk(KERN_INFO "btrfs bad tree block start %llu %llu\n",
 		       (unsigned long long)found_start,
 		       (unsigned long long)eb->start);
 		ret = -EIO;
 		goto err;
 	}
 	if (eb->first_page != page) {
-		printk("bad first page %lu %lu\n", eb->first_page->index,
-		       page->index);
+		printk(KERN_INFO "btrfs bad first page %lu %lu\n",
+		       eb->first_page->index, page->index);
 		WARN_ON(1);
 		ret = -EIO;
 		goto err;
 	}
 	if (check_tree_block_fsid(root, eb)) {
-		printk("bad fsid on block %Lu\n", eb->start);
+		printk(KERN_INFO "btrfs bad fsid on block %llu\n",
+		       (unsigned long long)eb->start);
 		ret = -EIO;
 		goto err;
 	}
@@ -578,7 +547,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 			   HZ/10);
 	}
 #endif
-	while(atomic_read(&fs_info->async_submit_draining) &&
+	while (atomic_read(&fs_info->async_submit_draining) &&
 	      atomic_read(&fs_info->nr_async_submits)) {
 		wait_event(fs_info->async_submit_wait,
 			   (atomic_read(&fs_info->nr_async_submits) == 0));
@@ -594,7 +563,7 @@ static int btree_csum_one_bio(struct bio *bio)
 	struct btrfs_root *root;
 
 	WARN_ON(bio->bi_vcnt <= 0);
-	while(bio_index < bio->bi_vcnt) {
+	while (bio_index < bio->bi_vcnt) {
 		root = BTRFS_I(bvec->bv_page->mapping->host)->root;
 		csum_dirty_buffer(root, bvec->bv_page);
 		bio_index++;
@@ -680,9 +649,8 @@ static int btree_writepages(struct address_space *mapping,
 
 		num_dirty = count_range_bits(tree, &start, (u64)-1,
 					     thresh, EXTENT_DIRTY);
-		if (num_dirty < thresh) {
+		if (num_dirty < thresh)
 			return 0;
-		}
 	}
 	return extent_writepages(tree, mapping, btree_get_extent, wbc);
 }
@@ -701,15 +669,14 @@ static int btree_releasepage(struct page *page, gfp_t gfp_flags)
 	int ret;
 
 	if (PageWriteback(page) || PageDirty(page))
-	    return 0;
+		return 0;
 
 	tree = &BTRFS_I(page->mapping->host)->io_tree;
 	map = &BTRFS_I(page->mapping->host)->extent_tree;
 
 	ret = try_release_extent_state(map, tree, page, gfp_flags);
-	if (!ret) {
+	if (!ret)
 		return 0;
-	}
 
 	ret = try_release_extent_buffer(tree, page);
 	if (ret == 1) {
@@ -728,8 +695,8 @@ static void btree_invalidatepage(struct page *page, unsigned long offset)
 	extent_invalidatepage(tree, page, offset);
 	btree_releasepage(page, GFP_NOFS);
 	if (PagePrivate(page)) {
-		printk("warning page private not zero on page %Lu\n",
-		       page_offset(page));
+		printk(KERN_WARNING "btrfs warning page private not zero "
+		       "on page %llu\n", (unsigned long long)page_offset(page));
 		ClearPagePrivate(page);
 		set_page_private(page, 0);
 		page_cache_release(page);
@@ -813,7 +780,7 @@ int btrfs_write_tree_block(struct extent_buffer *buf)
 int btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
 {
 	return btrfs_wait_on_page_writeback_range(buf->first_page->mapping,
-				  buf->start, buf->start + buf->len -1);
+				  buf->start, buf->start + buf->len - 1);
 }
 
 struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
@@ -832,11 +799,10 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
 
 	ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
 
-	if (ret == 0) {
+	if (ret == 0)
 		buf->flags |= EXTENT_UPTODATE;
-	} else {
+	else
 		WARN_ON(1);
-	}
 	return buf;
 
 }
@@ -944,7 +910,7 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
 	if (!log_root_tree)
 		return 0;
 
-	while(1) {
+	while (1) {
 		ret = find_first_extent_bit(&log_root_tree->dirty_log_pages,
 				    0, &start, &end, EXTENT_DIRTY);
 		if (ret)
@@ -1165,24 +1131,6 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
 	root->in_sysfs = 1;
 	return root;
 }
-#if 0
-static int add_hasher(struct btrfs_fs_info *info, char *type) {
-	struct btrfs_hasher *hasher;
-
-	hasher = kmalloc(sizeof(*hasher), GFP_NOFS);
-	if (!hasher)
-		return -ENOMEM;
-	hasher->hash_tfm = crypto_alloc_hash(type, 0, CRYPTO_ALG_ASYNC);
-	if (!hasher->hash_tfm) {
-		kfree(hasher);
-		return -EINVAL;
-	}
-	spin_lock(&info->hash_lock);
-	list_add(&hasher->list, &info->hashers);
-	spin_unlock(&info->hash_lock);
-	return 0;
-}
-#endif
 
 static int btrfs_congested_fn(void *congested_data, int bdi_bits)
 {
@@ -1226,9 +1174,8 @@ static void __unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
 			continue;
 
 		bdi = blk_get_backing_dev_info(device->bdev);
-		if (bdi->unplug_io_fn) {
+		if (bdi->unplug_io_fn)
 			bdi->unplug_io_fn(bdi, page);
-		}
 	}
 }
 
@@ -1420,8 +1367,9 @@ static int transaction_kthread(void *arg)
 		mutex_lock(&root->fs_info->transaction_kthread_mutex);
 
 		if (root->fs_info->total_ref_cache_size > 20 * 1024 * 1024) {
-			printk("btrfs: total reference cache size %Lu\n",
-				root->fs_info->total_ref_cache_size);
+			printk(KERN_INFO "btrfs: total reference cache "
+			       "size %llu\n", (unsigned long long)
+			       root->fs_info->total_ref_cache_size);
 		}
 
 		mutex_lock(&root->fs_info->trans_mutex);
@@ -1592,14 +1540,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	atomic_set(&fs_info->tree_log_writers, 0);
 	fs_info->tree_log_transid = 0;
 
-#if 0
-	ret = add_hasher(fs_info, "crc32c");
-	if (ret) {
-		printk("btrfs: failed hash setup, modprobe cryptomgr?\n");
-		err = -ENOMEM;
-		goto fail_iput;
-	}
-#endif
 	__setup_root(4096, 4096, 4096, 4096, tree_root,
 		     fs_info, BTRFS_ROOT_TREE_OBJECTID);
 
@@ -1720,7 +1660,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 
 	if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC,
 		    sizeof(disk_super->magic))) {
-		printk("btrfs: valid FS not found on %s\n", sb->s_id);
+		printk(KERN_INFO "btrfs: valid FS not found on %s\n", sb->s_id);
 		goto fail_sb_buffer;
 	}
 
@@ -1728,8 +1668,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	ret = btrfs_read_sys_array(tree_root);
 	mutex_unlock(&fs_info->chunk_mutex);
 	if (ret) {
-		printk("btrfs: failed to read the system array on %s\n",
-		       sb->s_id);
+		printk(KERN_WARNING "btrfs: failed to read the system "
+		       "array on %s\n", sb->s_id);
 		goto fail_sys_array;
 	}
 
@@ -1746,14 +1686,15 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	BUG_ON(!chunk_root->node);
 
 	read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid,
-	         (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node),
-		 BTRFS_UUID_SIZE);
+	   (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node),
+	   BTRFS_UUID_SIZE);
 
 	mutex_lock(&fs_info->chunk_mutex);
 	ret = btrfs_read_chunk_tree(chunk_root);
 	mutex_unlock(&fs_info->chunk_mutex);
 	if (ret) {
-		printk("btrfs: failed to read chunk tree on %s\n", sb->s_id);
+		printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n",
+		       sb->s_id);
 		goto fail_chunk_root;
 	}
 
@@ -1812,7 +1753,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 		u64 bytenr = btrfs_super_log_root(disk_super);
 
 		if (fs_devices->rw_devices == 0) {
-			printk("Btrfs log replay required on RO media\n");
+			printk(KERN_WARNING "Btrfs log replay required "
+			       "on RO media\n");
 			err = -EIO;
 			goto fail_trans_kthread;
 		}
@@ -2097,7 +2039,8 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
 			total_errors++;
 	}
 	if (total_errors > max_errors) {
-		printk("btrfs: %d errors while writing supers\n", total_errors);
+		printk(KERN_ERR "btrfs: %d errors while writing supers\n",
+		       total_errors);
 		BUG();
 	}
 
@@ -2114,7 +2057,8 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
 			total_errors++;
 	}
 	if (total_errors > max_errors) {
-		printk("btrfs: %d errors while writing supers\n", total_errors);
+		printk(KERN_ERR "btrfs: %d errors while writing supers\n",
+		       total_errors);
 		BUG();
 	}
 	return 0;
@@ -2137,16 +2081,11 @@ int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
 		down_write(&root->anon_super.s_umount);
 		kill_anon_super(&root->anon_super);
 	}
-#if 0
-	if (root->in_sysfs)
-		btrfs_sysfs_del_root(root);
-#endif
 	if (root->node)
 		free_extent_buffer(root->node);
 	if (root->commit_root)
 		free_extent_buffer(root->commit_root);
-	if (root->name)
-		kfree(root->name);
+	kfree(root->name);
 	kfree(root);
 	return 0;
 }
@@ -2157,7 +2096,7 @@ static int del_fs_roots(struct btrfs_fs_info *fs_info)
 	struct btrfs_root *gang[8];
 	int i;
 
-	while(1) {
+	while (1) {
 		ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
 					     (void **)gang, 0,
 					     ARRAY_SIZE(gang));
@@ -2228,18 +2167,17 @@ int close_ctree(struct btrfs_root *root)
 
 	if (!(fs_info->sb->s_flags & MS_RDONLY)) {
 		ret =  btrfs_commit_super(root);
-		if (ret) {
-			printk("btrfs: commit super returns %d\n", ret);
-		}
+		if (ret)
+			printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
 	}
 
 	if (fs_info->delalloc_bytes) {
-		printk("btrfs: at unmount delalloc count %Lu\n",
-		       fs_info->delalloc_bytes);
+		printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n",
+		       (unsigned long long)fs_info->delalloc_bytes);
 	}
 	if (fs_info->total_ref_cache_size) {
-		printk("btrfs: at umount reference cache size %Lu\n",
-			fs_info->total_ref_cache_size);
+		printk(KERN_INFO "btrfs: at umount reference cache size %llu\n",
+		       (unsigned long long)fs_info->total_ref_cache_size);
 	}
 
 	if (fs_info->extent_root->node)
@@ -2248,13 +2186,13 @@ int close_ctree(struct btrfs_root *root)
 	if (fs_info->tree_root->node)
 		free_extent_buffer(fs_info->tree_root->node);
 
-	if (root->fs_info->chunk_root->node);
+	if (root->fs_info->chunk_root->node)
 		free_extent_buffer(root->fs_info->chunk_root->node);
 
-	if (root->fs_info->dev_root->node);
+	if (root->fs_info->dev_root->node)
 		free_extent_buffer(root->fs_info->dev_root->node);
 
-	if (root->fs_info->csum_root->node);
+	if (root->fs_info->csum_root->node)
 		free_extent_buffer(root->fs_info->csum_root->node);
 
 	btrfs_free_block_groups(root->fs_info);
@@ -2273,7 +2211,7 @@ int close_ctree(struct btrfs_root *root)
 	btrfs_stop_workers(&fs_info->submit_workers);
 
 #if 0
-	while(!list_empty(&fs_info->hashers)) {
+	while (!list_empty(&fs_info->hashers)) {
 		struct btrfs_hasher *hasher;
 		hasher = list_entry(fs_info->hashers.next, struct btrfs_hasher,
 				    hashers);
@@ -2324,9 +2262,11 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
 
 	WARN_ON(!btrfs_tree_locked(buf));
 	if (transid != root->fs_info->generation) {
-		printk(KERN_CRIT "transid mismatch buffer %llu, found %Lu running %Lu\n",
+		printk(KERN_CRIT "btrfs transid mismatch buffer %llu, "
+		       "found %llu running %llu\n",
 			(unsigned long long)buf->start,
-			transid, root->fs_info->generation);
+			(unsigned long long)transid,
+			(unsigned long long)root->fs_info->generation);
 		WARN_ON(1);
 	}
 	set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, buf);
@@ -2361,9 +2301,8 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
 	struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
 	int ret;
 	ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
-	if (ret == 0) {
+	if (ret == 0)
 		buf->flags |= EXTENT_UPTODATE;
-	}
 	return ret;
 }
 
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 48b82cd7583c..85315d2c90de 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -7,9 +7,11 @@
 #include "export.h"
 #include "compat.h"
 
-#define BTRFS_FID_SIZE_NON_CONNECTABLE		(offsetof(struct btrfs_fid, parent_objectid)/4)
-#define BTRFS_FID_SIZE_CONNECTABLE		(offsetof(struct btrfs_fid, parent_root_objectid)/4)
-#define BTRFS_FID_SIZE_CONNECTABLE_ROOT		(sizeof(struct btrfs_fid)/4)
+#define BTRFS_FID_SIZE_NON_CONNECTABLE (offsetof(struct btrfs_fid, \
+						 parent_objectid) / 4)
+#define BTRFS_FID_SIZE_CONNECTABLE (offsetof(struct btrfs_fid, \
+					     parent_root_objectid) / 4)
+#define BTRFS_FID_SIZE_CONNECTABLE_ROOT (sizeof(struct btrfs_fid) / 4)
 
 static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
 			   int connectable)
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 780c1eeb8299..ec43fa526d77 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -49,10 +49,10 @@ struct pending_extent_op {
 	int del;
 };
 
-static int finish_current_insert(struct btrfs_trans_handle *trans, struct
-				 btrfs_root *extent_root, int all);
-static int del_pending_extents(struct btrfs_trans_handle *trans, struct
-			       btrfs_root *extent_root, int all);
+static int finish_current_insert(struct btrfs_trans_handle *trans,
+				 struct btrfs_root *extent_root, int all);
+static int del_pending_extents(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *extent_root, int all);
 static int pin_down_bytes(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root,
 			  u64 bytenr, u64 num_bytes, int is_data);
@@ -247,7 +247,7 @@ static int cache_block_group(struct btrfs_root *root,
 	if (ret < 0)
 		goto err;
 
-	while(1) {
+	while (1) {
 		leaf = path->nodes[0];
 		slot = path->slots[0];
 		if (slot >= btrfs_header_nritems(leaf)) {
@@ -292,9 +292,8 @@ err:
 /*
  * return the block group that starts at or after bytenr
  */
-static struct btrfs_block_group_cache *btrfs_lookup_first_block_group(struct
-						       btrfs_fs_info *info,
-							 u64 bytenr)
+static struct btrfs_block_group_cache *
+btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
 {
 	struct btrfs_block_group_cache *cache;
 
@@ -306,9 +305,9 @@ static struct btrfs_block_group_cache *btrfs_lookup_first_block_group(struct
 /*
  * return the block group that contains teh given bytenr
  */
-struct btrfs_block_group_cache *btrfs_lookup_block_group(struct
-							 btrfs_fs_info *info,
-							 u64 bytenr)
+struct btrfs_block_group_cache *btrfs_lookup_block_group(
+						 struct btrfs_fs_info *info,
+						 u64 bytenr)
 {
 	struct btrfs_block_group_cache *cache;
 
@@ -492,7 +491,7 @@ int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
  * to the key objectid.
  */
 
-static int noinline lookup_extent_backref(struct btrfs_trans_handle *trans,
+static noinline int lookup_extent_backref(struct btrfs_trans_handle *trans,
 					  struct btrfs_root *root,
 					  struct btrfs_path *path,
 					  u64 bytenr, u64 parent,
@@ -537,7 +536,7 @@ out:
  * updates all the backrefs that are pending on update_list for the
  * extent_root
  */
-static int noinline update_backrefs(struct btrfs_trans_handle *trans,
+static noinline int update_backrefs(struct btrfs_trans_handle *trans,
 				    struct btrfs_root *extent_root,
 				    struct btrfs_path *path,
 				    struct list_head *update_list)
@@ -573,9 +572,11 @@ loop:
 	    btrfs_ref_generation(leaf, ref) != op->orig_generation ||
 	    (ref_objectid != op->level &&
 	     ref_objectid != BTRFS_MULTIPLE_OBJECTIDS)) {
-		printk(KERN_ERR "couldn't find %Lu, parent %Lu, root %Lu, "
-		       "owner %u\n", op->bytenr, op->orig_parent,
-		       ref_root, op->level);
+		printk(KERN_ERR "btrfs couldn't find %llu, parent %llu, "
+		       "root %llu, owner %u\n",
+		       (unsigned long long)op->bytenr,
+		       (unsigned long long)op->orig_parent,
+		       (unsigned long long)ref_root, op->level);
 		btrfs_print_leaf(extent_root, leaf);
 		BUG();
 	}
@@ -620,7 +621,7 @@ out:
 	return 0;
 }
 
-static int noinline insert_extents(struct btrfs_trans_handle *trans,
+static noinline int insert_extents(struct btrfs_trans_handle *trans,
 				   struct btrfs_root *extent_root,
 				   struct btrfs_path *path,
 				   struct list_head *insert_list, int nr)
@@ -781,7 +782,7 @@ static int noinline insert_extents(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
-static int noinline insert_extent_backref(struct btrfs_trans_handle *trans,
+static noinline int insert_extent_backref(struct btrfs_trans_handle *trans,
 					  struct btrfs_root *root,
 					  struct btrfs_path *path,
 					  u64 bytenr, u64 parent,
@@ -840,7 +841,7 @@ out:
 	return ret;
 }
 
-static int noinline remove_extent_backref(struct btrfs_trans_handle *trans,
+static noinline int remove_extent_backref(struct btrfs_trans_handle *trans,
 					  struct btrfs_root *root,
 					  struct btrfs_path *path)
 {
@@ -868,7 +869,7 @@ static int noinline remove_extent_backref(struct btrfs_trans_handle *trans,
 static void btrfs_issue_discard(struct block_device *bdev,
 				u64 start, u64 len)
 {
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,28)
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 28)
 	blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL);
 #else
 	blkdev_issue_discard(bdev, start >> 9, len >> 9);
@@ -908,7 +909,7 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
 #endif
 }
 
-static int noinline free_extents(struct btrfs_trans_handle *trans,
+static noinline int free_extents(struct btrfs_trans_handle *trans,
 				 struct btrfs_root *extent_root,
 				 struct list_head *del_list)
 {
@@ -937,10 +938,11 @@ search:
 				    extent_root->root_key.objectid,
 				    op->orig_generation, op->level, 1);
 	if (ret) {
-		printk("Unable to find backref byte nr %Lu root %Lu gen %Lu "
-		       "owner %u\n", op->bytenr,
-		       extent_root->root_key.objectid, op->orig_generation,
-		       op->level);
+		printk(KERN_ERR "btrfs unable to find backref byte nr %llu "
+		       "root %llu gen %llu owner %u\n",
+		       (unsigned long long)op->bytenr,
+		       (unsigned long long)extent_root->root_key.objectid,
+		       (unsigned long long)op->orig_generation, op->level);
 		btrfs_print_leaf(extent_root, path->nodes[0]);
 		WARN_ON(1);
 		goto out;
@@ -1282,7 +1284,9 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 	btrfs_item_key_to_cpu(l, &key, path->slots[0]);
 	if (key.objectid != bytenr) {
 		btrfs_print_leaf(root->fs_info->extent_root, path->nodes[0]);
-		printk("wanted %Lu found %Lu\n", bytenr, key.objectid);
+		printk(KERN_ERR "btrfs wanted %llu found %llu\n",
+		       (unsigned long long)bytenr,
+		       (unsigned long long)key.objectid);
 		BUG();
 	}
 	BUG_ON(key.type != BTRFS_EXTENT_ITEM_KEY);
@@ -1353,7 +1357,8 @@ int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
 		goto out;
 	if (ret != 0) {
 		btrfs_print_leaf(root, path->nodes[0]);
-		printk("failed to find block number %Lu\n", bytenr);
+		printk(KERN_INFO "btrfs failed to find block number %llu\n",
+		       (unsigned long long)bytenr);
 		BUG();
 	}
 	l = path->nodes[0];
@@ -1738,7 +1743,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 	if (!path)
 		return -ENOMEM;
 
-	while(1) {
+	while (1) {
 		cache = NULL;
 		spin_lock(&root->fs_info->block_group_cache_lock);
 		for (n = rb_first(&root->fs_info->block_group_cache_tree);
@@ -1921,10 +1926,8 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
 	spin_unlock(&space_info->lock);
 
 	ret = btrfs_alloc_chunk(trans, extent_root, flags);
-	if (ret) {
-printk("space info full %Lu\n", flags);
+	if (ret)
 		space_info->full = 1;
-	}
 out:
 	mutex_unlock(&extent_root->fs_info->chunk_mutex);
 	return ret;
@@ -1941,7 +1944,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 	u64 old_val;
 	u64 byte_in_group;
 
-	while(total) {
+	while (total) {
 		cache = btrfs_lookup_block_group(info, bytenr);
 		if (!cache)
 			return -1;
@@ -2089,7 +2092,7 @@ int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy)
 	int ret;
 
 	mutex_lock(&root->fs_info->pinned_mutex);
-	while(1) {
+	while (1) {
 		ret = find_first_extent_bit(pinned_extents, last,
 					    &start, &end, EXTENT_DIRTY);
 		if (ret)
@@ -2110,7 +2113,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 	int ret;
 
 	mutex_lock(&root->fs_info->pinned_mutex);
-	while(1) {
+	while (1) {
 		ret = find_first_extent_bit(unpin, 0, &start, &end,
 					    EXTENT_DIRTY);
 		if (ret)
@@ -2400,7 +2403,7 @@ static int __free_extent(struct btrfs_trans_handle *trans,
 	if (ret == 0) {
 		struct btrfs_key found_key;
 		extent_slot = path->slots[0];
-		while(extent_slot > 0) {
+		while (extent_slot > 0) {
 			extent_slot--;
 			btrfs_item_key_to_cpu(path->nodes[0], &found_key,
 					      extent_slot);
@@ -2422,8 +2425,8 @@ static int __free_extent(struct btrfs_trans_handle *trans,
 						&key, path, -1, 1);
 			if (ret) {
 				printk(KERN_ERR "umm, got %d back from search"
-				       ", was looking for %Lu\n", ret,
-				       bytenr);
+				       ", was looking for %llu\n", ret,
+				       (unsigned long long)bytenr);
 				btrfs_print_leaf(extent_root, path->nodes[0]);
 			}
 			BUG_ON(ret);
@@ -2432,9 +2435,12 @@ static int __free_extent(struct btrfs_trans_handle *trans,
 	} else {
 		btrfs_print_leaf(extent_root, path->nodes[0]);
 		WARN_ON(1);
-		printk("Unable to find ref byte nr %Lu root %Lu "
-		       "gen %Lu owner %Lu\n", bytenr,
-		       root_objectid, ref_generation, owner_objectid);
+		printk(KERN_ERR "btrfs unable to find ref byte nr %llu "
+		       "root %llu gen %llu owner %llu\n",
+		       (unsigned long long)bytenr,
+		       (unsigned long long)root_objectid,
+		       (unsigned long long)ref_generation,
+		       (unsigned long long)owner_objectid);
 	}
 
 	leaf = path->nodes[0];
@@ -2517,8 +2523,8 @@ static int __free_extent(struct btrfs_trans_handle *trans,
  * find all the blocks marked as pending in the radix tree and remove
  * them from the extent map
  */
-static int del_pending_extents(struct btrfs_trans_handle *trans, struct
-			       btrfs_root *extent_root, int all)
+static int del_pending_extents(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *extent_root, int all)
 {
 	int ret;
 	int err = 0;
@@ -2539,7 +2545,7 @@ static int del_pending_extents(struct btrfs_trans_handle *trans, struct
 
 again:
 	mutex_lock(&info->extent_ins_mutex);
-	while(1) {
+	while (1) {
 		ret = find_first_extent_bit(pending_del, search, &start, &end,
 					    EXTENT_WRITEBACK);
 		if (ret) {
@@ -2753,7 +2759,7 @@ static u64 stripe_align(struct btrfs_root *root, u64 val)
  * ins->offset == number of blocks
  * Any available blocks before search_start are skipped.
  */
-static int noinline find_free_extent(struct btrfs_trans_handle *trans,
+static noinline int find_free_extent(struct btrfs_trans_handle *trans,
 				     struct btrfs_root *orig_root,
 				     u64 num_bytes, u64 empty_size,
 				     u64 search_start, u64 search_end,
@@ -2762,7 +2768,7 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans,
 				     int data)
 {
 	int ret = 0;
-	struct btrfs_root * root = orig_root->fs_info->extent_root;
+	struct btrfs_root *root = orig_root->fs_info->extent_root;
 	u64 total_needed = num_bytes;
 	u64 *last_ptr = NULL;
 	u64 last_wanted = 0;
@@ -2995,8 +3001,10 @@ loop_check:
 			*last_ptr = ins->objectid + ins->offset;
 		ret = 0;
 	} else if (!ret) {
-		printk(KERN_ERR "we were searching for %Lu bytes, num_bytes %Lu,"
-		       " loop %d, allowed_alloc %d\n", total_needed, num_bytes,
+		printk(KERN_ERR "btrfs searching for %llu bytes, "
+		       "num_bytes %llu, loop %d, allowed_alloc %d\n",
+		       (unsigned long long)total_needed,
+		       (unsigned long long)num_bytes,
 		       loop, allowed_chunk_alloc);
 		ret = -ENOSPC;
 	}
@@ -3012,19 +3020,22 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes)
 	struct btrfs_block_group_cache *cache;
 	struct list_head *l;
 
-	printk(KERN_INFO "space_info has %Lu free, is %sfull\n",
-	       info->total_bytes - info->bytes_used - info->bytes_pinned -
-	       info->bytes_reserved, (info->full) ? "" : "not ");
+	printk(KERN_INFO "space_info has %llu free, is %sfull\n",
+	       (unsigned long long)(info->total_bytes - info->bytes_used -
+				    info->bytes_pinned - info->bytes_reserved),
+	       (info->full) ? "" : "not ");
 
 	down_read(&info->groups_sem);
 	list_for_each(l, &info->block_groups) {
 		cache = list_entry(l, struct btrfs_block_group_cache, list);
 		spin_lock(&cache->lock);
-		printk(KERN_INFO "block group %Lu has %Lu bytes, %Lu used "
-		       "%Lu pinned %Lu reserved\n",
-		       cache->key.objectid, cache->key.offset,
-		       btrfs_block_group_used(&cache->item),
-		       cache->pinned, cache->reserved);
+		printk(KERN_INFO "block group %llu has %llu bytes, %llu used "
+		       "%llu pinned %llu reserved\n",
+		       (unsigned long long)cache->key.objectid,
+		       (unsigned long long)cache->key.offset,
+		       (unsigned long long)btrfs_block_group_used(&cache->item),
+		       (unsigned long long)cache->pinned,
+		       (unsigned long long)cache->reserved);
 		btrfs_dump_free_space(cache, bytes);
 		spin_unlock(&cache->lock);
 	}
@@ -3045,15 +3056,15 @@ static int __btrfs_reserve_extent(struct btrfs_trans_handle *trans,
 
 	if (data) {
 		alloc_profile = info->avail_data_alloc_bits &
-			        info->data_alloc_profile;
+			info->data_alloc_profile;
 		data = BTRFS_BLOCK_GROUP_DATA | alloc_profile;
 	} else if (root == root->fs_info->chunk_root) {
 		alloc_profile = info->avail_system_alloc_bits &
-			        info->system_alloc_profile;
+			info->system_alloc_profile;
 		data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile;
 	} else {
 		alloc_profile = info->avail_metadata_alloc_bits &
-			        info->metadata_alloc_profile;
+			info->metadata_alloc_profile;
 		data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
 	}
 again:
@@ -3092,8 +3103,9 @@ again:
 		struct btrfs_space_info *sinfo;
 
 		sinfo = __find_space_info(root->fs_info, data);
-		printk("allocation failed flags %Lu, wanted %Lu\n",
-		       data, num_bytes);
+		printk(KERN_ERR "btrfs allocation failed flags %llu, "
+		       "wanted %llu\n", (unsigned long long)data,
+		       (unsigned long long)num_bytes);
 		dump_space_info(sinfo, num_bytes);
 		BUG();
 	}
@@ -3108,7 +3120,8 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
 
 	cache = btrfs_lookup_block_group(root->fs_info, start);
 	if (!cache) {
-		printk(KERN_ERR "Unable to find block group for %Lu\n", start);
+		printk(KERN_ERR "Unable to find block group for %llu\n",
+		       (unsigned long long)start);
 		return -ENOSPC;
 	}
 
@@ -3235,10 +3248,12 @@ static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
 	}
 
 update_block:
-	ret = update_block_group(trans, root, ins->objectid, ins->offset, 1, 0);
+	ret = update_block_group(trans, root, ins->objectid,
+				 ins->offset, 1, 0);
 	if (ret) {
-		printk("update block group failed for %Lu %Lu\n",
-		       ins->objectid, ins->offset);
+		printk(KERN_ERR "btrfs update block group failed for %llu "
+		       "%llu\n", (unsigned long long)ins->objectid,
+		       (unsigned long long)ins->offset);
 		BUG();
 	}
 out:
@@ -3420,7 +3435,7 @@ int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-static int noinline cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
+static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
 					struct btrfs_root *root,
 					struct btrfs_leaf_ref *ref)
 {
@@ -3445,15 +3460,15 @@ static int noinline cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-static int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start, u64 len,
-			      u32 *refs)
+static int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start,
+				     u64 len, u32 *refs)
 {
 	int ret;
 
 	ret = btrfs_lookup_extent_ref(NULL, root, start, len, refs);
 	BUG_ON(ret);
 
-#if 0 // some debugging code in case we see problems here
+#if 0 /* some debugging code in case we see problems here */
 	/* if the refs count is one, it won't get increased again.  But
 	 * if the ref count is > 1, someone may be decreasing it at
 	 * the same time we are.
@@ -3474,8 +3489,8 @@ static int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start, u64 len
 			free_extent_buffer(eb);
 		}
 		if (*refs == 1) {
-			printk("block %llu went down to one during drop_snap\n",
-			       (unsigned long long)start);
+			printk(KERN_ERR "btrfs block %llu went down to one "
+			       "during drop_snap\n", (unsigned long long)start);
 		}
 
 	}
@@ -3489,7 +3504,7 @@ static int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start, u64 len
  * helper function for drop_snapshot, this walks down the tree dropping ref
  * counts as it goes.
  */
-static int noinline walk_down_tree(struct btrfs_trans_handle *trans,
+static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
 				   struct btrfs_root *root,
 				   struct btrfs_path *path, int *level)
 {
@@ -3516,7 +3531,7 @@ static int noinline walk_down_tree(struct btrfs_trans_handle *trans,
 	/*
 	 * walk down to the last node level and free all the leaves
 	 */
-	while(*level >= 0) {
+	while (*level >= 0) {
 		WARN_ON(*level < 0);
 		WARN_ON(*level >= BTRFS_MAX_LEVEL);
 		cur = path->nodes[*level];
@@ -3576,10 +3591,6 @@ static int noinline walk_down_tree(struct btrfs_trans_handle *trans,
 				*level = 0;
 				break;
 			}
-			if (printk_ratelimit()) {
-				printk("leaf ref miss for bytenr %llu\n",
-				       (unsigned long long)bytenr);
-			}
 		}
 		next = btrfs_find_tree_block(root, bytenr, blocksize);
 		if (!next || !btrfs_buffer_uptodate(next, ptr_gen)) {
@@ -3641,7 +3652,7 @@ out:
  * walk_down_tree. The main difference is that it checks reference
  * counts while tree blocks are locked.
  */
-static int noinline walk_down_subtree(struct btrfs_trans_handle *trans,
+static noinline int walk_down_subtree(struct btrfs_trans_handle *trans,
 				      struct btrfs_root *root,
 				      struct btrfs_path *path, int *level)
 {
@@ -3730,7 +3741,7 @@ out:
  * to find the first node higher up where we haven't yet gone through
  * all the slots
  */
-static int noinline walk_up_tree(struct btrfs_trans_handle *trans,
+static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
 				 struct btrfs_root *root,
 				 struct btrfs_path *path,
 				 int *level, int max_level)
@@ -3839,7 +3850,7 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
 			}
 		}
 	}
-	while(1) {
+	while (1) {
 		wret = walk_down_tree(trans, root, path, &level);
 		if (wret > 0)
 			break;
@@ -3920,7 +3931,7 @@ static unsigned long calc_ra(unsigned long start, unsigned long last,
 	return min(last, start + nr - 1);
 }
 
-static int noinline relocate_inode_pages(struct inode *inode, u64 start,
+static noinline int relocate_inode_pages(struct inode *inode, u64 start,
 					 u64 len)
 {
 	u64 page_start;
@@ -4011,7 +4022,7 @@ out_unlock:
 	return ret;
 }
 
-static int noinline relocate_data_extent(struct inode *reloc_inode,
+static noinline int relocate_data_extent(struct inode *reloc_inode,
 					 struct btrfs_key *extent_key,
 					 u64 offset)
 {
@@ -4087,7 +4098,7 @@ static int is_cowonly_root(u64 root_objectid)
 	return 0;
 }
 
-static int noinline __next_ref_path(struct btrfs_trans_handle *trans,
+static noinline int __next_ref_path(struct btrfs_trans_handle *trans,
 				    struct btrfs_root *extent_root,
 				    struct btrfs_ref_path *ref_path,
 				    int first_time)
@@ -4119,11 +4130,10 @@ walk_down:
 		if (level < ref_path->lowest_level)
 			break;
 
-		if (level >= 0) {
+		if (level >= 0)
 			bytenr = ref_path->nodes[level];
-		} else {
+		else
 			bytenr = ref_path->extent_start;
-		}
 		BUG_ON(bytenr == 0);
 
 		parent = ref_path->nodes[level + 1];
@@ -4170,11 +4180,12 @@ walk_up:
 	level = ref_path->current_level;
 	while (level < BTRFS_MAX_LEVEL - 1) {
 		u64 ref_objectid;
-		if (level >= 0) {
+
+		if (level >= 0)
 			bytenr = ref_path->nodes[level];
-		} else {
+		else
 			bytenr = ref_path->extent_start;
-		}
+
 		BUG_ON(bytenr == 0);
 
 		key.objectid = bytenr;
@@ -4299,7 +4310,7 @@ static int btrfs_next_ref_path(struct btrfs_trans_handle *trans,
 	return __next_ref_path(trans, extent_root, ref_path, 0);
 }
 
-static int noinline get_new_locations(struct inode *reloc_inode,
+static noinline int get_new_locations(struct inode *reloc_inode,
 				      struct btrfs_key *extent_key,
 				      u64 offset, int no_fragment,
 				      struct disk_extent **extents,
@@ -4420,7 +4431,7 @@ out:
 	return ret;
 }
 
-static int noinline replace_one_extent(struct btrfs_trans_handle *trans,
+static noinline int replace_one_extent(struct btrfs_trans_handle *trans,
 					struct btrfs_root *root,
 					struct btrfs_path *path,
 					struct btrfs_key *extent_key,
@@ -4778,7 +4789,7 @@ int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-static int noinline invalidate_extent_cache(struct btrfs_root *root,
+static noinline int invalidate_extent_cache(struct btrfs_root *root,
 					struct extent_buffer *leaf,
 					struct btrfs_block_group_cache *group,
 					struct btrfs_root *target_root)
@@ -4826,7 +4837,7 @@ static int noinline invalidate_extent_cache(struct btrfs_root *root,
 	return 0;
 }
 
-static int noinline replace_extents_in_leaf(struct btrfs_trans_handle *trans,
+static noinline int replace_extents_in_leaf(struct btrfs_trans_handle *trans,
 					struct btrfs_root *root,
 					struct extent_buffer *leaf,
 					struct btrfs_block_group_cache *group,
@@ -5035,7 +5046,7 @@ int btrfs_cleanup_reloc_trees(struct btrfs_root *root)
 	return 0;
 }
 
-static int noinline init_reloc_tree(struct btrfs_trans_handle *trans,
+static noinline int init_reloc_tree(struct btrfs_trans_handle *trans,
 				    struct btrfs_root *root)
 {
 	struct btrfs_root *reloc_root;
@@ -5102,7 +5113,7 @@ static int noinline init_reloc_tree(struct btrfs_trans_handle *trans,
  * tree blocks are shared between reloc trees, so they are also shared
  * between subvols.
  */
-static int noinline relocate_one_path(struct btrfs_trans_handle *trans,
+static noinline int relocate_one_path(struct btrfs_trans_handle *trans,
 				      struct btrfs_root *root,
 				      struct btrfs_path *path,
 				      struct btrfs_key *first_key,
@@ -5199,7 +5210,7 @@ static int noinline relocate_one_path(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-static int noinline relocate_tree_block(struct btrfs_trans_handle *trans,
+static noinline int relocate_tree_block(struct btrfs_trans_handle *trans,
 					struct btrfs_root *root,
 					struct btrfs_path *path,
 					struct btrfs_key *first_key,
@@ -5217,7 +5228,7 @@ static int noinline relocate_tree_block(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-static int noinline del_extent_zero(struct btrfs_trans_handle *trans,
+static noinline int del_extent_zero(struct btrfs_trans_handle *trans,
 				    struct btrfs_root *extent_root,
 				    struct btrfs_path *path,
 				    struct btrfs_key *extent_key)
@@ -5233,7 +5244,7 @@ out:
 	return ret;
 }
 
-static struct btrfs_root noinline *read_ref_root(struct btrfs_fs_info *fs_info,
+static noinline struct btrfs_root *read_ref_root(struct btrfs_fs_info *fs_info,
 						struct btrfs_ref_path *ref_path)
 {
 	struct btrfs_key root_key;
@@ -5248,7 +5259,7 @@ static struct btrfs_root noinline *read_ref_root(struct btrfs_fs_info *fs_info,
 	return btrfs_read_fs_root_no_name(fs_info, &root_key);
 }
 
-static int noinline relocate_one_extent(struct btrfs_root *extent_root,
+static noinline int relocate_one_extent(struct btrfs_root *extent_root,
 					struct btrfs_path *path,
 					struct btrfs_key *extent_key,
 					struct btrfs_block_group_cache *group,
@@ -5276,8 +5287,8 @@ static int noinline relocate_one_extent(struct btrfs_root *extent_root,
 
 	ref_path = kmalloc(sizeof(*ref_path), GFP_NOFS);
 	if (!ref_path) {
-	       ret = -ENOMEM;
-	       goto out;
+		ret = -ENOMEM;
+		goto out;
 	}
 
 	for (loops = 0; ; loops++) {
@@ -5497,7 +5508,7 @@ out:
 	return ret;
 }
 
-static struct inode noinline *create_reloc_inode(struct btrfs_fs_info *fs_info,
+static noinline struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
 					struct btrfs_block_group_cache *group)
 {
 	struct inode *inode = NULL;
@@ -5617,7 +5628,7 @@ int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start)
 	block_group = btrfs_lookup_block_group(info, group_start);
 	BUG_ON(!block_group);
 
-	printk("btrfs relocating block group %llu flags %llu\n",
+	printk(KERN_INFO "btrfs relocating block group %llu flags %llu\n",
 	       (unsigned long long)block_group->key.objectid,
 	       (unsigned long long)block_group->flags);
 
@@ -5649,7 +5660,7 @@ again:
 	btrfs_remove_leaf_refs(info->tree_root, (u64)-1, 1);
 	mutex_unlock(&root->fs_info->cleaner_mutex);
 
-	while(1) {
+	while (1) {
 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 		if (ret < 0)
 			goto out;
@@ -5712,7 +5723,7 @@ next:
 	}
 
 	if (total_found > 0) {
-		printk("btrfs found %llu extents in pass %d\n",
+		printk(KERN_INFO "btrfs found %llu extents in pass %d\n",
 		       (unsigned long long)total_found, pass);
 		pass++;
 		if (total_found == skipped && pass > 2) {
@@ -5754,7 +5765,7 @@ static int find_first_block_group(struct btrfs_root *root,
 	if (ret < 0)
 		goto out;
 
-	while(1) {
+	while (1) {
 		slot = path->slots[0];
 		leaf = path->nodes[0];
 		if (slot >= btrfs_header_nritems(leaf)) {
@@ -5825,7 +5836,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 	if (!path)
 		return -ENOMEM;
 
-	while(1) {
+	while (1) {
 		ret = find_first_block_group(root, path, &key);
 		if (ret > 0) {
 			ret = 0;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 0bf7684207aa..39edb551dca6 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -32,7 +32,7 @@ static LIST_HEAD(states);
 
 #define LEAK_DEBUG 0
 #ifdef LEAK_DEBUG
-static spinlock_t leak_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(leak_lock);
 #endif
 
 #define BUFFER_LRU_MAX 64
@@ -81,7 +81,11 @@ void extent_io_exit(void)
 
 	while (!list_empty(&states)) {
 		state = list_entry(states.next, struct extent_state, leak_list);
-		printk("state leak: start %Lu end %Lu state %lu in tree %p refs %d\n", state->start, state->end, state->state, state->tree, atomic_read(&state->refs));
+		printk(KERN_ERR "btrfs state leak: start %llu end %llu "
+		       "state %lu in tree %p refs %d\n",
+		       (unsigned long long)state->start,
+		       (unsigned long long)state->end,
+		       state->state, state->tree, atomic_read(&state->refs));
 		list_del(&state->leak_list);
 		kmem_cache_free(extent_state_cache, state);
 
@@ -89,7 +93,9 @@ void extent_io_exit(void)
 
 	while (!list_empty(&buffers)) {
 		eb = list_entry(buffers.next, struct extent_buffer, leak_list);
-		printk("buffer leak start %Lu len %lu refs %d\n", eb->start, eb->len, atomic_read(&eb->refs));
+		printk(KERN_ERR "btrfs buffer leak start %llu len %lu "
+		       "refs %d\n", (unsigned long long)eb->start,
+		       eb->len, atomic_read(&eb->refs));
 		list_del(&eb->leak_list);
 		kmem_cache_free(extent_buffer_cache, eb);
 	}
@@ -158,11 +164,11 @@ EXPORT_SYMBOL(free_extent_state);
 static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
 				   struct rb_node *node)
 {
-	struct rb_node ** p = &root->rb_node;
-	struct rb_node * parent = NULL;
+	struct rb_node **p = &root->rb_node;
+	struct rb_node *parent = NULL;
 	struct tree_entry *entry;
 
-	while(*p) {
+	while (*p) {
 		parent = *p;
 		entry = rb_entry(parent, struct tree_entry, rb_node);
 
@@ -185,13 +191,13 @@ static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
 				     struct rb_node **next_ret)
 {
 	struct rb_root *root = &tree->state;
-	struct rb_node * n = root->rb_node;
+	struct rb_node *n = root->rb_node;
 	struct rb_node *prev = NULL;
 	struct rb_node *orig_prev = NULL;
 	struct tree_entry *entry;
 	struct tree_entry *prev_entry = NULL;
 
-	while(n) {
+	while (n) {
 		entry = rb_entry(n, struct tree_entry, rb_node);
 		prev = n;
 		prev_entry = entry;
@@ -200,14 +206,13 @@ static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
 			n = n->rb_left;
 		else if (offset > entry->end)
 			n = n->rb_right;
-		else {
+		else
 			return n;
-		}
 	}
 
 	if (prev_ret) {
 		orig_prev = prev;
-		while(prev && offset > prev_entry->end) {
+		while (prev && offset > prev_entry->end) {
 			prev = rb_next(prev);
 			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
 		}
@@ -217,7 +222,7 @@ static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
 
 	if (next_ret) {
 		prev_entry = rb_entry(prev, struct tree_entry, rb_node);
-		while(prev && offset < prev_entry->start) {
+		while (prev && offset < prev_entry->start) {
 			prev = rb_prev(prev);
 			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
 		}
@@ -233,9 +238,8 @@ static inline struct rb_node *tree_search(struct extent_io_tree *tree,
 	struct rb_node *ret;
 
 	ret = __etree_search(tree, offset, &prev, NULL);
-	if (!ret) {
+	if (!ret)
 		return prev;
-	}
 	return ret;
 }
 
@@ -243,11 +247,11 @@ static struct extent_buffer *buffer_tree_insert(struct extent_io_tree *tree,
 					  u64 offset, struct rb_node *node)
 {
 	struct rb_root *root = &tree->buffer;
-	struct rb_node ** p = &root->rb_node;
-	struct rb_node * parent = NULL;
+	struct rb_node **p = &root->rb_node;
+	struct rb_node *parent = NULL;
 	struct extent_buffer *eb;
 
-	while(*p) {
+	while (*p) {
 		parent = *p;
 		eb = rb_entry(parent, struct extent_buffer, rb_node);
 
@@ -268,10 +272,10 @@ static struct extent_buffer *buffer_search(struct extent_io_tree *tree,
 					   u64 offset)
 {
 	struct rb_root *root = &tree->buffer;
-	struct rb_node * n = root->rb_node;
+	struct rb_node *n = root->rb_node;
 	struct extent_buffer *eb;
 
-	while(n) {
+	while (n) {
 		eb = rb_entry(n, struct extent_buffer, rb_node);
 		if (offset < eb->start)
 			n = n->rb_left;
@@ -363,7 +367,9 @@ static int insert_state(struct extent_io_tree *tree,
 	struct rb_node *node;
 
 	if (end < start) {
-		printk("end < start %Lu %Lu\n", end, start);
+		printk(KERN_ERR "btrfs end < start %llu %llu\n",
+		       (unsigned long long)end,
+		       (unsigned long long)start);
 		WARN_ON(1);
 	}
 	if (bits & EXTENT_DIRTY)
@@ -376,7 +382,10 @@ static int insert_state(struct extent_io_tree *tree,
 	if (node) {
 		struct extent_state *found;
 		found = rb_entry(node, struct extent_state, rb_node);
-		printk("found node %Lu %Lu on insert of %Lu %Lu\n", found->start, found->end, start, end);
+		printk(KERN_ERR "btrfs found node %llu %llu on insert of "
+		       "%llu %llu\n", (unsigned long long)found->start,
+		       (unsigned long long)found->end,
+		       (unsigned long long)start, (unsigned long long)end);
 		free_extent_state(state);
 		return -EEXIST;
 	}
@@ -412,7 +421,6 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
 	if (node) {
 		struct extent_state *found;
 		found = rb_entry(node, struct extent_state, rb_node);
-		printk("found node %Lu %Lu on insert of %Lu %Lu\n", found->start, found->end, prealloc->start, prealloc->end);
 		free_extent_state(prealloc);
 		return -EEXIST;
 	}
@@ -661,8 +669,9 @@ static void set_state_bits(struct extent_io_tree *tree,
  * [start, end] is inclusive
  * This takes the tree lock.
  */
-static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits,
-		   int exclusive, u64 *failed_start, gfp_t mask)
+static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+			  int bits, int exclusive, u64 *failed_start,
+			  gfp_t mask)
 {
 	struct extent_state *state;
 	struct extent_state *prealloc = NULL;
@@ -763,7 +772,7 @@ again:
 		if (end < last_start)
 			this_end = end;
 		else
-			this_end = last_start -1;
+			this_end = last_start - 1;
 		err = insert_state(tree, prealloc, start, this_end,
 				   bits);
 		prealloc = NULL;
@@ -891,8 +900,8 @@ int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
 }
 EXPORT_SYMBOL(set_extent_uptodate);
 
-static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
-			  gfp_t mask)
+static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
+				 u64 end, gfp_t mask)
 {
 	return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, mask);
 }
@@ -904,8 +913,8 @@ static int set_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end,
 			      0, NULL, mask);
 }
 
-static int clear_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end,
-			   gfp_t mask)
+static int clear_extent_writeback(struct extent_io_tree *tree, u64 start,
+				  u64 end, gfp_t mask)
 {
 	return clear_extent_bit(tree, start, end, EXTENT_WRITEBACK, 1, 0, mask);
 }
@@ -1025,11 +1034,10 @@ int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
 	 * our range starts.
 	 */
 	node = tree_search(tree, start);
-	if (!node) {
+	if (!node)
 		goto out;
-	}
 
-	while(1) {
+	while (1) {
 		state = rb_entry(node, struct extent_state, rb_node);
 		if (state->end >= start && (state->state & bits)) {
 			*start_ret = state->start;
@@ -1062,15 +1070,14 @@ struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
 	 * our range starts.
 	 */
 	node = tree_search(tree, start);
-	if (!node) {
+	if (!node)
 		goto out;
-	}
 
-	while(1) {
+	while (1) {
 		state = rb_entry(node, struct extent_state, rb_node);
-		if (state->end >= start && (state->state & bits)) {
+		if (state->end >= start && (state->state & bits))
 			return state;
-		}
+
 		node = rb_next(node);
 		if (!node)
 			break;
@@ -1108,7 +1115,7 @@ static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
 		goto out;
 	}
 
-	while(1) {
+	while (1) {
 		state = rb_entry(node, struct extent_state, rb_node);
 		if (found && (state->start != cur_start ||
 			      (state->state & EXTENT_BOUNDARY))) {
@@ -1150,7 +1157,7 @@ static noinline int __unlock_for_delalloc(struct inode *inode,
 	if (index == locked_page->index && end_index == index)
 		return 0;
 
-	while(nr_pages > 0) {
+	while (nr_pages > 0) {
 		ret = find_get_pages_contig(inode->i_mapping, index,
 				     min_t(unsigned long, nr_pages,
 				     ARRAY_SIZE(pages)), pages);
@@ -1186,7 +1193,7 @@ static noinline int lock_delalloc_pages(struct inode *inode,
 
 	/* skip the page at the start index */
 	nrpages = end_index - index + 1;
-	while(nrpages > 0) {
+	while (nrpages > 0) {
 		ret = find_get_pages_contig(inode->i_mapping, index,
 				     min_t(unsigned long,
 				     nrpages, ARRAY_SIZE(pages)), pages);
@@ -1263,17 +1270,16 @@ again:
 	 * pages in order, so we can't process delalloc bytes before
 	 * locked_page
 	 */
-	if (delalloc_start < *start) {
+	if (delalloc_start < *start)
 		delalloc_start = *start;
-	}
 
 	/*
 	 * make sure to limit the number of pages we try to lock down
 	 * if we're looping.
 	 */
-	if (delalloc_end + 1 - delalloc_start > max_bytes && loops) {
+	if (delalloc_end + 1 - delalloc_start > max_bytes && loops)
 		delalloc_end = delalloc_start + PAGE_CACHE_SIZE - 1;
-	}
+
 	/* step two, lock all the pages after the page that has start */
 	ret = lock_delalloc_pages(inode, locked_page,
 				  delalloc_start, delalloc_end);
@@ -1341,7 +1347,7 @@ int extent_clear_unlock_delalloc(struct inode *inode,
 	if (!(unlock_pages || clear_dirty || set_writeback || end_writeback))
 		return 0;
 
-	while(nr_pages > 0) {
+	while (nr_pages > 0) {
 		ret = find_get_pages_contig(inode->i_mapping, index,
 				     min_t(unsigned long,
 				     nr_pages, ARRAY_SIZE(pages)), pages);
@@ -1384,7 +1390,6 @@ u64 count_range_bits(struct extent_io_tree *tree,
 	int found = 0;
 
 	if (search_end <= cur_start) {
-		printk("search_end %Lu start %Lu\n", search_end, cur_start);
 		WARN_ON(1);
 		return 0;
 	}
@@ -1399,11 +1404,10 @@ u64 count_range_bits(struct extent_io_tree *tree,
 	 * our range starts.
 	 */
 	node = tree_search(tree, cur_start);
-	if (!node) {
+	if (!node)
 		goto out;
-	}
 
-	while(1) {
+	while (1) {
 		state = rb_entry(node, struct extent_state, rb_node);
 		if (state->start > search_end)
 			break;
@@ -1927,19 +1931,15 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
 		nr = bio_get_nr_vecs(bdev);
 
 	bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
-	if (!bio) {
-		printk("failed to allocate bio nr %d\n", nr);
-	}
 
 	bio_add_page(bio, page, page_size, offset);
 	bio->bi_end_io = end_io_func;
 	bio->bi_private = tree;
 
-	if (bio_ret) {
+	if (bio_ret)
 		*bio_ret = bio;
-	} else {
+	else
 		ret = submit_one_bio(rw, bio, mirror_num, bio_flags);
-	}
 
 	return ret;
 }
@@ -2028,13 +2028,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
 			break;
 		}
 		extent_offset = cur - em->start;
-		if (extent_map_end(em) <= cur) {
-printk("bad mapping em [%Lu %Lu] cur %Lu\n", em->start, extent_map_end(em), cur);
-		}
 		BUG_ON(extent_map_end(em) <= cur);
-		if (end < cur) {
-printk("2bad mapping end %Lu cur %Lu\n", end, cur);
-		}
 		BUG_ON(end < cur);
 
 		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
@@ -2199,7 +2193,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 	delalloc_end = 0;
 	page_started = 0;
 	if (!epd->extent_locked) {
-		while(delalloc_end < page_end) {
+		while (delalloc_end < page_end) {
 			nr_delalloc = find_lock_delalloc_range(inode, tree,
 						       page,
 						       &delalloc_start,
@@ -2242,9 +2236,8 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 	nr_written++;
 
 	end = page_end;
-	if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) {
-		printk("found delalloc bits after lock_extent\n");
-	}
+	if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0))
+		printk(KERN_ERR "btrfs delalloc bits after lock_extent\n");
 
 	if (last_byte <= start) {
 		clear_extent_dirty(tree, start, page_end, GFP_NOFS);
@@ -2297,7 +2290,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 			clear_extent_dirty(tree, cur,
 					   cur + iosize - 1, GFP_NOFS);
 
-			unlock_extent(tree, unlock_start, cur + iosize -1,
+			unlock_extent(tree, unlock_start, cur + iosize - 1,
 				      GFP_NOFS);
 
 			/*
@@ -2344,9 +2337,9 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 
 			set_range_writeback(tree, cur, cur + iosize - 1);
 			if (!PageWriteback(page)) {
-				printk("warning page %lu not writeback, "
-				       "cur %llu end %llu\n", page->index,
-				       (unsigned long long)cur,
+				printk(KERN_ERR "btrfs warning page %lu not "
+				       "writeback, cur %llu end %llu\n",
+				       page->index, (unsigned long long)cur,
 				       (unsigned long long)end);
 			}
 
@@ -2430,8 +2423,8 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
 retry:
 	while (!done && (index <= end) &&
 	       (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
-					      PAGECACHE_TAG_DIRTY,
-					      min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
+			      PAGECACHE_TAG_DIRTY, min(end - index,
+				  (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
 		unsigned i;
 
 		scanned = 1;
@@ -2536,9 +2529,8 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
 
 	extent_write_cache_pages(tree, mapping, &wbc_writepages,
 				 __extent_writepage, &epd, flush_write_bio);
-	if (epd.bio) {
+	if (epd.bio)
 		submit_one_bio(WRITE, epd.bio, 0, 0);
-	}
 	return ret;
 }
 EXPORT_SYMBOL(extent_write_full_page);
@@ -2568,7 +2560,7 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
 		.range_end	= end + 1,
 	};
 
-	while(start <= end) {
+	while (start <= end) {
 		page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
 		if (clear_page_dirty_for_io(page))
 			ret = __extent_writepage(page, &wbc_writepages, &epd);
@@ -2606,9 +2598,8 @@ int extent_writepages(struct extent_io_tree *tree,
 	ret = extent_write_cache_pages(tree, mapping, wbc,
 				       __extent_writepage, &epd,
 				       flush_write_bio);
-	if (epd.bio) {
+	if (epd.bio)
 		submit_one_bio(WRITE, epd.bio, 0, 0);
-	}
 	return ret;
 }
 EXPORT_SYMBOL(extent_writepages);
@@ -2666,7 +2657,7 @@ int extent_invalidatepage(struct extent_io_tree *tree,
 	u64 end = start + PAGE_CACHE_SIZE - 1;
 	size_t blocksize = page->mapping->host->i_sb->s_blocksize;
 
-	start += (offset + blocksize -1) & ~(blocksize - 1);
+	start += (offset + blocksize - 1) & ~(blocksize - 1);
 	if (start > end)
 		return 0;
 
@@ -2727,12 +2718,12 @@ int extent_prepare_write(struct extent_io_tree *tree,
 	orig_block_start = block_start;
 
 	lock_extent(tree, page_start, page_end, GFP_NOFS);
-	while(block_start <= block_end) {
+	while (block_start <= block_end) {
 		em = get_extent(inode, page, page_offset, block_start,
 				block_end - block_start + 1, 1);
-		if (IS_ERR(em) || !em) {
+		if (IS_ERR(em) || !em)
 			goto err;
-		}
+
 		cur_end = min(block_end, extent_map_end(em) - 1);
 		block_off_start = block_start & (PAGE_CACHE_SIZE - 1);
 		block_off_end = block_off_start + blocksize;
@@ -3170,7 +3161,7 @@ int set_extent_buffer_dirty(struct extent_io_tree *tree,
 		}
 		__set_page_dirty_nobuffers(extent_buffer_page(eb, i));
 		set_extent_dirty(tree, page_offset(page),
-				 page_offset(page) + PAGE_CACHE_SIZE -1,
+				 page_offset(page) + PAGE_CACHE_SIZE - 1,
 				 GFP_NOFS);
 		unlock_page(page);
 	}
@@ -3235,7 +3226,7 @@ int extent_range_uptodate(struct extent_io_tree *tree,
 	ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1);
 	if (ret)
 		return 1;
-	while(start <= end) {
+	while (start <= end) {
 		index = start >> PAGE_CACHE_SHIFT;
 		page = find_get_page(tree->mapping, index);
 		uptodate = PageUptodate(page);
@@ -3321,16 +3312,12 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
 			lock_page(page);
 		}
 		locked_pages++;
-		if (!PageUptodate(page)) {
+		if (!PageUptodate(page))
 			all_uptodate = 0;
-		}
 	}
 	if (all_uptodate) {
 		if (start_i == 0)
 			eb->flags |= EXTENT_UPTODATE;
-		if (ret) {
-			printk("all up to date but ret is %d\n", ret);
-		}
 		goto unlock_exit;
 	}
 
@@ -3345,10 +3332,8 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
 			err = __extent_read_full_page(tree, page,
 						      get_extent, &bio,
 						      mirror_num, &bio_flags);
-			if (err) {
+			if (err)
 				ret = err;
-				printk("err %d from __extent_read_full_page\n", ret);
-			}
 		} else {
 			unlock_page(page);
 		}
@@ -3357,26 +3342,23 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
 	if (bio)
 		submit_one_bio(READ, bio, mirror_num, bio_flags);
 
-	if (ret || !wait) {
-		if (ret)
-			printk("ret %d wait %d returning\n", ret, wait);
+	if (ret || !wait)
 		return ret;
-	}
+
 	for (i = start_i; i < num_pages; i++) {
 		page = extent_buffer_page(eb, i);
 		wait_on_page_locked(page);
-		if (!PageUptodate(page)) {
-			printk("page not uptodate after wait_on_page_locked\n");
+		if (!PageUptodate(page))
 			ret = -EIO;
-		}
 	}
+
 	if (!ret)
 		eb->flags |= EXTENT_UPTODATE;
 	return ret;
 
 unlock_exit:
 	i = start_i;
-	while(locked_pages > 0) {
+	while (locked_pages > 0) {
 		page = extent_buffer_page(eb, i);
 		i++;
 		unlock_page(page);
@@ -3403,7 +3385,7 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv,
 
 	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
 
-	while(len > 0) {
+	while (len > 0) {
 		page = extent_buffer_page(eb, i);
 
 		cur = min(len, (PAGE_CACHE_SIZE - offset));
@@ -3442,8 +3424,11 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
 		offset = 0;
 		*map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset;
 	}
+
 	if (start + min_len > eb->len) {
-printk("bad mapping eb start %Lu len %lu, wanted %lu %lu\n", eb->start, eb->len, start, min_len);
+		printk(KERN_ERR "btrfs bad mapping eb start %llu len %lu, "
+		       "wanted %lu %lu\n", (unsigned long long)eb->start,
+		       eb->len, start, min_len);
 		WARN_ON(1);
 	}
 
@@ -3506,7 +3491,7 @@ int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
 
 	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
 
-	while(len > 0) {
+	while (len > 0) {
 		page = extent_buffer_page(eb, i);
 
 		cur = min(len, (PAGE_CACHE_SIZE - offset));
@@ -3542,7 +3527,7 @@ void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
 
 	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
 
-	while(len > 0) {
+	while (len > 0) {
 		page = extent_buffer_page(eb, i);
 		WARN_ON(!PageUptodate(page));
 
@@ -3574,7 +3559,7 @@ void memset_extent_buffer(struct extent_buffer *eb, char c,
 
 	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
 
-	while(len > 0) {
+	while (len > 0) {
 		page = extent_buffer_page(eb, i);
 		WARN_ON(!PageUptodate(page));
 
@@ -3607,7 +3592,7 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
 	offset = (start_offset + dst_offset) &
 		((unsigned long)PAGE_CACHE_SIZE - 1);
 
-	while(len > 0) {
+	while (len > 0) {
 		page = extent_buffer_page(dst, i);
 		WARN_ON(!PageUptodate(page));
 
@@ -3674,17 +3659,17 @@ void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
 	unsigned long src_i;
 
 	if (src_offset + len > dst->len) {
-		printk("memmove bogus src_offset %lu move len %lu len %lu\n",
-		       src_offset, len, dst->len);
+		printk(KERN_ERR "btrfs memmove bogus src_offset %lu move "
+		       "len %lu dst len %lu\n", src_offset, len, dst->len);
 		BUG_ON(1);
 	}
 	if (dst_offset + len > dst->len) {
-		printk("memmove bogus dst_offset %lu move len %lu len %lu\n",
-		       dst_offset, len, dst->len);
+		printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move "
+		       "len %lu dst len %lu\n", dst_offset, len, dst->len);
 		BUG_ON(1);
 	}
 
-	while(len > 0) {
+	while (len > 0) {
 		dst_off_in_page = (start_offset + dst_offset) &
 			((unsigned long)PAGE_CACHE_SIZE - 1);
 		src_off_in_page = (start_offset + src_offset) &
@@ -3722,20 +3707,20 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
 	unsigned long src_i;
 
 	if (src_offset + len > dst->len) {
-		printk("memmove bogus src_offset %lu move len %lu len %lu\n",
-		       src_offset, len, dst->len);
+		printk(KERN_ERR "btrfs memmove bogus src_offset %lu move "
+		       "len %lu len %lu\n", src_offset, len, dst->len);
 		BUG_ON(1);
 	}
 	if (dst_offset + len > dst->len) {
-		printk("memmove bogus dst_offset %lu move len %lu len %lu\n",
-		       dst_offset, len, dst->len);
+		printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move "
+		       "len %lu len %lu\n", dst_offset, len, dst->len);
 		BUG_ON(1);
 	}
 	if (dst_offset < src_offset) {
 		memcpy_extent_buffer(dst, dst_offset, src_offset, len);
 		return;
 	}
-	while(len > 0) {
+	while (len > 0) {
 		dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT;
 		src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT;
 
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index fd3ebfb8c3c5..4a83e33ada32 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -89,11 +89,11 @@ EXPORT_SYMBOL(free_extent_map);
 static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
 				   struct rb_node *node)
 {
-	struct rb_node ** p = &root->rb_node;
-	struct rb_node * parent = NULL;
+	struct rb_node **p = &root->rb_node;
+	struct rb_node *parent = NULL;
 	struct extent_map *entry;
 
-	while(*p) {
+	while (*p) {
 		parent = *p;
 		entry = rb_entry(parent, struct extent_map, rb_node);
 
@@ -122,13 +122,13 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
 				     struct rb_node **prev_ret,
 				     struct rb_node **next_ret)
 {
-	struct rb_node * n = root->rb_node;
+	struct rb_node *n = root->rb_node;
 	struct rb_node *prev = NULL;
 	struct rb_node *orig_prev = NULL;
 	struct extent_map *entry;
 	struct extent_map *prev_entry = NULL;
 
-	while(n) {
+	while (n) {
 		entry = rb_entry(n, struct extent_map, rb_node);
 		prev = n;
 		prev_entry = entry;
@@ -145,7 +145,7 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
 
 	if (prev_ret) {
 		orig_prev = prev;
-		while(prev && offset >= extent_map_end(prev_entry)) {
+		while (prev && offset >= extent_map_end(prev_entry)) {
 			prev = rb_next(prev);
 			prev_entry = rb_entry(prev, struct extent_map, rb_node);
 		}
@@ -155,7 +155,7 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
 
 	if (next_ret) {
 		prev_entry = rb_entry(prev, struct extent_map, rb_node);
-		while(prev && offset < prev_entry->start) {
+		while (prev && offset < prev_entry->start) {
 			prev = rb_prev(prev);
 			prev_entry = rb_entry(prev, struct extent_map, rb_node);
 		}
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index cc6e0b6de949..b11abfad81a5 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -24,7 +24,7 @@
 #include "transaction.h"
 #include "print-tree.h"
 
-#define MAX_CSUM_ITEMS(r,size) ((((BTRFS_LEAF_DATA_SIZE(r) - \
+#define MAX_CSUM_ITEMS(r, size) ((((BTRFS_LEAF_DATA_SIZE(r) - \
 				   sizeof(struct btrfs_item) * 2) / \
 				  size) - 1))
 int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
@@ -166,7 +166,7 @@ int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
 	WARN_ON(bio->bi_vcnt <= 0);
 
 	disk_bytenr = (u64)bio->bi_sector << 9;
-	while(bio_index < bio->bi_vcnt) {
+	while (bio_index < bio->bi_vcnt) {
 		offset = page_offset(bvec->bv_page) + bvec->bv_offset;
 		ret = btrfs_find_ordered_sum(inode, offset, disk_bytenr, &sum);
 		if (ret == 0)
@@ -192,8 +192,9 @@ int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
 						offset + bvec->bv_len - 1,
 						EXTENT_NODATASUM, GFP_NOFS);
 				} else {
-					printk("no csum found for inode %lu "
-					       "start %llu\n", inode->i_ino,
+					printk(KERN_INFO "btrfs no csum found "
+					       "for inode %lu start %llu\n",
+					       inode->i_ino,
 					       (unsigned long long)offset);
 				}
 				item = NULL;
@@ -373,7 +374,7 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
 	BUG_ON(!ordered);
 	sums->bytenr = ordered->start;
 
-	while(bio_index < bio->bi_vcnt) {
+	while (bio_index < bio->bi_vcnt) {
 		if (!contig)
 			offset = page_offset(bvec->bv_page) + bvec->bv_offset;
 
@@ -507,7 +508,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
 
 	path = btrfs_alloc_path();
 
-	while(1) {
+	while (1) {
 		key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
 		key.offset = end_byte - 1;
 		key.type = BTRFS_EXTENT_CSUM_KEY;
@@ -715,9 +716,8 @@ again:
 			goto csum;
 
 		diff = diff - btrfs_item_size_nr(leaf, path->slots[0]);
-		if (diff != csum_size) {
+		if (diff != csum_size)
 			goto insert;
-		}
 
 		ret = btrfs_extend_item(trans, root, path, diff);
 		BUG_ON(ret);
@@ -732,7 +732,7 @@ insert:
 		u64 next_sector = sector_sum->bytenr;
 		struct btrfs_sector_sum *next = sector_sum + 1;
 
-		while(tmp < sums->len) {
+		while (tmp < sums->len) {
 			if (next_sector + root->sectorsize != next->bytenr)
 				break;
 			tmp += root->sectorsize;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 5908521922fb..0e3a13a45653 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -44,10 +44,10 @@
 /* simple helper to fault in pages and copy.  This should go away
  * and be replaced with calls into generic code.
  */
-static int noinline btrfs_copy_from_user(loff_t pos, int num_pages,
+static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
 					 int write_bytes,
 					 struct page **prepared_pages,
-					 const char __user * buf)
+					 const char __user *buf)
 {
 	long page_fault = 0;
 	int i;
@@ -78,7 +78,7 @@ static int noinline btrfs_copy_from_user(loff_t pos, int num_pages,
 /*
  * unlocks pages after btrfs_file_write is done with them
  */
-static void noinline btrfs_drop_pages(struct page **pages, size_t num_pages)
+static noinline void btrfs_drop_pages(struct page **pages, size_t num_pages)
 {
 	size_t i;
 	for (i = 0; i < num_pages; i++) {
@@ -103,7 +103,7 @@ static void noinline btrfs_drop_pages(struct page **pages, size_t num_pages)
  * this also makes the decision about creating an inline extent vs
  * doing real data extents, marking pages dirty and delalloc as required.
  */
-static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
+static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
 				   struct btrfs_root *root,
 				   struct file *file,
 				   struct page **pages,
@@ -137,9 +137,6 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
 	btrfs_set_trans_block_group(trans, inode);
 	hint_byte = 0;
 
-	if ((end_of_last_block & 4095) == 0) {
-		printk("strange end of last %Lu %zu %Lu\n", start_pos, write_bytes, end_of_last_block);
-	}
 	set_extent_uptodate(io_tree, start_pos, end_of_last_block, GFP_NOFS);
 
 	/* check for reserved extents on each page, we don't want
@@ -185,7 +182,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 		len = (u64)-1;
 		testend = 0;
 	}
-	while(1) {
+	while (1) {
 		if (!split)
 			split = alloc_extent_map(GFP_NOFS);
 		if (!split2)
@@ -295,7 +292,7 @@ int btrfs_check_file(struct btrfs_root *root, struct inode *inode)
 	path = btrfs_alloc_path();
 	ret = btrfs_lookup_file_extent(NULL, root, path, inode->i_ino,
 				       last_offset, 0);
-	while(1) {
+	while (1) {
 		nritems = btrfs_header_nritems(path->nodes[0]);
 		if (path->slots[0] >= nritems) {
 			ret = btrfs_next_leaf(root, path);
@@ -314,8 +311,10 @@ int btrfs_check_file(struct btrfs_root *root, struct inode *inode)
 		if (found_key.offset < last_offset) {
 			WARN_ON(1);
 			btrfs_print_leaf(root, leaf);
-			printk("inode %lu found offset %Lu expected %Lu\n",
-			       inode->i_ino, found_key.offset, last_offset);
+			printk(KERN_ERR "inode %lu found offset %llu "
+			       "expected %llu\n", inode->i_ino,
+			       (unsigned long long)found_key.offset,
+			       (unsigned long long)last_offset);
 			err = 1;
 			goto out;
 		}
@@ -331,7 +330,7 @@ int btrfs_check_file(struct btrfs_root *root, struct inode *inode)
 			extent_end = found_key.offset +
 			     btrfs_file_extent_inline_len(leaf, extent);
 			extent_end = (extent_end + root->sectorsize - 1) &
-				~((u64)root->sectorsize -1 );
+				~((u64)root->sectorsize - 1);
 		}
 		last_offset = extent_end;
 		path->slots[0]++;
@@ -339,8 +338,9 @@ int btrfs_check_file(struct btrfs_root *root, struct inode *inode)
 	if (0 && last_offset < inode->i_size) {
 		WARN_ON(1);
 		btrfs_print_leaf(root, leaf);
-		printk("inode %lu found offset %Lu size %Lu\n", inode->i_ino,
-		       last_offset, inode->i_size);
+		printk(KERN_ERR "inode %lu found offset %llu size %llu\n",
+		       inode->i_ino, (unsigned long long)last_offset,
+		       (unsigned long long)inode->i_size);
 		err = 1;
 
 	}
@@ -362,7 +362,7 @@ out:
  * inline_limit is used to tell this code which offsets in the file to keep
  * if they contain inline extents.
  */
-int noinline btrfs_drop_extents(struct btrfs_trans_handle *trans,
+noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root, struct inode *inode,
 		       u64 start, u64 end, u64 inline_limit, u64 *hint_byte)
 {
@@ -398,7 +398,7 @@ int noinline btrfs_drop_extents(struct btrfs_trans_handle *trans,
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
-	while(1) {
+	while (1) {
 		recow = 0;
 		btrfs_release_path(root, path);
 		ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
@@ -649,16 +649,15 @@ next_slot:
 			if (disk_bytenr != 0) {
 				ret = btrfs_update_extent_ref(trans, root,
 						disk_bytenr, orig_parent,
-					        leaf->start,
+						leaf->start,
 						root->root_key.objectid,
 						trans->transid, ins.objectid);
 
 				BUG_ON(ret);
 			}
 			btrfs_release_path(root, path);
-			if (disk_bytenr != 0) {
+			if (disk_bytenr != 0)
 				inode_add_bytes(inode, extent_end - end);
-			}
 		}
 
 		if (found_extent && !keep) {
@@ -944,7 +943,7 @@ done:
  * waits for data=ordered extents to finish before allowing the pages to be
  * modified.
  */
-static int noinline prepare_pages(struct btrfs_root *root, struct file *file,
+static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
 			 struct page **pages, size_t num_pages,
 			 loff_t pos, unsigned long first_index,
 			 unsigned long last_index, size_t write_bytes)
@@ -979,7 +978,8 @@ again:
 		struct btrfs_ordered_extent *ordered;
 		lock_extent(&BTRFS_I(inode)->io_tree,
 			    start_pos, last_pos - 1, GFP_NOFS);
-		ordered = btrfs_lookup_first_ordered_extent(inode, last_pos -1);
+		ordered = btrfs_lookup_first_ordered_extent(inode,
+							    last_pos - 1);
 		if (ordered &&
 		    ordered->file_offset + ordered->len > start_pos &&
 		    ordered->file_offset < last_pos) {
@@ -1085,7 +1085,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 		}
 	}
 
-	while(count > 0) {
+	while (count > 0) {
 		size_t offset = pos & (PAGE_CACHE_SIZE - 1);
 		size_t write_bytes = min(count, nrptrs *
 					(size_t)PAGE_CACHE_SIZE -
@@ -1178,7 +1178,7 @@ out_nolock:
 	return num_written ? num_written : err;
 }
 
-int btrfs_release_file(struct inode * inode, struct file * filp)
+int btrfs_release_file(struct inode *inode, struct file *filp)
 {
 	if (filp->private_data)
 		btrfs_ioctl_trans_end(filp);
@@ -1237,9 +1237,8 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
 	}
 
 	ret = btrfs_log_dentry_safe(trans, root, file->f_dentry);
-	if (ret < 0) {
+	if (ret < 0)
 		goto out;
-	}
 
 	/* we've logged all the items and now have a consistent
 	 * version of the file in the log.  It is possible that
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 2e69b9c30437..d1e5f0e84c58 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -213,10 +213,13 @@ static int __btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
 		info->offset = offset;
 		info->bytes += bytes;
 	} else if (right_info && right_info->offset != offset+bytes) {
-		printk(KERN_ERR "adding space in the middle of an existing "
-		       "free space area. existing: offset=%Lu, bytes=%Lu. "
-		       "new: offset=%Lu, bytes=%Lu\n", right_info->offset,
-		       right_info->bytes, offset, bytes);
+		printk(KERN_ERR "btrfs adding space in the middle of an "
+		       "existing free space area. existing: "
+		       "offset=%llu, bytes=%llu. new: offset=%llu, "
+		       "bytes=%llu\n", (unsigned long long)right_info->offset,
+		       (unsigned long long)right_info->bytes,
+		       (unsigned long long)offset,
+		       (unsigned long long)bytes);
 		BUG();
 	}
 
@@ -225,11 +228,14 @@ static int __btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
 
 		if (unlikely((left_info->offset + left_info->bytes) !=
 			     offset)) {
-			printk(KERN_ERR "free space to the left of new free "
-			       "space isn't quite right. existing: offset=%Lu,"
-			       " bytes=%Lu. new: offset=%Lu, bytes=%Lu\n",
-			       left_info->offset, left_info->bytes, offset,
-			       bytes);
+			printk(KERN_ERR "btrfs free space to the left "
+			       "of new free space isn't "
+			       "quite right. existing: offset=%llu, "
+			       "bytes=%llu. new: offset=%llu, bytes=%llu\n",
+			       (unsigned long long)left_info->offset,
+			       (unsigned long long)left_info->bytes,
+			       (unsigned long long)offset,
+			       (unsigned long long)bytes);
 			BUG();
 		}
 
@@ -265,8 +271,7 @@ out:
 			BUG();
 	}
 
-	if (alloc_info)
-		kfree(alloc_info);
+	kfree(alloc_info);
 
 	return ret;
 }
@@ -283,9 +288,11 @@ __btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
 
 	if (info && info->offset == offset) {
 		if (info->bytes < bytes) {
-			printk(KERN_ERR "Found free space at %Lu, size %Lu,"
-			       "trying to use %Lu\n",
-			       info->offset, info->bytes, bytes);
+			printk(KERN_ERR "Found free space at %llu, size %llu,"
+			       "trying to use %llu\n",
+			       (unsigned long long)info->offset,
+			       (unsigned long long)info->bytes,
+			       (unsigned long long)bytes);
 			WARN_ON(1);
 			ret = -EINVAL;
 			goto out;
@@ -401,8 +408,6 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
 		info = rb_entry(n, struct btrfs_free_space, offset_index);
 		if (info->bytes >= bytes)
 			count++;
-		//printk(KERN_INFO "offset=%Lu, bytes=%Lu\n", info->offset,
-		//       info->bytes);
 	}
 	printk(KERN_INFO "%d blocks of free space at or bigger than bytes is"
 	       "\n", count);
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 80038c5ef7cf..2aa79873eb46 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -129,7 +129,6 @@ int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
 		last_ino = key.objectid + 1;
 		path->slots[0]++;
 	}
-	// FIXME -ENOSPC
 	BUG_ON(1);
 found:
 	btrfs_release_path(root, path);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 068bad463387..1b35ea63b6ce 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -124,7 +124,7 @@ int btrfs_check_free_space(struct btrfs_root *root, u64 num_required,
  * the btree.  The caller should have done a btrfs_drop_extents so that
  * no overlapping inline items exist in the btree
  */
-static int noinline insert_inline_extent(struct btrfs_trans_handle *trans,
+static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root, struct inode *inode,
 				u64 start, size_t size, size_t compressed_size,
 				struct page **compressed_pages)
@@ -148,7 +148,8 @@ static int noinline insert_inline_extent(struct btrfs_trans_handle *trans,
 		cur_size = compressed_size;
 	}
 
-	path = btrfs_alloc_path(); if (!path)
+	path = btrfs_alloc_path();
+	if (!path)
 		return -ENOMEM;
 
 	btrfs_set_trans_block_group(trans, inode);
@@ -165,7 +166,6 @@ static int noinline insert_inline_extent(struct btrfs_trans_handle *trans,
 	BUG_ON(ret);
 	if (ret) {
 		err = ret;
-		printk("got bad ret %d\n", ret);
 		goto fail;
 	}
 	leaf = path->nodes[0];
@@ -181,7 +181,7 @@ static int noinline insert_inline_extent(struct btrfs_trans_handle *trans,
 	if (use_compress) {
 		struct page *cpage;
 		int i = 0;
-		while(compressed_size > 0) {
+		while (compressed_size > 0) {
 			cpage = compressed_pages[i];
 			cur_size = min_t(unsigned long, compressed_size,
 				       PAGE_CACHE_SIZE);
@@ -519,8 +519,7 @@ free_pages_out:
 		WARN_ON(pages[i]->mapping);
 		page_cache_release(pages[i]);
 	}
-	if (pages)
-		kfree(pages);
+	kfree(pages);
 
 	goto out;
 }
@@ -549,7 +548,7 @@ static noinline int submit_compressed_extents(struct inode *inode,
 
 	trans = btrfs_join_transaction(root, 1);
 
-	while(!list_empty(&async_cow->extents)) {
+	while (!list_empty(&async_cow->extents)) {
 		async_extent = list_entry(async_cow->extents.next,
 					  struct async_extent, list);
 		list_del(&async_extent->list);
@@ -562,8 +561,8 @@ static noinline int submit_compressed_extents(struct inode *inode,
 			unsigned long nr_written = 0;
 
 			lock_extent(io_tree, async_extent->start,
-				    async_extent->start + async_extent->ram_size - 1,
-				    GFP_NOFS);
+				    async_extent->start +
+				    async_extent->ram_size - 1, GFP_NOFS);
 
 			/* allocate blocks */
 			cow_file_range(inode, async_cow->locked_page,
@@ -581,7 +580,7 @@ static noinline int submit_compressed_extents(struct inode *inode,
 			if (!page_started)
 				extent_write_locked_range(io_tree,
 						  inode, async_extent->start,
-					          async_extent->start +
+						  async_extent->start +
 						  async_extent->ram_size - 1,
 						  btrfs_get_extent,
 						  WB_SYNC_ALL);
@@ -618,7 +617,7 @@ static noinline int submit_compressed_extents(struct inode *inode,
 		set_bit(EXTENT_FLAG_PINNED, &em->flags);
 		set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
 
-		while(1) {
+		while (1) {
 			spin_lock(&em_tree->lock);
 			ret = add_extent_mapping(em_tree, em);
 			spin_unlock(&em_tree->lock);
@@ -651,11 +650,11 @@ static noinline int submit_compressed_extents(struct inode *inode,
 					     NULL, 1, 1, 0, 1, 1, 0);
 
 		ret = btrfs_submit_compressed_write(inode,
-				         async_extent->start,
-					 async_extent->ram_size,
-					 ins.objectid,
-					 ins.offset, async_extent->pages,
-					 async_extent->nr_pages);
+				    async_extent->start,
+				    async_extent->ram_size,
+				    ins.objectid,
+				    ins.offset, async_extent->pages,
+				    async_extent->nr_pages);
 
 		BUG_ON(ret);
 		trans = btrfs_join_transaction(root, 1);
@@ -735,14 +734,13 @@ static noinline int cow_file_range(struct inode *inode,
 
 	btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
 
-	while(disk_num_bytes > 0) {
+	while (disk_num_bytes > 0) {
 		cur_alloc_size = min(disk_num_bytes, root->fs_info->max_extent);
 		ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
 					   root->sectorsize, 0, alloc_hint,
 					   (u64)-1, &ins, 1);
-		if (ret) {
-			BUG();
-		}
+		BUG_ON(ret);
+
 		em = alloc_extent_map(GFP_NOFS);
 		em->start = start;
 		em->orig_start = em->start;
@@ -755,7 +753,7 @@ static noinline int cow_file_range(struct inode *inode,
 		em->bdev = root->fs_info->fs_devices->latest_bdev;
 		set_bit(EXTENT_FLAG_PINNED, &em->flags);
 
-		while(1) {
+		while (1) {
 			spin_lock(&em_tree->lock);
 			ret = add_extent_mapping(em_tree, em);
 			spin_unlock(&em_tree->lock);
@@ -779,11 +777,9 @@ static noinline int cow_file_range(struct inode *inode,
 			BUG_ON(ret);
 		}
 
-		if (disk_num_bytes < cur_alloc_size) {
-			printk("num_bytes %Lu cur_alloc %Lu\n", disk_num_bytes,
-			       cur_alloc_size);
+		if (disk_num_bytes < cur_alloc_size)
 			break;
-		}
+
 		/* we're not doing compressed IO, don't unlock the first
 		 * page (which the caller expects to stay locked), don't
 		 * clear any dirty bits and don't set any writeback bits
@@ -842,9 +838,8 @@ static noinline void async_cow_submit(struct btrfs_work *work)
 	    waitqueue_active(&root->fs_info->async_submit_wait))
 		wake_up(&root->fs_info->async_submit_wait);
 
-	if (async_cow->inode) {
+	if (async_cow->inode)
 		submit_compressed_extents(async_cow->inode, async_cow);
-	}
 }
 
 static noinline void async_cow_free(struct btrfs_work *work)
@@ -871,7 +866,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
 
 	clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED |
 			 EXTENT_DELALLOC, 1, 0, GFP_NOFS);
-	while(start < end) {
+	while (start < end) {
 		async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
 		async_cow->inode = inode;
 		async_cow->root = root;
@@ -904,7 +899,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
 			    limit));
 		}
 
-		while(atomic_read(&root->fs_info->async_submit_draining) &&
+		while (atomic_read(&root->fs_info->async_submit_draining) &&
 		      atomic_read(&root->fs_info->async_delalloc_pages)) {
 			wait_event(root->fs_info->async_submit_wait,
 			  (atomic_read(&root->fs_info->async_delalloc_pages) ==
@@ -918,7 +913,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
 	return 0;
 }
 
-static int noinline csum_exist_in_range(struct btrfs_root *root,
+static noinline int csum_exist_in_range(struct btrfs_root *root,
 					u64 bytenr, u64 num_bytes)
 {
 	int ret;
@@ -1146,13 +1141,13 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
 
 	if (btrfs_test_flag(inode, NODATACOW))
 		ret = run_delalloc_nocow(inode, locked_page, start, end,
-                                        page_started, 1, nr_written);
+					 page_started, 1, nr_written);
 	else if (btrfs_test_flag(inode, PREALLOC))
 		ret = run_delalloc_nocow(inode, locked_page, start, end,
-                                        page_started, 0, nr_written);
+					 page_started, 0, nr_written);
 	else
 		ret = cow_file_range_async(inode, locked_page, start, end,
-				     page_started, nr_written);
+					   page_started, nr_written);
 
 	return ret;
 }
@@ -1200,8 +1195,11 @@ static int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
 
 		spin_lock(&root->fs_info->delalloc_lock);
 		if (end - start + 1 > root->fs_info->delalloc_bytes) {
-			printk("warning: delalloc account %Lu %Lu\n",
-			       end - start + 1, root->fs_info->delalloc_bytes);
+			printk(KERN_INFO "btrfs warning: delalloc account "
+			       "%llu %llu\n",
+			       (unsigned long long)end - start + 1,
+			       (unsigned long long)
+			       root->fs_info->delalloc_bytes);
 			root->fs_info->delalloc_bytes = 0;
 			BTRFS_I(inode)->delalloc_bytes = 0;
 		} else {
@@ -1241,9 +1239,8 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
 	ret = btrfs_map_block(map_tree, READ, logical,
 			      &map_length, NULL, 0);
 
-	if (map_length < length + size) {
+	if (map_length < length + size)
 		return 1;
-	}
 	return 0;
 }
 
@@ -1255,8 +1252,9 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
  * At IO completion time the cums attached on the ordered extent record
  * are inserted into the btree
  */
-static int __btrfs_submit_bio_start(struct inode *inode, int rw, struct bio *bio,
-			  int mirror_num, unsigned long bio_flags)
+static int __btrfs_submit_bio_start(struct inode *inode, int rw,
+				    struct bio *bio, int mirror_num,
+				    unsigned long bio_flags)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	int ret = 0;
@@ -1341,9 +1339,8 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
 
 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end)
 {
-	if ((end & (PAGE_CACHE_SIZE - 1)) == 0) {
+	if ((end & (PAGE_CACHE_SIZE - 1)) == 0)
 		WARN_ON(1);
-	}
 	return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
 				   GFP_NOFS);
 }
@@ -1755,14 +1752,14 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 	}
 	local_irq_save(flags);
 	kaddr = kmap_atomic(page, KM_IRQ0);
-	if (ret) {
+	if (ret)
 		goto zeroit;
-	}
+
 	csum = btrfs_csum_data(root, kaddr + offset, csum,  end - start + 1);
 	btrfs_csum_final(csum, (char *)&csum);
-	if (csum != private) {
+	if (csum != private)
 		goto zeroit;
-	}
+
 	kunmap_atomic(kaddr, KM_IRQ0);
 	local_irq_restore(flags);
 good:
@@ -1773,9 +1770,10 @@ good:
 	return 0;
 
 zeroit:
-	printk("btrfs csum failed ino %lu off %llu csum %u private %Lu\n",
-	       page->mapping->host->i_ino, (unsigned long long)start, csum,
-	       private);
+	printk(KERN_INFO "btrfs csum failed ino %lu off %llu csum %u "
+	       "private %llu\n", page->mapping->host->i_ino,
+	       (unsigned long long)start, csum,
+	       (unsigned long long)private);
 	memset(kaddr + offset, 1, end - start + 1);
 	flush_dcache_page(page);
 	kunmap_atomic(kaddr, KM_IRQ0);
@@ -2097,9 +2095,8 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
 /*
  * copy everything in the in-memory inode into the btree.
  */
-int noinline btrfs_update_inode(struct btrfs_trans_handle *trans,
-			      struct btrfs_root *root,
-			      struct inode *inode)
+noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root, struct inode *inode)
 {
 	struct btrfs_inode_item *inode_item;
 	struct btrfs_path *path;
@@ -2174,7 +2171,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
 				  inode->i_ino,
 				  dir->i_ino, &index);
 	if (ret) {
-		printk("failed to delete reference to %.*s, "
+		printk(KERN_INFO "btrfs failed to delete reference to %.*s, "
 		       "inode %lu parent %lu\n", name_len, name,
 		       inode->i_ino, dir->i_ino);
 		goto err;
@@ -2280,9 +2277,8 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 	/* now the directory is empty */
 	err = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
 				 dentry->d_name.name, dentry->d_name.len);
-	if (!err) {
+	if (!err)
 		btrfs_i_size_write(inode, 0);
-	}
 
 fail_trans:
 	nr = trans->blocks_used;
@@ -2516,9 +2512,9 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 
 search_again:
 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
-	if (ret < 0) {
+	if (ret < 0)
 		goto error;
-	}
+
 	if (ret > 0) {
 		/* there are no items in the tree for us to truncate, we're
 		 * done
@@ -2530,7 +2526,7 @@ search_again:
 		path->slots[0]--;
 	}
 
-	while(1) {
+	while (1) {
 		fi = NULL;
 		leaf = path->nodes[0];
 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
@@ -2562,19 +2558,18 @@ search_again:
 			item_end--;
 		}
 		if (item_end < new_size) {
-			if (found_type == BTRFS_DIR_ITEM_KEY) {
+			if (found_type == BTRFS_DIR_ITEM_KEY)
 				found_type = BTRFS_INODE_ITEM_KEY;
-			} else if (found_type == BTRFS_EXTENT_ITEM_KEY) {
+			else if (found_type == BTRFS_EXTENT_ITEM_KEY)
 				found_type = BTRFS_EXTENT_DATA_KEY;
-			} else if (found_type == BTRFS_EXTENT_DATA_KEY) {
+			else if (found_type == BTRFS_EXTENT_DATA_KEY)
 				found_type = BTRFS_XATTR_ITEM_KEY;
-			} else if (found_type == BTRFS_XATTR_ITEM_KEY) {
+			else if (found_type == BTRFS_XATTR_ITEM_KEY)
 				found_type = BTRFS_INODE_REF_KEY;
-			} else if (found_type) {
+			else if (found_type)
 				found_type--;
-			} else {
+			else
 				break;
-			}
 			btrfs_set_key_type(&key, found_type);
 			goto next;
 		}
@@ -2656,7 +2651,7 @@ delete:
 				pending_del_nr++;
 				pending_del_slot = path->slots[0];
 			} else {
-				printk("bad pending slot %d pending_del_nr %d pending_del_slot %d\n", path->slots[0], pending_del_nr, pending_del_slot);
+				BUG();
 			}
 		} else {
 			break;
@@ -2938,9 +2933,10 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
 				    namelen, 0);
 	if (IS_ERR(di))
 		ret = PTR_ERR(di);
-	if (!di || IS_ERR(di)) {
+
+	if (!di || IS_ERR(di))
 		goto out_err;
-	}
+
 	btrfs_dir_item_key_to_cpu(path->nodes[0], di, location);
 out:
 	btrfs_free_path(path);
@@ -3020,8 +3016,8 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p)
 static int btrfs_find_actor(struct inode *inode, void *opaque)
 {
 	struct btrfs_iget_args *args = opaque;
-	return (args->ino == inode->i_ino &&
-		args->root == BTRFS_I(inode)->root);
+	return args->ino == inode->i_ino &&
+		args->root == BTRFS_I(inode)->root;
 }
 
 struct inode *btrfs_ilookup(struct super_block *s, u64 objectid,
@@ -3085,7 +3081,7 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
 
 struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
 {
-	struct inode * inode;
+	struct inode *inode;
 	struct btrfs_inode *bi = BTRFS_I(dir);
 	struct btrfs_root *root = bi->root;
 	struct btrfs_root *sub_root = root;
@@ -3385,9 +3381,8 @@ int btrfs_set_inode_index(struct inode *dir, u64 *index)
 
 	if (BTRFS_I(dir)->index_cnt == (u64)-1) {
 		ret = btrfs_set_inode_index_count(dir);
-		if (ret) {
+		if (ret)
 			return ret;
-		}
 	}
 
 	*index = BTRFS_I(dir)->index_cnt;
@@ -3879,12 +3874,13 @@ static noinline int uncompress_inline(struct btrfs_path *path,
 
 /*
  * a bit scary, this does extent mapping from logical file offset to the disk.
- * the ugly parts come from merging extents from the disk with the
- * in-ram representation.  This gets more complex because of the data=ordered code,
+ * the ugly parts come from merging extents from the disk with the in-ram
+ * representation.  This gets more complex because of the data=ordered code,
  * where the in-ram extents might be locked pending data=ordered completion.
  *
  * This also copies inline extents directly into the page.
  */
+
 struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
 				    size_t pg_offset, u64 start, u64 len,
 				    int create)
@@ -4081,7 +4077,7 @@ again:
 				    extent_map_end(em) - 1, GFP_NOFS);
 		goto insert;
 	} else {
-		printk("unkknown found_type %d\n", found_type);
+		printk(KERN_ERR "btrfs unknown found_type %d\n", found_type);
 		WARN_ON(1);
 	}
 not_found:
@@ -4093,7 +4089,11 @@ not_found_em:
 insert:
 	btrfs_release_path(root, path);
 	if (em->start > start || extent_map_end(em) <= start) {
-		printk("bad extent! em: [%Lu %Lu] passed [%Lu %Lu]\n", em->start, em->len, start, len);
+		printk(KERN_ERR "Btrfs: bad extent! em: [%llu %llu] passed "
+		       "[%llu %llu]\n", (unsigned long long)em->start,
+		       (unsigned long long)em->len,
+		       (unsigned long long)start,
+		       (unsigned long long)len);
 		err = -EIO;
 		goto out;
 	}
@@ -4130,8 +4130,6 @@ insert:
 				}
 			} else {
 				err = -EIO;
-				printk("failing to insert %Lu %Lu\n",
-				       start, len);
 				free_extent_map(em);
 				em = NULL;
 			}
@@ -4147,9 +4145,8 @@ out:
 		btrfs_free_path(path);
 	if (trans) {
 		ret = btrfs_end_transaction(trans, root);
-		if (!err) {
+		if (!err)
 			err = ret;
-		}
 	}
 	if (err) {
 		free_extent_map(em);
@@ -4482,13 +4479,15 @@ void btrfs_destroy_inode(struct inode *inode)
 	}
 	spin_unlock(&BTRFS_I(inode)->root->list_lock);
 
-	while(1) {
+	while (1) {
 		ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
 		if (!ordered)
 			break;
 		else {
-			printk("found ordered extent %Lu %Lu\n",
-			       ordered->file_offset, ordered->len);
+			printk(KERN_ERR "btrfs found ordered "
+			       "extent %llu %llu on inode cleanup\n",
+			       (unsigned long long)ordered->file_offset,
+			       (unsigned long long)ordered->len);
 			btrfs_remove_ordered_extent(inode, ordered);
 			btrfs_put_ordered_extent(ordered);
 			btrfs_put_ordered_extent(ordered);
@@ -4572,8 +4571,8 @@ static int btrfs_getattr(struct vfsmount *mnt,
 	return 0;
 }
 
-static int btrfs_rename(struct inode * old_dir, struct dentry *old_dentry,
-			   struct inode * new_dir,struct dentry *new_dentry)
+static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
+			   struct inode *new_dir, struct dentry *new_dentry)
 {
 	struct btrfs_trans_handle *trans;
 	struct btrfs_root *root = BTRFS_I(old_dir)->root;
@@ -4663,7 +4662,7 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root)
 		return -EROFS;
 
 	spin_lock(&root->fs_info->delalloc_lock);
-	while(!list_empty(head)) {
+	while (!list_empty(head)) {
 		binode = list_entry(head->next, struct btrfs_inode,
 				    delalloc_inodes);
 		inode = igrab(&binode->vfs_inode);
@@ -4684,7 +4683,7 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root)
 	 * ordered extents get created before we return
 	 */
 	atomic_inc(&root->fs_info->async_submit_draining);
-	while(atomic_read(&root->fs_info->nr_async_submits) ||
+	while (atomic_read(&root->fs_info->nr_async_submits) ||
 	      atomic_read(&root->fs_info->async_delalloc_pages)) {
 		wait_event(root->fs_info->async_submit_wait,
 		   (atomic_read(&root->fs_info->nr_async_submits) == 0 &&
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index ba484aac1b9c..c2aa33e3feb5 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -311,7 +311,7 @@ static noinline int btrfs_mksubvol(struct path *parent, char *name,
 		 * to see if is references the subvolume where we are
 		 * placing this new snapshot.
 		 */
-		while(1) {
+		while (1) {
 			if (!test ||
 			    dir == snap_src->fs_info->sb->s_root ||
 			    test == snap_src->fs_info->sb->s_root ||
@@ -319,7 +319,8 @@ static noinline int btrfs_mksubvol(struct path *parent, char *name,
 				break;
 			}
 			if (S_ISLNK(test->d_inode->i_mode)) {
-				printk("Symlink in snapshot path, failed\n");
+				printk(KERN_INFO "Btrfs symlink in snapshot "
+				       "path, failed\n");
 				error = -EMLINK;
 				btrfs_free_path(path);
 				goto out_drop_write;
@@ -329,7 +330,8 @@ static noinline int btrfs_mksubvol(struct path *parent, char *name,
 			ret = btrfs_find_root_ref(snap_src->fs_info->tree_root,
 				  path, test_oid, parent_oid);
 			if (ret == 0) {
-				printk("Snapshot creation failed, looping\n");
+				printk(KERN_INFO "Btrfs snapshot creation "
+				       "failed, looping\n");
 				error = -EMLINK;
 				btrfs_free_path(path);
 				goto out_drop_write;
@@ -617,7 +619,8 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
 
 		src_inode = src_file->f_path.dentry->d_inode;
 		if (src_inode->i_sb != file->f_path.dentry->d_inode->i_sb) {
-			printk("btrfs: Snapshot src from another FS\n");
+			printk(KERN_INFO "btrfs: Snapshot src from "
+			       "another FS\n");
 			ret = -EINVAL;
 			fput(src_file);
 			goto out;
@@ -810,9 +813,6 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 	    ((off + len) & (bs-1)))
 		goto out_unlock;
 
-	printk("final src extent is %llu~%llu\n", off, len);
-	printk("final dst extent is %llu~%llu\n", destoff, len);
-
 	/* do any pending delalloc/csum calc on src, one way or
 	   another, and lock file content */
 	while (1) {
@@ -883,10 +883,13 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 			comp = btrfs_file_extent_compression(leaf, extent);
 			type = btrfs_file_extent_type(leaf, extent);
 			if (type == BTRFS_FILE_EXTENT_REG) {
-				disko = btrfs_file_extent_disk_bytenr(leaf, extent);
-				diskl = btrfs_file_extent_disk_num_bytes(leaf, extent);
+				disko = btrfs_file_extent_disk_bytenr(leaf,
+								      extent);
+				diskl = btrfs_file_extent_disk_num_bytes(leaf,
+								 extent);
 				datao = btrfs_file_extent_offset(leaf, extent);
-				datal = btrfs_file_extent_num_bytes(leaf, extent);
+				datal = btrfs_file_extent_num_bytes(leaf,
+								    extent);
 			} else if (type == BTRFS_FILE_EXTENT_INLINE) {
 				/* take upper bound, may be compressed */
 				datal = btrfs_file_extent_ram_bytes(leaf,
@@ -916,8 +919,6 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 
 				extent = btrfs_item_ptr(leaf, slot,
 						struct btrfs_file_extent_item);
-				printk("  orig disk %llu~%llu data %llu~%llu\n",
-				       disko, diskl, datao, datal);
 
 				if (off > key.offset) {
 					datao += off - key.offset;
@@ -929,8 +930,6 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 				/* disko == 0 means it's a hole */
 				if (!disko)
 					datao = 0;
-				printk(" final disk %llu~%llu data %llu~%llu\n",
-				       disko, diskl, datao, datal);
 
 				btrfs_set_file_extent_offset(leaf, extent,
 							     datao);
@@ -952,12 +951,11 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 					skip = off - key.offset;
 					new_key.offset += skip;
 				}
+
 				if (key.offset + datal > off+len)
 					trim = key.offset + datal - (off+len);
-				printk("len %lld skip %lld trim %lld\n",
-				       datal, skip, trim);
+
 				if (comp && (skip || trim)) {
-					printk("btrfs clone_range can't split compressed inline extents yet\n");
 					ret = -EINVAL;
 					goto out;
 				}
@@ -969,7 +967,8 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 					goto out;
 
 				if (skip) {
-					u32 start = btrfs_file_extent_calc_inline_size(0);
+					u32 start =
+					  btrfs_file_extent_calc_inline_size(0);
 					memmove(buf+start, buf+start+skip,
 						datal);
 				}
@@ -985,7 +984,7 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 			btrfs_mark_buffer_dirty(leaf);
 		}
 
-	next:
+next:
 		btrfs_release_path(root, path);
 		key.offset++;
 	}
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index e30aa6e2958f..39bae7761db6 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -31,9 +31,10 @@
  * difference in almost every workload, but spinning for the right amount of
  * time needs some help.
  *
- * In general, we want to spin as long as the lock holder is doing btree searches,
- * and we should give up if they are in more expensive code.
+ * In general, we want to spin as long as the lock holder is doing btree
+ * searches, and we should give up if they are in more expensive code.
  */
+
 int btrfs_tree_lock(struct extent_buffer *eb)
 {
 	int i;
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index d9e232227da4..a20940170274 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -39,11 +39,11 @@ static u64 entry_end(struct btrfs_ordered_extent *entry)
 static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset,
 				   struct rb_node *node)
 {
-	struct rb_node ** p = &root->rb_node;
-	struct rb_node * parent = NULL;
+	struct rb_node **p = &root->rb_node;
+	struct rb_node *parent = NULL;
 	struct btrfs_ordered_extent *entry;
 
-	while(*p) {
+	while (*p) {
 		parent = *p;
 		entry = rb_entry(parent, struct btrfs_ordered_extent, rb_node);
 
@@ -67,13 +67,13 @@ static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset,
 static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset,
 				     struct rb_node **prev_ret)
 {
-	struct rb_node * n = root->rb_node;
+	struct rb_node *n = root->rb_node;
 	struct rb_node *prev = NULL;
 	struct rb_node *test;
 	struct btrfs_ordered_extent *entry;
 	struct btrfs_ordered_extent *prev_entry = NULL;
 
-	while(n) {
+	while (n) {
 		entry = rb_entry(n, struct btrfs_ordered_extent, rb_node);
 		prev = n;
 		prev_entry = entry;
@@ -88,7 +88,7 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset,
 	if (!prev_ret)
 		return NULL;
 
-	while(prev && file_offset >= entry_end(prev_entry)) {
+	while (prev && file_offset >= entry_end(prev_entry)) {
 		test = rb_next(prev);
 		if (!test)
 			break;
@@ -102,7 +102,7 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset,
 	if (prev)
 		prev_entry = rb_entry(prev, struct btrfs_ordered_extent,
 				      rb_node);
-	while(prev && file_offset < entry_end(prev_entry)) {
+	while (prev && file_offset < entry_end(prev_entry)) {
 		test = rb_prev(prev);
 		if (!test)
 			break;
@@ -193,10 +193,8 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
 
 	node = tree_insert(&tree->tree, file_offset,
 			   &entry->rb_node);
-	if (node) {
-		printk("warning dup entry from add_ordered_extent\n");
-		BUG();
-	}
+	BUG_ON(node);
+
 	set_extent_ordered(&BTRFS_I(inode)->io_tree, file_offset,
 			   entry_end(entry) - 1, GFP_NOFS);
 
@@ -282,7 +280,7 @@ int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
 	struct btrfs_ordered_sum *sum;
 
 	if (atomic_dec_and_test(&entry->refs)) {
-		while(!list_empty(&entry->list)) {
+		while (!list_empty(&entry->list)) {
 			cur = entry->list.next;
 			sum = list_entry(cur, struct btrfs_ordered_sum, list);
 			list_del(&sum->list);
@@ -432,11 +430,10 @@ again:
 					   orig_end >> PAGE_CACHE_SHIFT);
 
 	end = orig_end;
-	while(1) {
+	while (1) {
 		ordered = btrfs_lookup_first_ordered_extent(inode, end);
-		if (!ordered) {
+		if (!ordered)
 			break;
-		}
 		if (ordered->file_offset > orig_end) {
 			btrfs_put_ordered_extent(ordered);
 			break;
@@ -492,7 +489,7 @@ out:
  * if none is found
  */
 struct btrfs_ordered_extent *
-btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset)
+btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset)
 {
 	struct btrfs_ordered_inode_tree *tree;
 	struct rb_node *node;
@@ -553,7 +550,7 @@ int btrfs_ordered_update_i_size(struct inode *inode,
 	 * yet
 	 */
 	node = &ordered->rb_node;
-	while(1) {
+	while (1) {
 		node = rb_prev(node);
 		if (!node)
 			break;
@@ -581,9 +578,8 @@ int btrfs_ordered_update_i_size(struct inode *inode,
 		 * between our ordered extent and the next one.
 		 */
 		test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
-		if (test->file_offset > entry_end(ordered)) {
+		if (test->file_offset > entry_end(ordered))
 			i_size_test = test->file_offset;
-		}
 	} else {
 		i_size_test = i_size_read(inode);
 	}
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 64725c13aa11..5f8f218c1005 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -24,13 +24,14 @@ static void print_chunk(struct extent_buffer *eb, struct btrfs_chunk *chunk)
 {
 	int num_stripes = btrfs_chunk_num_stripes(eb, chunk);
 	int i;
-	printk("\t\tchunk length %llu owner %llu type %llu num_stripes %d\n",
+	printk(KERN_INFO "\t\tchunk length %llu owner %llu type %llu "
+	       "num_stripes %d\n",
 	       (unsigned long long)btrfs_chunk_length(eb, chunk),
 	       (unsigned long long)btrfs_chunk_owner(eb, chunk),
 	       (unsigned long long)btrfs_chunk_type(eb, chunk),
 	       num_stripes);
 	for (i = 0 ; i < num_stripes ; i++) {
-		printk("\t\t\tstripe %d devid %llu offset %llu\n", i,
+		printk(KERN_INFO "\t\t\tstripe %d devid %llu offset %llu\n", i,
 		      (unsigned long long)btrfs_stripe_devid_nr(eb, chunk, i),
 		      (unsigned long long)btrfs_stripe_offset_nr(eb, chunk, i));
 	}
@@ -38,8 +39,8 @@ static void print_chunk(struct extent_buffer *eb, struct btrfs_chunk *chunk)
 static void print_dev_item(struct extent_buffer *eb,
 			   struct btrfs_dev_item *dev_item)
 {
-	printk("\t\tdev item devid %llu "
-	       "total_bytes %llu bytes used %Lu\n",
+	printk(KERN_INFO "\t\tdev item devid %llu "
+	       "total_bytes %llu bytes used %llu\n",
 	       (unsigned long long)btrfs_device_id(eb, dev_item),
 	       (unsigned long long)btrfs_device_total_bytes(eb, dev_item),
 	       (unsigned long long)btrfs_device_bytes_used(eb, dev_item));
@@ -61,14 +62,15 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
 	struct btrfs_dev_extent *dev_extent;
 	u32 type;
 
-	printk("leaf %llu total ptrs %d free space %d\n",
+	printk(KERN_INFO "leaf %llu total ptrs %d free space %d\n",
 		(unsigned long long)btrfs_header_bytenr(l), nr,
 		btrfs_leaf_free_space(root, l));
 	for (i = 0 ; i < nr ; i++) {
 		item = btrfs_item_nr(l, i);
 		btrfs_item_key_to_cpu(l, &key, i);
 		type = btrfs_key_type(&key);
-		printk("\titem %d key (%llu %x %llu) itemoff %d itemsize %d\n",
+		printk(KERN_INFO "\titem %d key (%llu %x %llu) itemoff %d "
+		       "itemsize %d\n",
 			i,
 			(unsigned long long)key.objectid, type,
 			(unsigned long long)key.offset,
@@ -76,33 +78,36 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
 		switch (type) {
 		case BTRFS_INODE_ITEM_KEY:
 			ii = btrfs_item_ptr(l, i, struct btrfs_inode_item);
-			printk("\t\tinode generation %llu size %llu mode %o\n",
-		              (unsigned long long)btrfs_inode_generation(l, ii),
+			printk(KERN_INFO "\t\tinode generation %llu size %llu "
+			       "mode %o\n",
+			       (unsigned long long)
+			       btrfs_inode_generation(l, ii),
 			      (unsigned long long)btrfs_inode_size(l, ii),
 			       btrfs_inode_mode(l, ii));
 			break;
 		case BTRFS_DIR_ITEM_KEY:
 			di = btrfs_item_ptr(l, i, struct btrfs_dir_item);
 			btrfs_dir_item_key_to_cpu(l, di, &found_key);
-			printk("\t\tdir oid %llu type %u\n",
+			printk(KERN_INFO "\t\tdir oid %llu type %u\n",
 				(unsigned long long)found_key.objectid,
 				btrfs_dir_type(l, di));
 			break;
 		case BTRFS_ROOT_ITEM_KEY:
 			ri = btrfs_item_ptr(l, i, struct btrfs_root_item);
-			printk("\t\troot data bytenr %llu refs %u\n",
-				(unsigned long long)btrfs_disk_root_bytenr(l, ri),
+			printk(KERN_INFO "\t\troot data bytenr %llu refs %u\n",
+				(unsigned long long)
+				btrfs_disk_root_bytenr(l, ri),
 				btrfs_disk_root_refs(l, ri));
 			break;
 		case BTRFS_EXTENT_ITEM_KEY:
 			ei = btrfs_item_ptr(l, i, struct btrfs_extent_item);
-			printk("\t\textent data refs %u\n",
+			printk(KERN_INFO "\t\textent data refs %u\n",
 				btrfs_extent_refs(l, ei));
 			break;
 		case BTRFS_EXTENT_REF_KEY:
 			ref = btrfs_item_ptr(l, i, struct btrfs_extent_ref);
-			printk("\t\textent back ref root %llu gen %llu "
-			       "owner %llu num_refs %lu\n",
+			printk(KERN_INFO "\t\textent back ref root %llu "
+			       "gen %llu owner %llu num_refs %lu\n",
 			       (unsigned long long)btrfs_ref_root(l, ref),
 			       (unsigned long long)btrfs_ref_generation(l, ref),
 			       (unsigned long long)btrfs_ref_objectid(l, ref),
@@ -114,26 +119,36 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
 					    struct btrfs_file_extent_item);
 			if (btrfs_file_extent_type(l, fi) ==
 			    BTRFS_FILE_EXTENT_INLINE) {
-				printk("\t\tinline extent data size %u\n",
-			           btrfs_file_extent_inline_len(l, fi));
+				printk(KERN_INFO "\t\tinline extent data "
+				       "size %u\n",
+				       btrfs_file_extent_inline_len(l, fi));
 				break;
 			}
-			printk("\t\textent data disk bytenr %llu nr %llu\n",
-			       (unsigned long long)btrfs_file_extent_disk_bytenr(l, fi),
-			       (unsigned long long)btrfs_file_extent_disk_num_bytes(l, fi));
-			printk("\t\textent data offset %llu nr %llu ram %llu\n",
-			  (unsigned long long)btrfs_file_extent_offset(l, fi),
-			  (unsigned long long)btrfs_file_extent_num_bytes(l, fi),
-			  (unsigned long long)btrfs_file_extent_ram_bytes(l, fi));
+			printk(KERN_INFO "\t\textent data disk bytenr %llu "
+			       "nr %llu\n",
+			       (unsigned long long)
+			       btrfs_file_extent_disk_bytenr(l, fi),
+			       (unsigned long long)
+			       btrfs_file_extent_disk_num_bytes(l, fi));
+			printk(KERN_INFO "\t\textent data offset %llu "
+			       "nr %llu ram %llu\n",
+			       (unsigned long long)
+			       btrfs_file_extent_offset(l, fi),
+			       (unsigned long long)
+			       btrfs_file_extent_num_bytes(l, fi),
+			       (unsigned long long)
+			       btrfs_file_extent_ram_bytes(l, fi));
 			break;
 		case BTRFS_BLOCK_GROUP_ITEM_KEY:
 			bi = btrfs_item_ptr(l, i,
 					    struct btrfs_block_group_item);
-			printk("\t\tblock group used %llu\n",
-			       (unsigned long long)btrfs_disk_block_group_used(l, bi));
+			printk(KERN_INFO "\t\tblock group used %llu\n",
+			       (unsigned long long)
+			       btrfs_disk_block_group_used(l, bi));
 			break;
 		case BTRFS_CHUNK_ITEM_KEY:
-			print_chunk(l, btrfs_item_ptr(l, i, struct btrfs_chunk));
+			print_chunk(l, btrfs_item_ptr(l, i,
+						      struct btrfs_chunk));
 			break;
 		case BTRFS_DEV_ITEM_KEY:
 			print_dev_item(l, btrfs_item_ptr(l, i,
@@ -142,7 +157,7 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
 		case BTRFS_DEV_EXTENT_KEY:
 			dev_extent = btrfs_item_ptr(l, i,
 						    struct btrfs_dev_extent);
-			printk("\t\tdev extent chunk_tree %llu\n"
+			printk(KERN_INFO "\t\tdev extent chunk_tree %llu\n"
 			       "\t\tchunk objectid %llu chunk offset %llu "
 			       "length %llu\n",
 			       (unsigned long long)
@@ -171,13 +186,13 @@ void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c)
 		btrfs_print_leaf(root, c);
 		return;
 	}
-	printk("node %llu level %d total ptrs %d free spc %u\n",
+	printk(KERN_INFO "node %llu level %d total ptrs %d free spc %u\n",
 	       (unsigned long long)btrfs_header_bytenr(c),
 	       btrfs_header_level(c), nr,
 	       (u32)BTRFS_NODEPTRS_PER_BLOCK(root) - nr);
 	for (i = 0; i < nr; i++) {
 		btrfs_node_key_to_cpu(c, &key, i);
-		printk("\tkey %d (%llu %u %llu) block %llu\n",
+		printk(KERN_INFO "\tkey %d (%llu %u %llu) block %llu\n",
 		       i,
 		       (unsigned long long)key.objectid,
 		       key.type,
diff --git a/fs/btrfs/ref-cache.c b/fs/btrfs/ref-cache.c
index a50ebb67055d..6f0acc4c9eab 100644
--- a/fs/btrfs/ref-cache.c
+++ b/fs/btrfs/ref-cache.c
@@ -74,11 +74,11 @@ void btrfs_free_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref)
 static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr,
 				   struct rb_node *node)
 {
-	struct rb_node ** p = &root->rb_node;
-	struct rb_node * parent = NULL;
+	struct rb_node **p = &root->rb_node;
+	struct rb_node *parent = NULL;
 	struct btrfs_leaf_ref *entry;
 
-	while(*p) {
+	while (*p) {
 		parent = *p;
 		entry = rb_entry(parent, struct btrfs_leaf_ref, rb_node);
 
@@ -98,10 +98,10 @@ static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr,
 
 static struct rb_node *tree_search(struct rb_root *root, u64 bytenr)
 {
-	struct rb_node * n = root->rb_node;
+	struct rb_node *n = root->rb_node;
 	struct btrfs_leaf_ref *entry;
 
-	while(n) {
+	while (n) {
 		entry = rb_entry(n, struct btrfs_leaf_ref, rb_node);
 		WARN_ON(!entry->in_tree);
 
@@ -127,7 +127,7 @@ int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 max_root_gen,
 		return 0;
 
 	spin_lock(&tree->lock);
-	while(!list_empty(&tree->list)) {
+	while (!list_empty(&tree->list)) {
 		ref = list_entry(tree->list.next, struct btrfs_leaf_ref, list);
 		BUG_ON(ref->tree != tree);
 		if (ref->root_gen > max_root_gen)
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index f99335a999d6..b48650de4472 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -132,8 +132,9 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
 
 	if (ret != 0) {
 		btrfs_print_leaf(root, path->nodes[0]);
-		printk("unable to update root key %Lu %u %Lu\n",
-		       key->objectid, key->type, key->offset);
+		printk(KERN_CRIT "unable to update root key %llu %u %llu\n",
+		       (unsigned long long)key->objectid, key->type,
+		       (unsigned long long)key->offset);
 		BUG_ON(1);
 	}
 
@@ -159,9 +160,9 @@ int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root
 
 /*
  * at mount time we want to find all the old transaction snapshots that were in
- * the process of being deleted if we crashed.  This is any root item with an offset
- * lower than the latest root.  They need to be queued for deletion to finish
- * what was happening when we crashed.
+ * the process of being deleted if we crashed.  This is any root item with an
+ * offset lower than the latest root.  They need to be queued for deletion to
+ * finish what was happening when we crashed.
  */
 int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid,
 			  struct btrfs_root *latest)
@@ -188,7 +189,7 @@ again:
 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 	if (ret < 0)
 		goto err;
-	while(1) {
+	while (1) {
 		leaf = path->nodes[0];
 		nritems = btrfs_header_nritems(leaf);
 		slot = path->slots[0];
@@ -258,11 +259,7 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 	ret = btrfs_search_slot(trans, root, key, path, -1, 1);
 	if (ret < 0)
 		goto out;
-	if (ret) {
-btrfs_print_leaf(root, path->nodes[0]);
-printk("failed to del %Lu %u %Lu\n", key->objectid, key->type, key->offset);
 
-	}
 	BUG_ON(ret != 0);
 	leaf = path->nodes[0];
 	ri = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_item);
diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c
index 8d7f568009c9..c0f7ecaf1e79 100644
--- a/fs/btrfs/struct-funcs.c
+++ b/fs/btrfs/struct-funcs.c
@@ -66,7 +66,7 @@ u##bits btrfs_##name(struct extent_buffer *eb,				\
 		unsigned long map_len;					\
 		u##bits res;						\
 		err = map_extent_buffer(eb, offset,			\
-			        sizeof(((type *)0)->member),		\
+				sizeof(((type *)0)->member),		\
 				&map_token, &kaddr,			\
 				&map_start, &map_len, KM_USER1);	\
 		if (err) {						\
@@ -103,7 +103,7 @@ void btrfs_set_##name(struct extent_buffer *eb,				\
 		unsigned long map_start;				\
 		unsigned long map_len;					\
 		err = map_extent_buffer(eb, offset,			\
-			        sizeof(((type *)0)->member),		\
+				sizeof(((type *)0)->member),		\
 				&map_token, &kaddr,			\
 				&map_start, &map_len, KM_USER1);	\
 		if (err) {						\
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index ccdcb7bb7ad8..b4c101d9322c 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -55,18 +55,12 @@
 
 static struct super_operations btrfs_super_ops;
 
-static void btrfs_put_super (struct super_block * sb)
+static void btrfs_put_super(struct super_block *sb)
 {
 	struct btrfs_root *root = btrfs_sb(sb);
 	int ret;
 
 	ret = close_ctree(root);
-	if (ret) {
-		printk("close ctree returns %d\n", ret);
-	}
-#if 0
-	btrfs_sysfs_del_super(root->fs_info);
-#endif
 	sb->s_fs_info = NULL;
 }
 
@@ -299,12 +293,12 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
 	return error;
 }
 
-static int btrfs_fill_super(struct super_block * sb,
+static int btrfs_fill_super(struct super_block *sb,
 			    struct btrfs_fs_devices *fs_devices,
-			    void * data, int silent)
+			    void *data, int silent)
 {
-	struct inode * inode;
-	struct dentry * root_dentry;
+	struct inode *inode;
+	struct dentry *root_dentry;
 	struct btrfs_super_block *disk_super;
 	struct btrfs_root *tree_root;
 	struct btrfs_inode *bi;
@@ -479,8 +473,10 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
 		root = dget(s->s_root);
 	else {
 		mutex_lock(&s->s_root->d_inode->i_mutex);
-		root = lookup_one_len(subvol_name, s->s_root, strlen(subvol_name));
+		root = lookup_one_len(subvol_name, s->s_root,
+				      strlen(subvol_name));
 		mutex_unlock(&s->s_root->d_inode->i_mutex);
+
 		if (IS_ERR(root)) {
 			up_write(&s->s_umount);
 			deactivate_super(s);
@@ -557,8 +553,9 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	buf->f_bavail = buf->f_bfree;
 	buf->f_bsize = dentry->d_sb->s_blocksize;
 	buf->f_type = BTRFS_SUPER_MAGIC;
+
 	/* We treat it as constant endianness (it doesn't matter _which_)
-	   because we want the fsid to come out the same whether mounted 
+	   because we want the fsid to come out the same whether mounted
 	   on a big-endian or little-endian host */
 	buf->f_fsid.val[0] = be32_to_cpu(fsid[0]) ^ be32_to_cpu(fsid[2]);
 	buf->f_fsid.val[1] = be32_to_cpu(fsid[1]) ^ be32_to_cpu(fsid[3]);
@@ -658,7 +655,7 @@ static int btrfs_interface_init(void)
 static void btrfs_interface_exit(void)
 {
 	if (misc_deregister(&btrfs_misc) < 0)
-		printk("misc_deregister failed for control device");
+		printk(KERN_INFO "misc_deregister failed for control device");
 }
 
 static int __init init_btrfs_fs(void)
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 04087c020845..a240b6fa81df 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -67,7 +67,8 @@ struct btrfs_root_attr {
 };
 
 #define ROOT_ATTR(name, mode, show, store) \
-static struct btrfs_root_attr btrfs_root_attr_##name = __ATTR(name, mode, show, store)
+static struct btrfs_root_attr btrfs_root_attr_##name = __ATTR(name, mode, \
+							      show, store)
 
 ROOT_ATTR(blocks_used,	0444,	root_blocks_used_show,	NULL);
 ROOT_ATTR(block_limit,	0644,	root_block_limit_show,	NULL);
@@ -86,7 +87,8 @@ struct btrfs_super_attr {
 };
 
 #define SUPER_ATTR(name, mode, show, store) \
-static struct btrfs_super_attr btrfs_super_attr_##name = __ATTR(name, mode, show, store)
+static struct btrfs_super_attr btrfs_super_attr_##name = __ATTR(name, mode, \
+								show, store)
 
 SUPER_ATTR(blocks_used,		0444,	super_blocks_used_show,		NULL);
 SUPER_ATTR(total_blocks,	0444,	super_total_blocks_show,	NULL);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 4e7b56e9d3a5..56ab1f5ea11b 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -28,9 +28,6 @@
 #include "ref-cache.h"
 #include "tree-log.h"
 
-extern struct kmem_cache *btrfs_trans_handle_cachep;
-extern struct kmem_cache *btrfs_transaction_cachep;
-
 #define BTRFS_ROOT_TRANS_TAG 0
 
 static noinline void put_transaction(struct btrfs_transaction *transaction)
@@ -85,10 +82,10 @@ static noinline int join_transaction(struct btrfs_root *root)
 }
 
 /*
- * this does all the record keeping required to make sure that a
- * reference counted root is properly recorded in a given transaction.
- * This is required to make sure the old root from before we joined the transaction
- * is deleted when the transaction commits
+ * this does all the record keeping required to make sure that a reference
+ * counted root is properly recorded in a given transaction.  This is required
+ * to make sure the old root from before we joined the transaction is deleted
+ * when the transaction commits
  */
 noinline int btrfs_record_root_in_trans(struct btrfs_root *root)
 {
@@ -144,7 +141,7 @@ static void wait_current_trans(struct btrfs_root *root)
 	if (cur_trans && cur_trans->blocked) {
 		DEFINE_WAIT(wait);
 		cur_trans->use_count++;
-		while(1) {
+		while (1) {
 			prepare_to_wait(&root->fs_info->transaction_wait, &wait,
 					TASK_UNINTERRUPTIBLE);
 			if (cur_trans->blocked) {
@@ -213,7 +210,7 @@ static noinline int wait_for_commit(struct btrfs_root *root,
 {
 	DEFINE_WAIT(wait);
 	mutex_lock(&root->fs_info->trans_mutex);
-	while(!commit->commit_done) {
+	while (!commit->commit_done) {
 		prepare_to_wait(&commit->commit_wait, &wait,
 				TASK_UNINTERRUPTIBLE);
 		if (commit->commit_done)
@@ -228,8 +225,8 @@ static noinline int wait_for_commit(struct btrfs_root *root,
 }
 
 /*
- * rate limit against the drop_snapshot code.  This helps to slow down new operations
- * if the drop_snapshot code isn't able to keep up.
+ * rate limit against the drop_snapshot code.  This helps to slow down new
+ * operations if the drop_snapshot code isn't able to keep up.
  */
 static void throttle_on_drops(struct btrfs_root *root)
 {
@@ -332,12 +329,12 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
 	u64 end;
 	unsigned long index;
 
-	while(1) {
+	while (1) {
 		ret = find_first_extent_bit(dirty_pages, start, &start, &end,
 					    EXTENT_DIRTY);
 		if (ret)
 			break;
-		while(start <= end) {
+		while (start <= end) {
 			cond_resched();
 
 			index = start >> PAGE_CACHE_SHIFT;
@@ -368,14 +365,14 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
 			page_cache_release(page);
 		}
 	}
-	while(1) {
+	while (1) {
 		ret = find_first_extent_bit(dirty_pages, 0, &start, &end,
 					    EXTENT_DIRTY);
 		if (ret)
 			break;
 
 		clear_extent_dirty(dirty_pages, start, end, GFP_NOFS);
-		while(start <= end) {
+		while (start <= end) {
 			index = start >> PAGE_CACHE_SHIFT;
 			start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
 			page = find_get_page(btree_inode->i_mapping, index);
@@ -431,7 +428,7 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
 	btrfs_write_dirty_block_groups(trans, root);
 	btrfs_extent_post_op(trans, root);
 
-	while(1) {
+	while (1) {
 		old_root_bytenr = btrfs_root_bytenr(&root->root_item);
 		if (old_root_bytenr == root->node->start)
 			break;
@@ -472,7 +469,7 @@ int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
 
 	btrfs_extent_post_op(trans, fs_info->tree_root);
 
-	while(!list_empty(&fs_info->dirty_cowonly_roots)) {
+	while (!list_empty(&fs_info->dirty_cowonly_roots)) {
 		next = fs_info->dirty_cowonly_roots.next;
 		list_del_init(next);
 		root = list_entry(next, struct btrfs_root, dirty_list);
@@ -521,7 +518,7 @@ static noinline int add_dirty_roots(struct btrfs_trans_handle *trans,
 	int err = 0;
 	u32 refs;
 
-	while(1) {
+	while (1) {
 		ret = radix_tree_gang_lookup_tag(radix, (void **)gang, 0,
 						 ARRAY_SIZE(gang),
 						 BTRFS_ROOT_TRANS_TAG);
@@ -653,7 +650,7 @@ static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
 	int ret = 0;
 	int err;
 
-	while(!list_empty(list)) {
+	while (!list_empty(list)) {
 		struct btrfs_root *root;
 
 		dirty = list_entry(list->prev, struct btrfs_dirty_root, list);
@@ -663,13 +660,12 @@ static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
 		root = dirty->latest_root;
 		atomic_inc(&root->fs_info->throttles);
 
-		while(1) {
+		while (1) {
 			trans = btrfs_start_transaction(tree_root, 1);
 			mutex_lock(&root->fs_info->drop_mutex);
 			ret = btrfs_drop_snapshot(trans, dirty->root);
-			if (ret != -EAGAIN) {
+			if (ret != -EAGAIN)
 				break;
-			}
 			mutex_unlock(&root->fs_info->drop_mutex);
 
 			err = btrfs_update_root(trans,
@@ -874,7 +870,7 @@ static noinline int finish_pending_snapshots(struct btrfs_trans_handle *trans,
 	struct list_head *head = &trans->transaction->pending_snapshots;
 	int ret;
 
-	while(!list_empty(head)) {
+	while (!list_empty(head)) {
 		pending = list_entry(head->next,
 				     struct btrfs_pending_snapshot, list);
 		ret = finish_pending_snapshot(fs_info, pending);
@@ -1076,9 +1072,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
 	kmem_cache_free(btrfs_trans_handle_cachep, trans);
 
-	if (root->fs_info->closing) {
+	if (root->fs_info->closing)
 		drop_dirty_roots(root->fs_info->tree_root, &dirty_fs_roots);
-	}
 	return ret;
 }
 
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index ffe7f639732b..ea292117f882 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -66,9 +66,9 @@ static inline void btrfs_set_trans_block_group(struct btrfs_trans_handle *trans,
 	trans->block_group = BTRFS_I(inode)->block_group;
 }
 
-static inline void btrfs_update_inode_block_group(struct
-						  btrfs_trans_handle *trans,
-						  struct inode *inode)
+static inline void btrfs_update_inode_block_group(
+					  struct btrfs_trans_handle *trans,
+					  struct inode *inode)
 {
 	BTRFS_I(inode)->block_group = trans->block_group;
 }
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index a6a3956cedfb..3e8358c36165 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -23,10 +23,11 @@
 #include "transaction.h"
 #include "locking.h"
 
-/* defrag all the leaves in a given btree.  If cache_only == 1, don't read things
- * from disk, otherwise read all the leaves and try to get key order to
+/* defrag all the leaves in a given btree.  If cache_only == 1, don't read
+ * things from disk, otherwise read all the leaves and try to get key order to
  * better reflect disk order
  */
+
 int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root, int cache_only)
 {
@@ -65,9 +66,9 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 	level = btrfs_header_level(root->node);
 	orig_level = level;
 
-	if (level == 0) {
+	if (level == 0)
 		goto out;
-	}
+
 	if (root->defrag_progress.objectid == 0) {
 		struct extent_buffer *root_node;
 		u32 nritems;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index b1c2921f5bef..3a72a1b6c247 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -829,7 +829,7 @@ conflict_again:
 		 */
 		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
 		ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
-		while(ptr < ptr_end) {
+		while (ptr < ptr_end) {
 			victim_ref = (struct btrfs_inode_ref *)ptr;
 			victim_name_len = btrfs_inode_ref_name_len(leaf,
 								   victim_ref);
@@ -938,9 +938,8 @@ static noinline int replay_one_csum(struct btrfs_trans_handle *trans,
 
 	file_bytes = (item_size / csum_size) * root->sectorsize;
 	sums = kzalloc(btrfs_ordered_sum_size(root, file_bytes), GFP_NOFS);
-	if (!sums) {
+	if (!sums)
 		return -ENOMEM;
-	}
 
 	INIT_LIST_HEAD(&sums->list);
 	sums->len = file_bytes;
@@ -952,7 +951,7 @@ static noinline int replay_one_csum(struct btrfs_trans_handle *trans,
 	sector_sum = sums->sums;
 	cur_offset = key->offset;
 	ptr = btrfs_item_ptr_offset(eb, slot);
-	while(item_size > 0) {
+	while (item_size > 0) {
 		sector_sum->bytenr = cur_offset;
 		read_extent_buffer(eb, &sector_sum->sum, ptr, csum_size);
 		sector_sum++;
@@ -995,7 +994,7 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
 
 	path = btrfs_alloc_path();
 
-	while(1) {
+	while (1) {
 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 		if (ret < 0)
 			break;
@@ -1012,7 +1011,7 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
 		ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
 		ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
 						   path->slots[0]);
-		while(ptr < ptr_end) {
+		while (ptr < ptr_end) {
 			struct btrfs_inode_ref *ref;
 
 			ref = (struct btrfs_inode_ref *)ptr;
@@ -1048,7 +1047,7 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
 	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
 	key.type = BTRFS_ORPHAN_ITEM_KEY;
 	key.offset = (u64)-1;
-	while(1) {
+	while (1) {
 		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
 		if (ret < 0)
 			break;
@@ -1206,8 +1205,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
 	if (key->type == BTRFS_DIR_ITEM_KEY) {
 		dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
 				       name, name_len, 1);
-	}
-	else if (key->type == BTRFS_DIR_INDEX_KEY) {
+	} else if (key->type == BTRFS_DIR_INDEX_KEY) {
 		dst_di = btrfs_lookup_dir_index_item(trans, root, path,
 						     key->objectid,
 						     key->offset, name,
@@ -1282,7 +1280,7 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
 
 	ptr = btrfs_item_ptr_offset(eb, slot);
 	ptr_end = ptr + item_size;
-	while(ptr < ptr_end) {
+	while (ptr < ptr_end) {
 		di = (struct btrfs_dir_item *)ptr;
 		name_len = btrfs_dir_name_len(eb, di);
 		ret = replay_one_name(trans, root, path, eb, di, key);
@@ -1408,7 +1406,7 @@ again:
 	item_size = btrfs_item_size_nr(eb, slot);
 	ptr = btrfs_item_ptr_offset(eb, slot);
 	ptr_end = ptr + item_size;
-	while(ptr < ptr_end) {
+	while (ptr < ptr_end) {
 		di = (struct btrfs_dir_item *)ptr;
 		name_len = btrfs_dir_name_len(eb, di);
 		name = kmalloc(name_len, GFP_NOFS);
@@ -1513,14 +1511,14 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
 again:
 	range_start = 0;
 	range_end = 0;
-	while(1) {
+	while (1) {
 		ret = find_dir_range(log, path, dirid, key_type,
 				     &range_start, &range_end);
 		if (ret != 0)
 			break;
 
 		dir_key.offset = range_start;
-		while(1) {
+		while (1) {
 			int nritems;
 			ret = btrfs_search_slot(NULL, root, &dir_key, path,
 						0, 0);
@@ -1676,7 +1674,7 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
 	return 0;
 }
 
-static int noinline walk_down_log_tree(struct btrfs_trans_handle *trans,
+static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
 				   struct btrfs_root *root,
 				   struct btrfs_path *path, int *level,
 				   struct walk_control *wc)
@@ -1694,7 +1692,7 @@ static int noinline walk_down_log_tree(struct btrfs_trans_handle *trans,
 	WARN_ON(*level < 0);
 	WARN_ON(*level >= BTRFS_MAX_LEVEL);
 
-	while(*level > 0) {
+	while (*level > 0) {
 		WARN_ON(*level < 0);
 		WARN_ON(*level >= BTRFS_MAX_LEVEL);
 		cur = path->nodes[*level];
@@ -1753,11 +1751,11 @@ static int noinline walk_down_log_tree(struct btrfs_trans_handle *trans,
 	WARN_ON(*level < 0);
 	WARN_ON(*level >= BTRFS_MAX_LEVEL);
 
-	if (path->nodes[*level] == root->node) {
+	if (path->nodes[*level] == root->node)
 		parent = path->nodes[*level];
-	} else {
+	else
 		parent = path->nodes[*level + 1];
-	}
+
 	bytenr = path->nodes[*level]->start;
 
 	blocksize = btrfs_level_size(root, *level);
@@ -1790,7 +1788,7 @@ static int noinline walk_down_log_tree(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-static int noinline walk_up_log_tree(struct btrfs_trans_handle *trans,
+static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
 				 struct btrfs_root *root,
 				 struct btrfs_path *path, int *level,
 				 struct walk_control *wc)
@@ -1801,7 +1799,7 @@ static int noinline walk_up_log_tree(struct btrfs_trans_handle *trans,
 	int slot;
 	int ret;
 
-	for(i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
+	for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
 		slot = path->slots[i];
 		if (slot < btrfs_header_nritems(path->nodes[i]) - 1) {
 			struct extent_buffer *node;
@@ -1875,7 +1873,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
 	extent_buffer_get(log->node);
 	path->slots[level] = 0;
 
-	while(1) {
+	while (1) {
 		wret = walk_down_log_tree(trans, log, path, &level, wc);
 		if (wret > 0)
 			break;
@@ -1941,7 +1939,7 @@ static int wait_log_commit(struct btrfs_root *log)
 			schedule();
 		finish_wait(&log->fs_info->tree_log_wait, &wait);
 		mutex_lock(&log->fs_info->tree_log_mutex);
-	} while(transid == log->fs_info->tree_log_transid &&
+	} while (transid == log->fs_info->tree_log_transid &&
 		atomic_read(&log->fs_info->tree_log_commit));
 	return 0;
 }
@@ -1965,13 +1963,13 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	}
 	atomic_set(&log->fs_info->tree_log_commit, 1);
 
-	while(1) {
+	while (1) {
 		batch = log->fs_info->tree_log_batch;
 		mutex_unlock(&log->fs_info->tree_log_mutex);
 		schedule_timeout_uninterruptible(1);
 		mutex_lock(&log->fs_info->tree_log_mutex);
 
-		while(atomic_read(&log->fs_info->tree_log_writers)) {
+		while (atomic_read(&log->fs_info->tree_log_writers)) {
 			DEFINE_WAIT(wait);
 			prepare_to_wait(&log->fs_info->tree_log_wait, &wait,
 					TASK_UNINTERRUPTIBLE);
@@ -2030,7 +2028,7 @@ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
 	ret = walk_log_tree(trans, log, &wc);
 	BUG_ON(ret);
 
-	while(1) {
+	while (1) {
 		ret = find_first_extent_bit(&log->dirty_log_pages,
 				    0, &start, &end, EXTENT_DIRTY);
 		if (ret)
@@ -2287,9 +2285,8 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
 			struct btrfs_key tmp;
 			btrfs_item_key_to_cpu(path->nodes[0], &tmp,
 					      path->slots[0]);
-			if (key_type == tmp.type) {
+			if (key_type == tmp.type)
 				first_offset = max(min_offset, tmp.offset) + 1;
-			}
 		}
 		goto done;
 	}
@@ -2319,7 +2316,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
 	 * we have a block from this transaction, log every item in it
 	 * from our directory
 	 */
-	while(1) {
+	while (1) {
 		struct btrfs_key tmp;
 		src = path->nodes[0];
 		nritems = btrfs_header_nritems(src);
@@ -2396,7 +2393,7 @@ static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
 again:
 	min_key = 0;
 	max_key = 0;
-	while(1) {
+	while (1) {
 		ret = log_dir_items(trans, root, inode, path,
 				    dst_path, key_type, min_key,
 				    &max_key);
@@ -2432,7 +2429,7 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
 	key.type = max_key_type;
 	key.offset = (u64)-1;
 
-	while(1) {
+	while (1) {
 		ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
 
 		if (ret != 1)
@@ -2481,7 +2478,7 @@ static noinline int copy_extent_csums(struct btrfs_trans_handle *trans,
 	list_add_tail(&sums->list, list);
 
 	path = btrfs_alloc_path();
-	while(disk_bytenr < end) {
+	while (disk_bytenr < end) {
 		if (!item || disk_bytenr < item_start_offset ||
 		    disk_bytenr >= item_last_offset) {
 			struct btrfs_key found_key;
@@ -2496,7 +2493,8 @@ static noinline int copy_extent_csums(struct btrfs_trans_handle *trans,
 				if (ret == -ENOENT || ret == -EFBIG)
 					ret = 0;
 				sum = 0;
-				printk("log no csum found for byte %llu\n",
+				printk(KERN_INFO "log no csum found for "
+				       "byte %llu\n",
 				       (unsigned long long)disk_bytenr);
 				item = NULL;
 				btrfs_release_path(root, path);
@@ -2643,7 +2641,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
 	 * we have to do this after the loop above to avoid changing the
 	 * log tree while trying to change the log tree.
 	 */
-	while(!list_empty(&ordered_sums)) {
+	while (!list_empty(&ordered_sums)) {
 		struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
 						   struct btrfs_ordered_sum,
 						   list);
@@ -2736,7 +2734,7 @@ static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
 	BUG_ON(ret);
 	path->keep_locks = 1;
 
-	while(1) {
+	while (1) {
 		ins_nr = 0;
 		ret = btrfs_search_forward(root, &min_key, &max_key,
 					   path, 0, trans->transid);
@@ -2848,7 +2846,7 @@ int btrfs_log_dentry(struct btrfs_trans_handle *trans,
 
 	start_log_trans(trans, root);
 	sb = dentry->d_inode->i_sb;
-	while(1) {
+	while (1) {
 		ret = __btrfs_log_inode(trans, root, dentry->d_inode,
 					inode_only);
 		BUG_ON(ret);
@@ -2919,7 +2917,7 @@ again:
 	key.offset = (u64)-1;
 	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
 
-	while(1) {
+	while (1) {
 		ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
 		if (ret < 0)
 			break;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 6672adcec9f8..b187b537888e 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -140,7 +140,7 @@ static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
  * the list if the block device is congested.  This way, multiple devices
  * can make progress from a single worker thread.
  */
-static int noinline run_scheduled_bios(struct btrfs_device *device)
+static noinline int run_scheduled_bios(struct btrfs_device *device)
 {
 	struct bio *pending;
 	struct backing_dev_info *bdi;
@@ -187,7 +187,7 @@ loop:
 	}
 	spin_unlock(&device->io_lock);
 
-	while(pending) {
+	while (pending) {
 		cur = pending;
 		pending = pending->bi_next;
 		cur->bi_next = NULL;
@@ -458,7 +458,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 
 		bdev = open_bdev_exclusive(device->name, flags, holder);
 		if (IS_ERR(bdev)) {
-			printk("open %s failed\n", device->name);
+			printk(KERN_INFO "open %s failed\n", device->name);
 			goto error;
 		}
 		set_blocksize(bdev, 4096);
@@ -570,14 +570,15 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
 	devid = le64_to_cpu(disk_super->dev_item.devid);
 	transid = btrfs_super_generation(disk_super);
 	if (disk_super->label[0])
-		printk("device label %s ", disk_super->label);
+		printk(KERN_INFO "device label %s ", disk_super->label);
 	else {
 		/* FIXME, make a readl uuid parser */
-		printk("device fsid %llx-%llx ",
+		printk(KERN_INFO "device fsid %llx-%llx ",
 		       *(unsigned long long *)disk_super->fsid,
 		       *(unsigned long long *)(disk_super->fsid + 8));
 	}
-	printk("devid %Lu transid %Lu %s\n", devid, transid, path);
+	printk(KERN_INFO "devid %llu transid %llu %s\n",
+	       (unsigned long long)devid, (unsigned long long)transid, path);
 	ret = device_list_add(path, disk_super, devid, fs_devices_ret);
 
 	brelse(bh);
@@ -683,9 +684,8 @@ no_more_items:
 				goto check_pending;
 			}
 		}
-		if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY) {
+		if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
 			goto next;
-		}
 
 		start_found = 1;
 		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
@@ -1001,14 +1001,16 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 
 	if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) &&
 	    root->fs_info->fs_devices->rw_devices <= 4) {
-		printk("btrfs: unable to go below four devices on raid10\n");
+		printk(KERN_ERR "btrfs: unable to go below four devices "
+		       "on raid10\n");
 		ret = -EINVAL;
 		goto out;
 	}
 
 	if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) &&
 	    root->fs_info->fs_devices->rw_devices <= 2) {
-		printk("btrfs: unable to go below two devices on raid1\n");
+		printk(KERN_ERR "btrfs: unable to go below two "
+		       "devices on raid1\n");
 		ret = -EINVAL;
 		goto out;
 	}
@@ -1031,7 +1033,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 		bh = NULL;
 		disk_super = NULL;
 		if (!device) {
-			printk("btrfs: no missing devices found to remove\n");
+			printk(KERN_ERR "btrfs: no missing devices found to "
+			       "remove\n");
 			goto out;
 		}
 	} else {
@@ -1060,7 +1063,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 	}
 
 	if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) {
-		printk("btrfs: unable to remove the only writeable device\n");
+		printk(KERN_ERR "btrfs: unable to remove the only writeable "
+		       "device\n");
 		ret = -EINVAL;
 		goto error_brelse;
 	}
@@ -1286,9 +1290,8 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 		return -EINVAL;
 
 	bdev = open_bdev_exclusive(device_path, 0, root->fs_info->bdev_holder);
-	if (!bdev) {
+	if (!bdev)
 		return -EIO;
-	}
 
 	if (root->fs_info->fs_devices->seeding) {
 		seeding_dev = 1;
@@ -1401,8 +1404,8 @@ error:
 	goto out;
 }
 
-static int noinline btrfs_update_device(struct btrfs_trans_handle *trans,
-				 struct btrfs_device *device)
+static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
+					struct btrfs_device *device)
 {
 	int ret;
 	struct btrfs_path *path;
@@ -1563,7 +1566,7 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
 	int ret;
 	int i;
 
-	printk("btrfs relocating chunk %llu\n",
+	printk(KERN_INFO "btrfs relocating chunk %llu\n",
 	       (unsigned long long)chunk_offset);
 	root = root->fs_info->chunk_root;
 	extent_root = root->fs_info->extent_root;
@@ -1748,7 +1751,7 @@ int btrfs_balance(struct btrfs_root *dev_root)
 	key.offset = (u64)-1;
 	key.type = BTRFS_CHUNK_ITEM_KEY;
 
-	while(1) {
+	while (1) {
 		ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
 		if (ret < 0)
 			goto error;
@@ -1916,7 +1919,7 @@ static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-static u64 noinline chunk_bytes_by_type(u64 type, u64 calc_size,
+static noinline u64 chunk_bytes_by_type(u64 type, u64 calc_size,
 					int num_stripes, int sub_stripes)
 {
 	if (type & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_DUP))
@@ -2041,7 +2044,7 @@ again:
 		min_free += 1024 * 1024;
 
 	INIT_LIST_HEAD(&private_devs);
-	while(index < num_stripes) {
+	while (index < num_stripes) {
 		device = list_entry(cur, struct btrfs_device, dev_alloc_list);
 		BUG_ON(!device->writeable);
 		if (device->total_bytes > device->bytes_used)
@@ -2242,7 +2245,7 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-static int noinline init_first_rw_device(struct btrfs_trans_handle *trans,
+static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
 					 struct btrfs_root *root,
 					 struct btrfs_device *device)
 {
@@ -2338,7 +2341,7 @@ void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
 {
 	struct extent_map *em;
 
-	while(1) {
+	while (1) {
 		spin_lock(&tree->map_tree.lock);
 		em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1);
 		if (em)
@@ -2413,9 +2416,8 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 	int max_errors = 0;
 	struct btrfs_multi_bio *multi = NULL;
 
-	if (multi_ret && !(rw & (1 << BIO_RW))) {
+	if (multi_ret && !(rw & (1 << BIO_RW)))
 		stripes_allocated = 1;
-	}
 again:
 	if (multi_ret) {
 		multi = kzalloc(btrfs_multi_bio_size(stripes_allocated),
@@ -2434,7 +2436,9 @@ again:
 		return 0;
 
 	if (!em) {
-		printk("unable to find logical %Lu len %Lu\n", logical, *length);
+		printk(KERN_CRIT "unable to find logical %llu len %llu\n",
+		       (unsigned long long)logical,
+		       (unsigned long long)*length);
 		BUG();
 	}
 
@@ -2541,9 +2545,8 @@ again:
 			device = map->stripes[stripe_index].dev;
 			if (device->bdev) {
 				bdi = blk_get_backing_dev_info(device->bdev);
-				if (bdi->unplug_io_fn) {
+				if (bdi->unplug_io_fn)
 					bdi->unplug_io_fn(bdi, unplug_page);
-				}
 			}
 		} else {
 			multi->stripes[i].physical =
@@ -2717,7 +2720,7 @@ struct async_sched {
  * This will add one bio to the pending list for a device and make sure
  * the work struct is scheduled.
  */
-static int noinline schedule_bio(struct btrfs_root *root,
+static noinline int schedule_bio(struct btrfs_root *root,
 				 struct btrfs_device *device,
 				 int rw, struct bio *bio)
 {
@@ -2785,8 +2788,10 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 
 	total_devs = multi->num_stripes;
 	if (map_length < length) {
-		printk("mapping failed logical %Lu bio len %Lu "
-		       "len %Lu\n", logical, length, map_length);
+		printk(KERN_CRIT "mapping failed logical %llu bio len %llu "
+		       "len %llu\n", (unsigned long long)logical,
+		       (unsigned long long)length,
+		       (unsigned long long)map_length);
 		BUG();
 	}
 	multi->end_io = first_bio->bi_end_io;
@@ -2794,7 +2799,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 	multi->orig_bio = first_bio;
 	atomic_set(&multi->stripes_pending, multi->num_stripes);
 
-	while(dev_nr < total_devs) {
+	while (dev_nr < total_devs) {
 		if (total_devs > 1) {
 			if (dev_nr < total_devs - 1) {
 				bio = bio_clone(first_bio, GFP_NOFS);
@@ -3058,7 +3063,8 @@ static int read_one_dev(struct btrfs_root *root,
 			return -EIO;
 
 		if (!device) {
-			printk("warning devid %Lu missing\n", devid);
+			printk(KERN_WARNING "warning devid %llu missing\n",
+			       (unsigned long long)devid);
 			device = add_missing_dev(root, devid, dev_uuid);
 			if (!device)
 				return -ENOMEM;
@@ -3078,12 +3084,6 @@ static int read_one_dev(struct btrfs_root *root,
 	if (device->writeable)
 		device->fs_devices->total_rw_bytes += device->total_bytes;
 	ret = 0;
-#if 0
-	ret = btrfs_open_device(device);
-	if (ret) {
-		kfree(device);
-	}
-#endif
 	return ret;
 }
 
@@ -3174,7 +3174,7 @@ int btrfs_read_chunk_tree(struct btrfs_root *root)
 	key.type = 0;
 again:
 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
-	while(1) {
+	while (1) {
 		leaf = path->nodes[0];
 		slot = path->slots[0];
 		if (slot >= btrfs_header_nritems(leaf)) {
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 4146f0710e6a..7f332e270894 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -264,7 +264,8 @@ struct xattr_handler *btrfs_xattr_handlers[] = {
  */
 static bool btrfs_is_valid_xattr(const char *name)
 {
-	return !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) ||
+	return !strncmp(name, XATTR_SECURITY_PREFIX,
+			XATTR_SECURITY_PREFIX_LEN) ||
 	       !strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) ||
 	       !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
 	       !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index c4617cde6c73..ecfbce836d32 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -152,7 +152,7 @@ static int free_workspace(struct workspace *workspace)
 static void free_workspaces(void)
 {
 	struct workspace *workspace;
-	while(!list_empty(&idle_workspace)) {
+	while (!list_empty(&idle_workspace)) {
 		workspace = list_entry(idle_workspace.next, struct workspace,
 				       list);
 		list_del(&workspace->list);
@@ -397,12 +397,10 @@ int btrfs_zlib_decompress_biovec(struct page **pages_in,
 		ret = -1;
 		goto out;
 	}
-	while(workspace->inf_strm.total_in < srclen) {
+	while (workspace->inf_strm.total_in < srclen) {
 		ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH);
-		if (ret != Z_OK && ret != Z_STREAM_END) {
+		if (ret != Z_OK && ret != Z_STREAM_END)
 			break;
-		}
-
 		/*
 		 * buf start is the byte offset we're of the start of
 		 * our workspace buffer
@@ -424,16 +422,14 @@ int btrfs_zlib_decompress_biovec(struct page **pages_in,
 			/* we didn't make progress in this inflate
 			 * call, we're done
 			 */
-			if (ret != Z_STREAM_END) {
+			if (ret != Z_STREAM_END)
 				ret = -1;
-			}
 			break;
 		}
 
 		/* we haven't yet hit data corresponding to this page */
-		if (total_out <= start_byte) {
+		if (total_out <= start_byte)
 			goto next;
-		}
 
 		/*
 		 * the start of the data we care about is offset into
@@ -448,7 +444,7 @@ int btrfs_zlib_decompress_biovec(struct page **pages_in,
 		current_buf_start = buf_start;
 
 		/* copy bytes from the working buffer into the pages */
-		while(working_bytes > 0) {
+		while (working_bytes > 0) {
 			bytes = min(PAGE_CACHE_SIZE - pg_offset,
 				    PAGE_CACHE_SIZE - buf_offset);
 			bytes = min(bytes, working_bytes);
@@ -471,6 +467,7 @@ int btrfs_zlib_decompress_biovec(struct page **pages_in,
 					ret = 0;
 					goto done;
 				}
+
 				page_out = bvec[page_out_index].bv_page;
 				pg_offset = 0;
 				page_bytes_left = PAGE_CACHE_SIZE;
@@ -480,9 +477,8 @@ int btrfs_zlib_decompress_biovec(struct page **pages_in,
 				 * make sure our new page is covered by this
 				 * working buffer
 				 */
-				if (total_out <= start_byte) {
+				if (total_out <= start_byte)
 					goto next;
-				}
 
 				/* the next page in the biovec might not
 				 * be adjacent to the last page, but it
@@ -517,11 +513,10 @@ next:
 							   PAGE_CACHE_SIZE);
 		}
 	}
-	if (ret != Z_STREAM_END) {
+	if (ret != Z_STREAM_END)
 		ret = -1;
-	} else {
+	else
 		ret = 0;
-	}
 done:
 	zlib_inflateEnd(&workspace->inf_strm);
 	if (data_in)
@@ -579,16 +574,15 @@ int btrfs_zlib_decompress(unsigned char *data_in,
 		goto out;
 	}
 
-	while(bytes_left > 0) {
+	while (bytes_left > 0) {
 		unsigned long buf_start;
 		unsigned long buf_offset;
 		unsigned long bytes;
 		unsigned long pg_offset = 0;
 
 		ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH);
-		if (ret != Z_OK && ret != Z_STREAM_END) {
+		if (ret != Z_OK && ret != Z_STREAM_END)
 			break;
-		}
 
 		buf_start = total_out;
 		total_out = workspace->inf_strm.total_out;
@@ -598,15 +592,13 @@ int btrfs_zlib_decompress(unsigned char *data_in,
 			break;
 		}
 
-		if (total_out <= start_byte) {
+		if (total_out <= start_byte)
 			goto next;
-		}
 
-		if (total_out > start_byte && buf_start < start_byte) {
+		if (total_out > start_byte && buf_start < start_byte)
 			buf_offset = start_byte - buf_start;
-		} else {
+		else
 			buf_offset = 0;
-		}
 
 		bytes = min(PAGE_CACHE_SIZE - pg_offset,
 			    PAGE_CACHE_SIZE - buf_offset);
@@ -622,11 +614,12 @@ next:
 		workspace->inf_strm.next_out = workspace->buf;
 		workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
 	}
-	if (ret != Z_STREAM_END && bytes_left != 0) {
+
+	if (ret != Z_STREAM_END && bytes_left != 0)
 		ret = -1;
-	} else {
+	else
 		ret = 0;
-	}
+
 	zlib_inflateEnd(&workspace->inf_strm);
 out:
 	free_workspace(workspace);
-- 
cgit v1.2.3